diff options
Diffstat (limited to 'drivers/nvme')
-rw-r--r-- | drivers/nvme/host/Kconfig | 12 | ||||
-rw-r--r-- | drivers/nvme/host/Makefile | 1 | ||||
-rw-r--r-- | drivers/nvme/host/core.c | 561 | ||||
-rw-r--r-- | drivers/nvme/host/fabrics.c | 73 | ||||
-rw-r--r-- | drivers/nvme/host/fabrics.h | 6 | ||||
-rw-r--r-- | drivers/nvme/host/fc.c | 223 | ||||
-rw-r--r-- | drivers/nvme/host/lightnvm.c | 18 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 46 | ||||
-rw-r--r-- | drivers/nvme/host/pci.c | 757 | ||||
-rw-r--r-- | drivers/nvme/host/rdma.c | 318 | ||||
-rw-r--r-- | drivers/nvme/host/scsi.c | 2460 | ||||
-rw-r--r-- | drivers/nvme/target/admin-cmd.c | 65 | ||||
-rw-r--r-- | drivers/nvme/target/configfs.c | 68 | ||||
-rw-r--r-- | drivers/nvme/target/core.c | 3 | ||||
-rw-r--r-- | drivers/nvme/target/discovery.c | 4 | ||||
-rw-r--r-- | drivers/nvme/target/fc.c | 30 | ||||
-rw-r--r-- | drivers/nvme/target/fcloop.c | 2 | ||||
-rw-r--r-- | drivers/nvme/target/io-cmd.c | 6 | ||||
-rw-r--r-- | drivers/nvme/target/loop.c | 114 | ||||
-rw-r--r-- | drivers/nvme/target/nvmet.h | 2 | ||||
-rw-r--r-- | drivers/nvme/target/rdma.c | 102 |
21 files changed, 1424 insertions, 3447 deletions
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 90745a616df7..46d6cb1e03bd 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -13,18 +13,6 @@ config BLK_DEV_NVME To compile this driver as a module, choose M here: the module will be called nvme. -config BLK_DEV_NVME_SCSI - bool "SCSI emulation for NVMe device nodes" - depends on NVME_CORE - ---help--- - This adds support for the SG_IO ioctl on the NVMe character - and block devices nodes, as well as a translation for a small - number of selected SCSI commands to NVMe commands to the NVMe - driver. If you don't know what this means you probably want - to say N here, unless you run a distro that abuses the SCSI - emulation to provide stable device names for mount by id, like - some OpenSuSE and SLES versions. - config NVME_FABRICS tristate diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index f1a7d945fbb6..cc0aacb4c8b4 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -5,7 +5,6 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o obj-$(CONFIG_NVME_FC) += nvme-fc.o nvme-core-y := core.o -nvme-core-$(CONFIG_BLK_DEV_NVME_SCSI) += scsi.o nvme-core-$(CONFIG_NVM) += lightnvm.o nvme-y += pci.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 903d5813023a..cb96f4a7ae3a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -27,7 +27,6 @@ #include <linux/nvme_ioctl.h> #include <linux/t10-pi.h> #include <linux/pm_qos.h> -#include <scsi/sg.h> #include <asm/unaligned.h> #include "nvme.h" @@ -45,7 +44,7 @@ module_param_named(io_timeout, nvme_io_timeout, byte, 0644); MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); EXPORT_SYMBOL_GPL(nvme_io_timeout); -unsigned char shutdown_timeout = 5; +static unsigned char shutdown_timeout = 5; module_param(shutdown_timeout, byte, 0644); MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); @@ -65,34 +64,53 @@ static bool force_apst; module_param(force_apst, bool, 0644); MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off"); +static bool streams; +module_param(streams, bool, 0644); +MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); + +struct workqueue_struct *nvme_wq; +EXPORT_SYMBOL_GPL(nvme_wq); + static LIST_HEAD(nvme_ctrl_list); static DEFINE_SPINLOCK(dev_list_lock); static struct class *nvme_class; -static int nvme_error_status(struct request *req) +int nvme_reset_ctrl(struct nvme_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) + return -EBUSY; + if (!queue_work(nvme_wq, &ctrl->reset_work)) + return -EBUSY; + return 0; +} +EXPORT_SYMBOL_GPL(nvme_reset_ctrl); + +static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) +{ + int ret; + + ret = nvme_reset_ctrl(ctrl); + if (!ret) + flush_work(&ctrl->reset_work); + return ret; +} + +static blk_status_t nvme_error_status(struct request *req) { switch (nvme_req(req)->status & 0x7ff) { case NVME_SC_SUCCESS: - return 0; + return BLK_STS_OK; case NVME_SC_CAP_EXCEEDED: - return -ENOSPC; - default: - return -EIO; - - /* - * XXX: these errors are a nasty side-band protocol to - * drivers/md/dm-mpath.c:noretry_error() that aren't documented - * anywhere.. - */ - case NVME_SC_CMD_SEQ_ERROR: - return -EILSEQ; + return BLK_STS_NOSPC; case NVME_SC_ONCS_NOT_SUPPORTED: - return -EOPNOTSUPP; + return BLK_STS_NOTSUPP; case NVME_SC_WRITE_FAULT: case NVME_SC_READ_ERROR: case NVME_SC_UNWRITTEN_BLOCK: - return -ENODATA; + return BLK_STS_MEDIUM; + default: + return BLK_STS_IOERR; } } @@ -113,7 +131,7 @@ void nvme_complete_rq(struct request *req) { if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) { nvme_req(req)->retries++; - blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q)); + blk_mq_requeue_request(req, true); return; } @@ -165,7 +183,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (old_state) { case NVME_CTRL_NEW: case NVME_CTRL_LIVE: - case NVME_CTRL_RECONNECTING: changed = true; /* FALLTHRU */ default: @@ -283,6 +300,105 @@ struct request *nvme_alloc_request(struct request_queue *q, } EXPORT_SYMBOL_GPL(nvme_alloc_request); +static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + + c.directive.opcode = nvme_admin_directive_send; + c.directive.nsid = cpu_to_le32(0xffffffff); + c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE; + c.directive.dtype = NVME_DIR_IDENTIFY; + c.directive.tdtype = NVME_DIR_STREAMS; + c.directive.endir = enable ? NVME_DIR_ENDIR : 0; + + return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0); +} + +static int nvme_disable_streams(struct nvme_ctrl *ctrl) +{ + return nvme_toggle_streams(ctrl, false); +} + +static int nvme_enable_streams(struct nvme_ctrl *ctrl) +{ + return nvme_toggle_streams(ctrl, true); +} + +static int nvme_get_stream_params(struct nvme_ctrl *ctrl, + struct streams_directive_params *s, u32 nsid) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + memset(s, 0, sizeof(*s)); + + c.directive.opcode = nvme_admin_directive_recv; + c.directive.nsid = cpu_to_le32(nsid); + c.directive.numd = sizeof(*s); + c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM; + c.directive.dtype = NVME_DIR_STREAMS; + + return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s)); +} + +static int nvme_configure_directives(struct nvme_ctrl *ctrl) +{ + struct streams_directive_params s; + int ret; + + if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES)) + return 0; + if (!streams) + return 0; + + ret = nvme_enable_streams(ctrl); + if (ret) + return ret; + + ret = nvme_get_stream_params(ctrl, &s, 0xffffffff); + if (ret) + return ret; + + ctrl->nssa = le16_to_cpu(s.nssa); + if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) { + dev_info(ctrl->device, "too few streams (%u) available\n", + ctrl->nssa); + nvme_disable_streams(ctrl); + return 0; + } + + ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); + dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams); + return 0; +} + +/* + * Check if 'req' has a write hint associated with it. If it does, assign + * a valid namespace stream to the write. + */ +static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, + struct request *req, u16 *control, + u32 *dsmgmt) +{ + enum rw_hint streamid = req->write_hint; + + if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE) + streamid = 0; + else { + streamid--; + if (WARN_ON_ONCE(streamid > ctrl->nr_streams)) + return; + + *control |= NVME_RW_DTYPE_STREAMS; + *dsmgmt |= streamid << 16; + } + + if (streamid < ARRAY_SIZE(req->q->write_hints)) + req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9; +} + static inline void nvme_setup_flush(struct nvme_ns *ns, struct nvme_command *cmnd) { @@ -291,7 +407,7 @@ static inline void nvme_setup_flush(struct nvme_ns *ns, cmnd->common.nsid = cpu_to_le32(ns->ns_id); } -static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, +static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, struct nvme_command *cmnd) { unsigned short segments = blk_rq_nr_discard_segments(req), n = 0; @@ -300,7 +416,7 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC); if (!range) - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; __rq_for_each_bio(bio, req) { u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector); @@ -314,7 +430,7 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, if (WARN_ON_ONCE(n != segments)) { kfree(range); - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; } memset(cmnd, 0, sizeof(*cmnd)); @@ -328,15 +444,26 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, req->special_vec.bv_len = sizeof(*range) * segments; req->rq_flags |= RQF_SPECIAL_PAYLOAD; - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } -static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req, - struct nvme_command *cmnd) +static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, + struct request *req, struct nvme_command *cmnd) { + struct nvme_ctrl *ctrl = ns->ctrl; u16 control = 0; u32 dsmgmt = 0; + /* + * If formated with metadata, require the block layer provide a buffer + * unless this namespace is formated such that the metadata can be + * stripped/generated by the controller with PRACT=1. + */ + if (ns && ns->ms && + (!ns->pi_type || ns->ms != sizeof(struct t10_pi_tuple)) && + !blk_integrity_rq(req) && !blk_rq_is_passthrough(req)) + return BLK_STS_NOTSUPP; + if (req->cmd_flags & REQ_FUA) control |= NVME_RW_FUA; if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) @@ -351,6 +478,9 @@ static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req, cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams) + nvme_assign_write_stream(ctrl, req, &control, &dsmgmt); + if (ns->ms) { switch (ns->pi_type) { case NVME_NS_DPS_PI_TYPE3: @@ -370,12 +500,13 @@ static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req, cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); + return 0; } -int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, +blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, struct nvme_command *cmd) { - int ret = BLK_MQ_RQ_QUEUE_OK; + blk_status_t ret = BLK_STS_OK; if (!(req->rq_flags & RQF_DONTPREP)) { nvme_req(req)->retries = 0; @@ -398,11 +529,11 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, break; case REQ_OP_READ: case REQ_OP_WRITE: - nvme_setup_rw(ns, req, cmd); + ret = nvme_setup_rw(ns, req, cmd); break; default: WARN_ON_ONCE(1); - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; } cmd->common.command_id = req->tag; @@ -555,15 +686,16 @@ int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, result, timeout); } -static void nvme_keep_alive_end_io(struct request *rq, int error) +static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) { struct nvme_ctrl *ctrl = rq->end_io_data; blk_mq_free_request(rq); - if (error) { + if (status) { dev_err(ctrl->device, - "failed nvme_keep_alive_end_io error=%d\n", error); + "failed nvme_keep_alive_end_io error=%d\n", + status); return; } @@ -599,7 +731,7 @@ static void nvme_keep_alive_work(struct work_struct *work) if (nvme_keep_alive(ctrl)) { /* allocation failure, reset the controller */ dev_err(ctrl->device, "keep-alive failed\n"); - ctrl->ops->reset_ctrl(ctrl); + nvme_reset_ctrl(ctrl); return; } } @@ -623,7 +755,7 @@ void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_stop_keep_alive); -int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) +static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) { struct nvme_command c = { }; int error; @@ -643,6 +775,77 @@ int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) return error; } +static int nvme_identify_ns_descs(struct nvme_ns *ns, unsigned nsid) +{ + struct nvme_command c = { }; + int status; + void *data; + int pos; + int len; + + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(nsid); + c.identify.cns = NVME_ID_CNS_NS_DESC_LIST; + + data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); + if (!data) + return -ENOMEM; + + status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, data, + NVME_IDENTIFY_DATA_SIZE); + if (status) + goto free_data; + + for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) { + struct nvme_ns_id_desc *cur = data + pos; + + if (cur->nidl == 0) + break; + + switch (cur->nidt) { + case NVME_NIDT_EUI64: + if (cur->nidl != NVME_NIDT_EUI64_LEN) { + dev_warn(ns->ctrl->device, + "ctrl returned bogus length: %d for NVME_NIDT_EUI64\n", + cur->nidl); + goto free_data; + } + len = NVME_NIDT_EUI64_LEN; + memcpy(ns->eui, data + pos + sizeof(*cur), len); + break; + case NVME_NIDT_NGUID: + if (cur->nidl != NVME_NIDT_NGUID_LEN) { + dev_warn(ns->ctrl->device, + "ctrl returned bogus length: %d for NVME_NIDT_NGUID\n", + cur->nidl); + goto free_data; + } + len = NVME_NIDT_NGUID_LEN; + memcpy(ns->nguid, data + pos + sizeof(*cur), len); + break; + case NVME_NIDT_UUID: + if (cur->nidl != NVME_NIDT_UUID_LEN) { + dev_warn(ns->ctrl->device, + "ctrl returned bogus length: %d for NVME_NIDT_UUID\n", + cur->nidl); + goto free_data; + } + len = NVME_NIDT_UUID_LEN; + uuid_copy(&ns->uuid, data + pos + sizeof(*cur)); + break; + default: + /* Skip unnkown types */ + len = cur->nidl; + break; + } + + len += sizeof(*cur); + } +free_data: + kfree(data); + return status; +} + static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list) { struct nvme_command c = { }; @@ -653,7 +856,7 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000); } -int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, +static int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, struct nvme_id_ns **id) { struct nvme_command c = { }; @@ -675,26 +878,7 @@ int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, return error; } -int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, - void *buffer, size_t buflen, u32 *result) -{ - struct nvme_command c; - union nvme_result res; - int ret; - - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_get_features; - c.features.nsid = cpu_to_le32(nsid); - c.features.fid = cpu_to_le32(fid); - - ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res, buffer, buflen, 0, - NVME_QID_ANY, 0, 0); - if (ret >= 0 && result) - *result = le32_to_cpu(res.u32); - return ret; -} - -int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, +static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, void *buffer, size_t buflen, u32 *result) { struct nvme_command c; @@ -713,28 +897,6 @@ int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, return ret; } -int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log) -{ - struct nvme_command c = { }; - int error; - - c.common.opcode = nvme_admin_get_log_page, - c.common.nsid = cpu_to_le32(0xFFFFFFFF), - c.common.cdw10[0] = cpu_to_le32( - (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) | - NVME_LOG_SMART), - - *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL); - if (!*log) - return -ENOMEM; - - error = nvme_submit_sync_cmd(dev->admin_q, &c, *log, - sizeof(struct nvme_smart_log)); - if (error) - kfree(*log); - return error; -} - int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) { u32 q_count = (*count - 1) | ((*count - 1) << 16); @@ -752,7 +914,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) * access to the admin queue, as that might be only way to fix them up. */ if (status > 0) { - dev_err(ctrl->dev, "Could not set queue count (%d)\n", status); + dev_err(ctrl->device, "Could not set queue count (%d)\n", status); *count = 0; } else { nr_io_queues = min(result & 0xffff, result >> 16) + 1; @@ -870,12 +1032,6 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); -#ifdef CONFIG_BLK_DEV_NVME_SCSI - case SG_GET_VERSION_NUM: - return nvme_sg_get_version_num((void __user *)arg); - case SG_IO: - return nvme_sg_io(ns, (void __user *)arg); -#endif default: #ifdef CONFIG_NVM if (ns->ndev) @@ -892,10 +1048,6 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - switch (cmd) { - case SG_IO: - return -ENOIOCTLCMD; - } return nvme_ioctl(bdev, mode, cmd, arg); } #else @@ -983,6 +1135,12 @@ static void nvme_init_integrity(struct nvme_ns *ns) } #endif /* CONFIG_BLK_DEV_INTEGRITY */ +static void nvme_set_chunk_size(struct nvme_ns *ns) +{ + u32 chunk_size = (((u32)ns->noiob) << (ns->lba_shift - 9)); + blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size)); +} + static void nvme_config_discard(struct nvme_ns *ns) { struct nvme_ctrl *ctrl = ns->ctrl; @@ -991,8 +1149,15 @@ static void nvme_config_discard(struct nvme_ns *ns) BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < NVME_DSM_MAX_RANGES); - ns->queue->limits.discard_alignment = logical_block_size; - ns->queue->limits.discard_granularity = logical_block_size; + if (ctrl->nr_streams && ns->sws && ns->sgs) { + unsigned int sz = logical_block_size * ns->sws * ns->sgs; + + ns->queue->limits.discard_alignment = sz; + ns->queue->limits.discard_granularity = sz; + } else { + ns->queue->limits.discard_alignment = logical_block_size; + ns->queue->limits.discard_granularity = logical_block_size; + } blk_queue_max_discard_sectors(ns->queue, UINT_MAX); blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); @@ -1016,7 +1181,15 @@ static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id) if (ns->ctrl->vs >= NVME_VS(1, 1, 0)) memcpy(ns->eui, (*id)->eui64, sizeof(ns->eui)); if (ns->ctrl->vs >= NVME_VS(1, 2, 0)) - memcpy(ns->uuid, (*id)->nguid, sizeof(ns->uuid)); + memcpy(ns->nguid, (*id)->nguid, sizeof(ns->nguid)); + if (ns->ctrl->vs >= NVME_VS(1, 3, 0)) { + /* Don't treat error as fatal we potentially + * already have a NGUID or EUI-64 + */ + if (nvme_identify_ns_descs(ns, ns->ns_id)) + dev_warn(ns->ctrl->device, + "%s: Identify Descriptors failed\n", __func__); + } return 0; } @@ -1024,6 +1197,7 @@ static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id) static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) { struct nvme_ns *ns = disk->private_data; + struct nvme_ctrl *ctrl = ns->ctrl; u16 bs; /* @@ -1034,12 +1208,15 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ns->lba_shift == 0) ns->lba_shift = 9; bs = 1 << ns->lba_shift; + ns->noiob = le16_to_cpu(id->noiob); blk_mq_freeze_queue(disk->queue); - if (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) + if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) nvme_prep_integrity(disk, id, bs); blk_queue_logical_block_size(ns->queue, bs); + if (ns->noiob) + nvme_set_chunk_size(ns); if (ns->ms && !blk_get_integrity(disk) && !ns->ext) nvme_init_integrity(ns); if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) @@ -1047,7 +1224,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) else set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); - if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM) + if (ctrl->oncs & NVME_CTRL_ONCS_DSM) nvme_config_discard(ns); blk_mq_unfreeze_queue(disk->queue); } @@ -1283,7 +1460,7 @@ EXPORT_SYMBOL_GPL(nvme_enable_ctrl); int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) { - unsigned long timeout = SHUTDOWN_TIMEOUT + jiffies; + unsigned long timeout = jiffies + (shutdown_timeout * HZ); u32 csts; int ret; @@ -1372,7 +1549,7 @@ static void nvme_configure_apst(struct nvme_ctrl *ctrl) if (!table) return; - if (ctrl->ps_max_latency_us == 0) { + if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) { /* Turn off APST. */ apste = 0; dev_dbg(ctrl->device, "APST disabled\n"); @@ -1528,6 +1705,31 @@ static bool quirk_matches(const struct nvme_id_ctrl *id, string_matches(id->fr, q->fr, sizeof(id->fr)); } +static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +{ + size_t nqnlen; + int off; + + nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); + if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { + strcpy(ctrl->subnqn, id->subnqn); + return; + } + + if (ctrl->vs >= NVME_VS(1, 2, 1)) + dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n"); + + /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */ + off = snprintf(ctrl->subnqn, NVMF_NQN_SIZE, + "nqn.2014.08.org.nvmexpress:%4x%4x", + le16_to_cpu(id->vid), le16_to_cpu(id->ssvid)); + memcpy(ctrl->subnqn + off, id->sn, sizeof(id->sn)); + off += sizeof(id->sn); + memcpy(ctrl->subnqn + off, id->mn, sizeof(id->mn)); + off += sizeof(id->mn); + memset(ctrl->subnqn + off, 0, sizeof(ctrl->subnqn) - off); +} + /* * Initialize the cached copies of the Identify data and various controller * register in our nvme_ctrl structure. This should be called as soon as @@ -1539,7 +1741,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) u64 cap; int ret, page_shift; u32 max_hw_sectors; - u8 prev_apsta; + bool prev_apst_enabled; ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); if (ret) { @@ -1563,6 +1765,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return -EIO; } + nvme_init_subnqn(ctrl, id); + if (!ctrl->identified) { /* * Check for quirks. Quirk can depend on firmware version, @@ -1582,7 +1786,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) { - dev_warn(ctrl->dev, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n"); + dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n"); ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS; } @@ -1607,16 +1811,17 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->kas = le16_to_cpu(id->kas); ctrl->npss = id->npss; - prev_apsta = ctrl->apsta; + ctrl->apsta = id->apsta; + prev_apst_enabled = ctrl->apst_enabled; if (ctrl->quirks & NVME_QUIRK_NO_APST) { if (force_apst && id->apsta) { - dev_warn(ctrl->dev, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n"); - ctrl->apsta = 1; + dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n"); + ctrl->apst_enabled = true; } else { - ctrl->apsta = 0; + ctrl->apst_enabled = false; } } else { - ctrl->apsta = id->apsta; + ctrl->apst_enabled = id->apsta; } memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd)); @@ -1634,22 +1839,25 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ret = -EINVAL; if (!ctrl->opts->discovery_nqn && !ctrl->kas) { - dev_err(ctrl->dev, + dev_err(ctrl->device, "keep-alive support is mandatory for fabrics\n"); ret = -EINVAL; } } else { ctrl->cntlid = le16_to_cpu(id->cntlid); + ctrl->hmpre = le32_to_cpu(id->hmpre); + ctrl->hmmin = le32_to_cpu(id->hmmin); } kfree(id); - if (ctrl->apsta && !prev_apsta) + if (ctrl->apst_enabled && !prev_apst_enabled) dev_pm_qos_expose_latency_tolerance(ctrl->device); - else if (!ctrl->apsta && prev_apsta) + else if (!ctrl->apst_enabled && prev_apst_enabled) dev_pm_qos_hide_latency_tolerance(ctrl->device); nvme_configure_apst(ctrl); + nvme_configure_directives(ctrl); ctrl->identified = true; @@ -1735,7 +1943,7 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd, return nvme_dev_user_cmd(ctrl, argp); case NVME_IOCTL_RESET: dev_warn(ctrl->device, "resetting controller\n"); - return ctrl->ops->reset_ctrl(ctrl); + return nvme_reset_ctrl_sync(ctrl); case NVME_IOCTL_SUBSYS_RESET: return nvme_reset_subsystem(ctrl); case NVME_IOCTL_RESCAN: @@ -1761,7 +1969,7 @@ static ssize_t nvme_sysfs_reset(struct device *dev, struct nvme_ctrl *ctrl = dev_get_drvdata(dev); int ret; - ret = ctrl->ops->reset_ctrl(ctrl); + ret = nvme_reset_ctrl_sync(ctrl); if (ret < 0) return ret; return count; @@ -1787,8 +1995,8 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, int serial_len = sizeof(ctrl->serial); int model_len = sizeof(ctrl->model); - if (memchr_inv(ns->uuid, 0, sizeof(ns->uuid))) - return sprintf(buf, "eui.%16phN\n", ns->uuid); + if (memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) + return sprintf(buf, "eui.%16phN\n", ns->nguid); if (memchr_inv(ns->eui, 0, sizeof(ns->eui))) return sprintf(buf, "eui.%8phN\n", ns->eui); @@ -1803,11 +2011,28 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); +static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + return sprintf(buf, "%pU\n", ns->nguid); +} +static DEVICE_ATTR(nguid, S_IRUGO, nguid_show, NULL); + static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%pU\n", ns->uuid); + + /* For backward compatibility expose the NGUID to userspace if + * we have no UUID set + */ + if (uuid_is_null(&ns->uuid)) { + printk_ratelimited(KERN_WARNING + "No UUID available providing old NGUID\n"); + return sprintf(buf, "%pU\n", ns->nguid); + } + return sprintf(buf, "%pU\n", &ns->uuid); } static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); @@ -1830,6 +2055,7 @@ static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); static struct attribute *nvme_ns_attrs[] = { &dev_attr_wwid.attr, &dev_attr_uuid.attr, + &dev_attr_nguid.attr, &dev_attr_eui.attr, &dev_attr_nsid.attr, NULL, @@ -1842,7 +2068,12 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, struct nvme_ns *ns = nvme_get_ns_from_dev(dev); if (a == &dev_attr_uuid.attr) { - if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid))) + if (uuid_is_null(&ns->uuid) || + !memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) + return 0; + } + if (a == &dev_attr_nguid.attr) { + if (!memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) return 0; } if (a == &dev_attr_eui.attr) { @@ -1931,8 +2162,7 @@ static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, { struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - return snprintf(buf, PAGE_SIZE, "%s\n", - ctrl->ops->get_subsysnqn(ctrl)); + return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subnqn); } static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); @@ -1961,24 +2191,16 @@ static struct attribute *nvme_dev_attrs[] = { NULL }; -#define CHECK_ATTR(ctrl, a, name) \ - if ((a) == &dev_attr_##name.attr && \ - !(ctrl)->ops->get_##name) \ - return 0 - static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, struct device, kobj); struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - if (a == &dev_attr_delete_controller.attr) { - if (!ctrl->ops->delete_ctrl) - return 0; - } - - CHECK_ATTR(ctrl, a, subsysnqn); - CHECK_ATTR(ctrl, a, address); + if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl) + return 0; + if (a == &dev_attr_address.attr && !ctrl->ops->get_address) + return 0; return a->mode; } @@ -2019,6 +2241,32 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) return ret; } +static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns) +{ + struct streams_directive_params s; + int ret; + + if (!ctrl->nr_streams) + return 0; + + ret = nvme_get_stream_params(ctrl, &s, ns->ns_id); + if (ret) + return ret; + + ns->sws = le32_to_cpu(s.sws); + ns->sgs = le16_to_cpu(s.sgs); + + if (ns->sws) { + unsigned int bs = 1 << ns->lba_shift; + + blk_queue_io_min(ns->queue, bs * ns->sws); + if (ns->sgs) + blk_queue_io_opt(ns->queue, bs * ns->sws * ns->sgs); + } + + return 0; +} + static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns; @@ -2048,6 +2296,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); nvme_set_queue_limits(ctrl, ns->queue); + nvme_setup_streams_ns(ctrl, ns); sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance); @@ -2056,7 +2305,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (nvme_nvm_ns_supported(ns, id) && nvme_nvm_register(ns, disk_name, node)) { - dev_warn(ctrl->dev, "%s: LightNVM init failure\n", __func__); + dev_warn(ctrl->device, "%s: LightNVM init failure\n", __func__); goto out_free_id; } @@ -2231,7 +2480,7 @@ void nvme_queue_scan(struct nvme_ctrl *ctrl) * removal. */ if (ctrl->state == NVME_CTRL_LIVE) - schedule_work(&ctrl->scan_work); + queue_work(nvme_wq, &ctrl->scan_work); } EXPORT_SYMBOL_GPL(nvme_queue_scan); @@ -2286,7 +2535,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, /*FALLTHRU*/ case NVME_SC_ABORT_REQ: ++ctrl->event_limit; - schedule_work(&ctrl->async_event_work); + queue_work(nvme_wq, &ctrl->async_event_work); break; default: break; @@ -2309,7 +2558,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event); void nvme_queue_async_events(struct nvme_ctrl *ctrl) { ctrl->event_limit = NVME_NR_AERS; - schedule_work(&ctrl->async_event_work); + queue_work(nvme_wq, &ctrl->async_event_work); } EXPORT_SYMBOL_GPL(nvme_queue_async_events); @@ -2342,12 +2591,29 @@ static void nvme_release_instance(struct nvme_ctrl *ctrl) spin_unlock(&dev_list_lock); } -void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) +void nvme_stop_ctrl(struct nvme_ctrl *ctrl) { + nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); flush_work(&ctrl->scan_work); - nvme_remove_namespaces(ctrl); +} +EXPORT_SYMBOL_GPL(nvme_stop_ctrl); + +void nvme_start_ctrl(struct nvme_ctrl *ctrl) +{ + if (ctrl->kato) + nvme_start_keep_alive(ctrl); + + if (ctrl->queue_count > 1) { + nvme_queue_scan(ctrl); + nvme_queue_async_events(ctrl); + nvme_start_queues(ctrl); + } +} +EXPORT_SYMBOL_GPL(nvme_start_ctrl); +void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) +{ device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); spin_lock(&dev_list_lock); @@ -2442,8 +2708,8 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) mutex_lock(&ctrl->namespaces_mutex); - /* Forcibly start all queues to avoid having stuck requests */ - blk_mq_start_hw_queues(ctrl->admin_q); + /* Forcibly unquiesce queues to avoid blocking dispatch */ + blk_mq_unquiesce_queue(ctrl->admin_q); list_for_each_entry(ns, &ctrl->namespaces, list) { /* @@ -2455,15 +2721,8 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) revalidate_disk(ns->disk); blk_set_queue_dying(ns->queue); - /* - * Forcibly start all queues to avoid having stuck requests. - * Note that we must ensure the queues are not stopped - * when the final removal happens. - */ - blk_mq_start_hw_queues(ns->queue); - - /* draining requests in requeue list */ - blk_mq_kick_requeue_list(ns->queue); + /* Forcibly unquiesce queues to avoid blocking dispatch */ + blk_mq_unquiesce_queue(ns->queue); } mutex_unlock(&ctrl->namespaces_mutex); } @@ -2532,10 +2791,8 @@ void nvme_start_queues(struct nvme_ctrl *ctrl) struct nvme_ns *ns; mutex_lock(&ctrl->namespaces_mutex); - list_for_each_entry(ns, &ctrl->namespaces, list) { - blk_mq_start_stopped_hw_queues(ns->queue, true); - blk_mq_kick_requeue_list(ns->queue); - } + list_for_each_entry(ns, &ctrl->namespaces, list) + blk_mq_unquiesce_queue(ns->queue); mutex_unlock(&ctrl->namespaces_mutex); } EXPORT_SYMBOL_GPL(nvme_start_queues); @@ -2544,10 +2801,15 @@ int __init nvme_core_init(void) { int result; + nvme_wq = alloc_workqueue("nvme-wq", + WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + if (!nvme_wq) + return -ENOMEM; + result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", &nvme_dev_fops); if (result < 0) - return result; + goto destroy_wq; else if (result > 0) nvme_char_major = result; @@ -2559,8 +2821,10 @@ int __init nvme_core_init(void) return 0; - unregister_chrdev: +unregister_chrdev: __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); +destroy_wq: + destroy_workqueue(nvme_wq); return result; } @@ -2568,6 +2832,7 @@ void nvme_core_exit(void) { class_destroy(nvme_class); __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); + destroy_workqueue(nvme_wq); } MODULE_LICENSE("GPL"); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 990e6fb32a63..2e582a240943 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -58,7 +58,6 @@ static struct nvmf_host *nvmf_host_add(const char *hostnqn) kref_init(&host->ref); memcpy(host->nqn, hostnqn, NVMF_NQN_SIZE); - uuid_be_gen(&host->id); list_add_tail(&host->list, &nvmf_hosts); out_unlock: @@ -75,7 +74,6 @@ static struct nvmf_host *nvmf_host_default(void) return NULL; kref_init(&host->ref); - uuid_be_gen(&host->id); snprintf(host->nqn, NVMF_NQN_SIZE, "nqn.2014-08.org.nvmexpress:NVMf:uuid:%pUb", &host->id); @@ -128,16 +126,6 @@ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size) EXPORT_SYMBOL_GPL(nvmf_get_address); /** - * nvmf_get_subsysnqn() - Get subsystem NQN - * @ctrl: Host NVMe controller instance which we got the NQN - */ -const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl) -{ - return ctrl->opts->subsysnqn; -} -EXPORT_SYMBOL_GPL(nvmf_get_subsysnqn); - -/** * nvmf_reg_read32() - NVMe Fabrics "Property Get" API function. * @ctrl: Host NVMe controller instance maintaining the admin * queue used to submit the property read command to @@ -337,6 +325,24 @@ static void nvmf_log_connect_error(struct nvme_ctrl *ctrl, } } break; + + case NVME_SC_CONNECT_INVALID_HOST: + dev_err(ctrl->device, + "Connect for subsystem %s is not allowed, hostnqn: %s\n", + data->subsysnqn, data->hostnqn); + break; + + case NVME_SC_CONNECT_CTRL_BUSY: + dev_err(ctrl->device, + "Connect command failed: controller is busy or not available\n"); + break; + + case NVME_SC_CONNECT_FORMAT: + dev_err(ctrl->device, + "Connect incompatible format: %d", + cmd->connect.recfmt); + break; + default: dev_err(ctrl->device, "Connect command failed, error wo/DNR bit: %d\n", @@ -376,13 +382,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) cmd.connect.opcode = nvme_fabrics_command; cmd.connect.fctype = nvme_fabrics_type_connect; cmd.connect.qid = 0; - - /* - * fabrics spec sets a minimum of depth 32 for admin queue, - * so set the queue with this depth always until - * justification otherwise. - */ - cmd.connect.sqsize = cpu_to_le16(NVMF_AQ_DEPTH - 1); + cmd.connect.sqsize = cpu_to_le16(NVME_AQ_DEPTH - 1); /* * Set keep-alive timeout in seconds granularity (ms * 1000) @@ -395,7 +395,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) if (!data) return -ENOMEM; - memcpy(&data->hostid, &ctrl->opts->host->id, sizeof(uuid_be)); + uuid_copy(&data->hostid, &ctrl->opts->host->id); data->cntlid = cpu_to_le16(0xffff); strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); @@ -454,7 +454,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) if (!data) return -ENOMEM; - memcpy(&data->hostid, &ctrl->opts->host->id, sizeof(uuid_be)); + uuid_copy(&data->hostid, &ctrl->opts->host->id); data->cntlid = cpu_to_le16(ctrl->cntlid); strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); @@ -474,7 +474,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_io_queue); bool nvmf_should_reconnect(struct nvme_ctrl *ctrl) { if (ctrl->opts->max_reconnects != -1 && - ctrl->opts->nr_reconnects < ctrl->opts->max_reconnects) + ctrl->nr_reconnects < ctrl->opts->max_reconnects) return true; return false; @@ -547,6 +547,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_KATO, "keep_alive_tmo=%d" }, { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, + { NVMF_OPT_HOST_ID, "hostid=%s" }, { NVMF_OPT_ERR, NULL } }; @@ -558,6 +559,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, int token, ret = 0; size_t nqnlen = 0; int ctrl_loss_tmo = NVMF_DEF_CTRL_LOSS_TMO; + uuid_t hostid; /* Set defaults */ opts->queue_size = NVMF_DEF_QUEUE_SIZE; @@ -568,6 +570,8 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, if (!options) return -ENOMEM; + uuid_gen(&hostid); + while ((p = strsep(&o, ",\n")) != NULL) { if (!*p) continue; @@ -724,6 +728,17 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } opts->host_traddr = p; break; + case NVMF_OPT_HOST_ID: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (uuid_parse(p, &hostid)) { + ret = -EINVAL; + goto out; + } + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); @@ -743,6 +758,8 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->host = nvmf_default_host; } + uuid_copy(&opts->host->id, &hostid); + out: if (!opts->discovery_nqn && !opts->kato) opts->kato = NVME_DEFAULT_KATO; @@ -803,7 +820,8 @@ EXPORT_SYMBOL_GPL(nvmf_free_options); #define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN) #define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ - NVMF_OPT_KATO | NVMF_OPT_HOSTNQN) + NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ + NVMF_OPT_HOST_ID) static struct nvme_ctrl * nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) @@ -854,6 +872,15 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) goto out_unlock; } + if (strcmp(ctrl->subnqn, opts->subsysnqn)) { + dev_warn(ctrl->device, + "controller returned incorrect NQN: \"%s\".\n", + ctrl->subnqn); + mutex_unlock(&nvmf_transports_mutex); + ctrl->ops->delete_ctrl(ctrl); + return ERR_PTR(-EINVAL); + } + mutex_unlock(&nvmf_transports_mutex); return ctrl; diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index f5a9c1fb186f..bf33663218cd 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -36,7 +36,7 @@ struct nvmf_host { struct kref ref; struct list_head list; char nqn[NVMF_NQN_SIZE]; - uuid_be id; + uuid_t id; }; /** @@ -56,6 +56,7 @@ enum { NVMF_OPT_RECONNECT_DELAY = 1 << 9, NVMF_OPT_HOST_TRADDR = 1 << 10, NVMF_OPT_CTRL_LOSS_TMO = 1 << 11, + NVMF_OPT_HOST_ID = 1 << 12, }; /** @@ -80,7 +81,6 @@ enum { * @discovery_nqn: indicates if the subsysnqn is the well-known discovery NQN. * @kato: Keep-alive timeout. * @host: Virtual NVMe host, contains the NQN and Host ID. - * @nr_reconnects: number of reconnect attempted since the last ctrl failure * @max_reconnects: maximum number of allowed reconnect attempts before removing * the controller, (-1) means reconnect forever, zero means remove * immediately; @@ -98,7 +98,6 @@ struct nvmf_ctrl_options { bool discovery_nqn; unsigned int kato; struct nvmf_host *host; - int nr_reconnects; int max_reconnects; }; @@ -140,7 +139,6 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid); int nvmf_register_transport(struct nvmf_transport_ops *ops); void nvmf_unregister_transport(struct nvmf_transport_ops *ops); void nvmf_free_options(struct nvmf_ctrl_options *opts); -const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl); int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size); bool nvmf_should_reconnect(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 92964cef0f4b..d666ada39a9b 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -36,7 +36,7 @@ */ #define NVME_FC_NR_AEN_COMMANDS 1 #define NVME_FC_AQ_BLKMQ_DEPTH \ - (NVMF_AQ_DEPTH - NVME_FC_NR_AEN_COMMANDS) + (NVME_AQ_DEPTH - NVME_FC_NR_AEN_COMMANDS) #define AEN_CMDID_BASE (NVME_FC_AQ_BLKMQ_DEPTH + 1) enum nvme_fc_queue_flags { @@ -148,25 +148,22 @@ struct nvme_fc_ctrl { struct device *dev; struct nvme_fc_lport *lport; struct nvme_fc_rport *rport; - u32 queue_count; u32 cnum; u64 association_id; - u64 cap; - struct list_head ctrl_list; /* rport->ctrl_list */ struct blk_mq_tag_set admin_tag_set; struct blk_mq_tag_set tag_set; struct work_struct delete_work; - struct work_struct reset_work; struct delayed_work connect_work; struct kref ref; u32 flags; u32 iocnt; + wait_queue_head_t ioabort_wait; struct nvme_fc_fcp_op aen_ops[NVME_FC_NR_AEN_COMMANDS]; @@ -214,7 +211,6 @@ static LIST_HEAD(nvme_fc_lport_list); static DEFINE_IDA(nvme_fc_local_port_cnt); static DEFINE_IDA(nvme_fc_ctrl_cnt); -static struct workqueue_struct *nvme_fc_wq; @@ -878,8 +874,7 @@ nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl, assoc_rqst->assoc_cmd.sqsize = cpu_to_be16(qsize); /* Linux supports only Dynamic controllers */ assoc_rqst->assoc_cmd.cntlid = cpu_to_be16(0xffff); - memcpy(&assoc_rqst->assoc_cmd.hostid, &ctrl->ctrl.opts->host->id, - min_t(size_t, FCNVME_ASSOC_HOSTID_LEN, sizeof(uuid_be))); + uuid_copy(&assoc_rqst->assoc_cmd.hostid, &ctrl->ctrl.opts->host->id); strncpy(assoc_rqst->assoc_cmd.hostnqn, ctrl->ctrl.opts->host->nqn, min(FCNVME_ASSOC_HOSTNQN_LEN, NVMF_NQN_SIZE)); strncpy(assoc_rqst->assoc_cmd.subnqn, ctrl->ctrl.opts->subsysnqn, @@ -1242,8 +1237,10 @@ __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl, spin_lock_irqsave(&ctrl->lock, flags); if (unlikely(op->flags & FCOP_FLAGS_TERMIO)) { - if (ctrl->flags & FCCTRL_TERMIO) - ctrl->iocnt--; + if (ctrl->flags & FCCTRL_TERMIO) { + if (!--ctrl->iocnt) + wake_up(&ctrl->ioabort_wait); + } } if (op->flags & FCOP_FLAGS_RELEASED) complete_rq = true; @@ -1450,18 +1447,8 @@ nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq, { struct nvme_fc_ctrl *ctrl = set->driver_data; struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); - struct nvme_fc_queue *queue = &ctrl->queues[hctx_idx+1]; - - return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++); -} - -static int -nvme_fc_init_admin_request(struct blk_mq_tag_set *set, struct request *rq, - unsigned int hctx_idx, unsigned int numa_node) -{ - struct nvme_fc_ctrl *ctrl = set->driver_data; - struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); - struct nvme_fc_queue *queue = &ctrl->queues[0]; + int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; + struct nvme_fc_queue *queue = &ctrl->queues[queue_idx]; return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++); } @@ -1624,7 +1611,7 @@ nvme_fc_free_io_queues(struct nvme_fc_ctrl *ctrl) { int i; - for (i = 1; i < ctrl->queue_count; i++) + for (i = 1; i < ctrl->ctrl.queue_count; i++) nvme_fc_free_queue(&ctrl->queues[i]); } @@ -1645,10 +1632,10 @@ __nvme_fc_create_hw_queue(struct nvme_fc_ctrl *ctrl, static void nvme_fc_delete_hw_io_queues(struct nvme_fc_ctrl *ctrl) { - struct nvme_fc_queue *queue = &ctrl->queues[ctrl->queue_count - 1]; + struct nvme_fc_queue *queue = &ctrl->queues[ctrl->ctrl.queue_count - 1]; int i; - for (i = ctrl->queue_count - 1; i >= 1; i--, queue--) + for (i = ctrl->ctrl.queue_count - 1; i >= 1; i--, queue--) __nvme_fc_delete_hw_queue(ctrl, queue, i); } @@ -1658,7 +1645,7 @@ nvme_fc_create_hw_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize) struct nvme_fc_queue *queue = &ctrl->queues[1]; int i, ret; - for (i = 1; i < ctrl->queue_count; i++, queue++) { + for (i = 1; i < ctrl->ctrl.queue_count; i++, queue++) { ret = __nvme_fc_create_hw_queue(ctrl, queue, i, qsize); if (ret) goto delete_queues; @@ -1677,7 +1664,7 @@ nvme_fc_connect_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize) { int i, ret = 0; - for (i = 1; i < ctrl->queue_count; i++) { + for (i = 1; i < ctrl->ctrl.queue_count; i++) { ret = nvme_fc_connect_queue(ctrl, &ctrl->queues[i], qsize, (qsize / 5)); if (ret) @@ -1695,7 +1682,7 @@ nvme_fc_init_io_queues(struct nvme_fc_ctrl *ctrl) { int i; - for (i = 1; i < ctrl->queue_count; i++) + for (i = 1; i < ctrl->ctrl.queue_count; i++) nvme_fc_init_queue(ctrl, i, ctrl->ctrl.sqsize); } @@ -1716,6 +1703,7 @@ nvme_fc_ctrl_free(struct kref *ref) list_del(&ctrl->ctrl_list); spin_unlock_irqrestore(&ctrl->rport->lock, flags); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); blk_cleanup_queue(ctrl->ctrl.admin_q); blk_mq_free_tag_set(&ctrl->admin_tag_set); @@ -1759,16 +1747,16 @@ nvme_fc_nvme_ctrl_freed(struct nvme_ctrl *nctrl) static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) { + /* only proceed if in LIVE state - e.g. on first error */ + if (ctrl->ctrl.state != NVME_CTRL_LIVE) + return; + dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: transport association error detected: %s\n", ctrl->cnum, errmsg); dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: resetting controller\n", ctrl->cnum); - /* stop the queues on error, cleanup is in reset thread */ - if (ctrl->queue_count > 1) - nvme_stop_queues(&ctrl->ctrl); - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { dev_err(ctrl->ctrl.device, "NVME-FC{%d}: error_recovery: Couldn't change state " @@ -1776,10 +1764,7 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) return; } - if (!queue_work(nvme_fc_wq, &ctrl->reset_work)) - dev_err(ctrl->ctrl.device, - "NVME-FC{%d}: error_recovery: Failed to schedule " - "reset work\n", ctrl->cnum); + nvme_reset_ctrl(&ctrl->ctrl); } static enum blk_eh_timer_return @@ -1888,7 +1873,7 @@ nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq, * level FC exchange resource that is also outstanding. This must be * considered in all cleanup operations. */ -static int +static blk_status_t nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, struct nvme_fc_fcp_op *op, u32 data_len, enum nvmefc_fcp_datadir io_dir) @@ -1903,10 +1888,10 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, * the target device is present */ if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; if (!nvme_fc_ctrl_get(ctrl)) - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; /* format the FC-NVME CMD IU and fcp_req */ cmdiu->connection_id = cpu_to_be64(queue->connection_id); @@ -1954,8 +1939,9 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, if (ret < 0) { nvme_cleanup_cmd(op->rq); nvme_fc_ctrl_put(ctrl); - return (ret == -ENOMEM || ret == -EAGAIN) ? - BLK_MQ_RQ_QUEUE_BUSY : BLK_MQ_RQ_QUEUE_ERROR; + if (ret == -ENOMEM || ret == -EAGAIN) + return BLK_STS_RESOURCE; + return BLK_STS_IOERR; } } @@ -1972,28 +1958,25 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, queue->lldd_handle, &op->fcp_req); if (ret) { - if (op->rq) { /* normal request */ + if (op->rq) /* normal request */ nvme_fc_unmap_data(ctrl, op->rq, op); - nvme_cleanup_cmd(op->rq); - } /* else - aen. no cleanup needed */ nvme_fc_ctrl_put(ctrl); if (ret != -EBUSY) - return BLK_MQ_RQ_QUEUE_ERROR; + return BLK_STS_IOERR; - if (op->rq) { - blk_mq_stop_hw_queues(op->rq->q); - blk_mq_delay_queue(queue->hctx, NVMEFC_QUEUE_DELAY); - } - return BLK_MQ_RQ_QUEUE_BUSY; + if (op->rq) + blk_mq_delay_run_hw_queue(queue->hctx, NVMEFC_QUEUE_DELAY); + + return BLK_STS_RESOURCE; } - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } -static int +static blk_status_t nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -2006,7 +1989,7 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_command *sqe = &cmdiu->sqe; enum nvmefc_fcp_datadir io_dir; u32 data_len; - int ret; + blk_status_t ret; ret = nvme_setup_cmd(ns, rq, sqe); if (ret) @@ -2061,7 +2044,7 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx) struct nvme_fc_fcp_op *aen_op; unsigned long flags; bool terminating = false; - int ret; + blk_status_t ret; if (aer_idx > NVME_FC_NR_AEN_COMMANDS) return; @@ -2093,7 +2076,6 @@ __nvme_fc_final_op_cleanup(struct request *rq) op->flags &= ~(FCOP_FLAGS_TERMIO | FCOP_FLAGS_RELEASED | FCOP_FLAGS_COMPLETE); - nvme_cleanup_cmd(rq); nvme_fc_unmap_data(ctrl, rq, op); nvme_complete_rq(rq); nvme_fc_ctrl_put(ctrl); @@ -2193,17 +2175,20 @@ static int nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + unsigned int nr_io_queues; int ret; - ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues); + nr_io_queues = min(min(opts->nr_io_queues, num_online_cpus()), + ctrl->lport->ops->max_hw_queues); + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); if (ret) { dev_info(ctrl->ctrl.device, "set_queue_count failed: %d\n", ret); return ret; } - ctrl->queue_count = opts->nr_io_queues + 1; - if (!opts->nr_io_queues) + ctrl->ctrl.queue_count = nr_io_queues + 1; + if (!nr_io_queues) return 0; nvme_fc_init_io_queues(ctrl); @@ -2219,7 +2204,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) sizeof(struct scatterlist)) + ctrl->lport->ops->fcprqst_priv_sz; ctrl->tag_set.driver_data = ctrl; - ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1; + ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1; ctrl->tag_set.timeout = NVME_IO_TIMEOUT; ret = blk_mq_alloc_tag_set(&ctrl->tag_set); @@ -2247,7 +2232,6 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) out_delete_hw_queues: nvme_fc_delete_hw_io_queues(ctrl); out_cleanup_blk_queue: - nvme_stop_keep_alive(&ctrl->ctrl); blk_cleanup_queue(ctrl->ctrl.connect_q); out_free_tag_set: blk_mq_free_tag_set(&ctrl->tag_set); @@ -2263,17 +2247,21 @@ static int nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + unsigned int nr_io_queues; int ret; - ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues); + nr_io_queues = min(min(opts->nr_io_queues, num_online_cpus()), + ctrl->lport->ops->max_hw_queues); + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); if (ret) { dev_info(ctrl->ctrl.device, "set_queue_count failed: %d\n", ret); return ret; } + ctrl->ctrl.queue_count = nr_io_queues + 1; /* check for io queues existing */ - if (ctrl->queue_count == 1) + if (ctrl->ctrl.queue_count == 1) return 0; nvme_fc_init_io_queues(ctrl); @@ -2290,6 +2278,8 @@ nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) if (ret) goto out_delete_hw_queues; + blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues); + return 0; out_delete_hw_queues: @@ -2311,7 +2301,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) int ret; bool changed; - ++ctrl->ctrl.opts->nr_reconnects; + ++ctrl->ctrl.nr_reconnects; /* * Create the admin queue @@ -2331,7 +2321,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) goto out_delete_hw_queue; if (ctrl->ctrl.state != NVME_CTRL_NEW) - blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); ret = nvmf_connect_admin_queue(&ctrl->ctrl); if (ret) @@ -2344,7 +2334,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) * prior connection values */ - ret = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap); + ret = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap); if (ret) { dev_err(ctrl->ctrl.device, "prop_get NVME_REG_CAP failed\n"); @@ -2352,9 +2342,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) } ctrl->ctrl.sqsize = - min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize); + min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap) + 1, ctrl->ctrl.sqsize); - ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); + ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); if (ret) goto out_disconnect_admin_queue; @@ -2375,8 +2365,6 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) goto out_disconnect_admin_queue; } - nvme_start_keep_alive(&ctrl->ctrl); - /* FC-NVME supports normal SGL Data Block Descriptors */ if (opts->queue_size > ctrl->ctrl.maxcmd) { @@ -2396,7 +2384,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) * Create the io queues */ - if (ctrl->queue_count > 1) { + if (ctrl->ctrl.queue_count > 1) { if (ctrl->ctrl.state == NVME_CTRL_NEW) ret = nvme_fc_create_io_queues(ctrl); else @@ -2408,19 +2396,14 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); - ctrl->ctrl.opts->nr_reconnects = 0; + ctrl->ctrl.nr_reconnects = 0; - if (ctrl->queue_count > 1) { - nvme_start_queues(&ctrl->ctrl); - nvme_queue_scan(&ctrl->ctrl); - nvme_queue_async_events(&ctrl->ctrl); - } + nvme_start_ctrl(&ctrl->ctrl); return 0; /* Success */ out_term_aen_ops: nvme_fc_term_aen_ops(ctrl); - nvme_stop_keep_alive(&ctrl->ctrl); out_disconnect_admin_queue: /* send a Disconnect(association) LS to fc-nvme target */ nvme_fc_xmt_disconnect_assoc(ctrl); @@ -2443,8 +2426,6 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) { unsigned long flags; - nvme_stop_keep_alive(&ctrl->ctrl); - spin_lock_irqsave(&ctrl->lock, flags); ctrl->flags |= FCCTRL_TERMIO; ctrl->iocnt = 0; @@ -2462,7 +2443,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) * io requests back to the block layer as part of normal completions * (but with error status). */ - if (ctrl->queue_count > 1) { + if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); @@ -2485,7 +2466,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) * use blk_mq_tagset_busy_itr() and the transport routine to * terminate the exchanges. */ - blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); @@ -2494,11 +2475,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) /* wait for all io that had to be aborted */ spin_lock_irqsave(&ctrl->lock, flags); - while (ctrl->iocnt) { - spin_unlock_irqrestore(&ctrl->lock, flags); - msleep(1000); - spin_lock_irqsave(&ctrl->lock, flags); - } + wait_event_lock_irq(ctrl->ioabort_wait, ctrl->iocnt == 0, ctrl->lock); ctrl->flags &= ~FCCTRL_TERMIO; spin_unlock_irqrestore(&ctrl->lock, flags); @@ -2528,9 +2505,10 @@ nvme_fc_delete_ctrl_work(struct work_struct *work) struct nvme_fc_ctrl *ctrl = container_of(work, struct nvme_fc_ctrl, delete_work); - cancel_work_sync(&ctrl->reset_work); + cancel_work_sync(&ctrl->ctrl.reset_work); cancel_delayed_work_sync(&ctrl->connect_work); - + nvme_stop_ctrl(&ctrl->ctrl); + nvme_remove_namespaces(&ctrl->ctrl); /* * kill the association on the link side. this will block * waiting for io to terminate @@ -2555,7 +2533,7 @@ __nvme_fc_schedule_delete_work(struct nvme_fc_ctrl *ctrl) if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) return true; - if (!queue_work(nvme_fc_wq, &ctrl->delete_work)) + if (!queue_work(nvme_wq, &ctrl->delete_work)) return true; return false; @@ -2582,7 +2560,7 @@ nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl) ret = __nvme_fc_del_ctrl(ctrl); if (!ret) - flush_workqueue(nvme_fc_wq); + flush_workqueue(nvme_wq); nvme_put_ctrl(&ctrl->ctrl); @@ -2607,13 +2585,13 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) dev_info(ctrl->ctrl.device, "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); - queue_delayed_work(nvme_fc_wq, &ctrl->connect_work, + queue_delayed_work(nvme_wq, &ctrl->connect_work, ctrl->ctrl.opts->reconnect_delay * HZ); } else { dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: Max reconnect attempts (%d) " "reached. Removing controller\n", - ctrl->cnum, ctrl->ctrl.opts->nr_reconnects); + ctrl->cnum, ctrl->ctrl.nr_reconnects); WARN_ON(__nvme_fc_schedule_delete_work(ctrl)); } } @@ -2622,9 +2600,10 @@ static void nvme_fc_reset_ctrl_work(struct work_struct *work) { struct nvme_fc_ctrl *ctrl = - container_of(work, struct nvme_fc_ctrl, reset_work); + container_of(work, struct nvme_fc_ctrl, ctrl.reset_work); int ret; + nvme_stop_ctrl(&ctrl->ctrl); /* will block will waiting for io to terminate */ nvme_fc_delete_association(ctrl); @@ -2636,29 +2615,6 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) "NVME-FC{%d}: controller reset complete\n", ctrl->cnum); } -/* - * called by the nvme core layer, for sysfs interface that requests - * a reset of the nvme controller - */ -static int -nvme_fc_reset_nvme_ctrl(struct nvme_ctrl *nctrl) -{ - struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); - - dev_info(ctrl->ctrl.device, - "NVME-FC{%d}: admin requested controller reset\n", ctrl->cnum); - - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) - return -EBUSY; - - if (!queue_work(nvme_fc_wq, &ctrl->reset_work)) - return -EBUSY; - - flush_work(&ctrl->reset_work); - - return 0; -} - static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .name = "fc", .module = THIS_MODULE, @@ -2666,11 +2622,9 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, - .reset_ctrl = nvme_fc_reset_nvme_ctrl, .free_ctrl = nvme_fc_nvme_ctrl_freed, .submit_async_event = nvme_fc_submit_async_event, .delete_ctrl = nvme_fc_del_nvme_ctrl, - .get_subsysnqn = nvmf_get_subsysnqn, .get_address = nvmf_get_address, }; @@ -2696,7 +2650,7 @@ nvme_fc_connect_ctrl_work(struct work_struct *work) static const struct blk_mq_ops nvme_fc_admin_mq_ops = { .queue_rq = nvme_fc_queue_rq, .complete = nvme_fc_complete_rq, - .init_request = nvme_fc_init_admin_request, + .init_request = nvme_fc_init_request, .exit_request = nvme_fc_exit_request, .reinit_request = nvme_fc_reinit_request, .init_hctx = nvme_fc_init_admin_hctx, @@ -2741,23 +2695,22 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, kref_init(&ctrl->ref); INIT_WORK(&ctrl->delete_work, nvme_fc_delete_ctrl_work); - INIT_WORK(&ctrl->reset_work, nvme_fc_reset_ctrl_work); + INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work); INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work); spin_lock_init(&ctrl->lock); /* io queue count */ - ctrl->queue_count = min_t(unsigned int, + ctrl->ctrl.queue_count = min_t(unsigned int, opts->nr_io_queues, lport->ops->max_hw_queues); - opts->nr_io_queues = ctrl->queue_count; /* so opts has valid value */ - ctrl->queue_count++; /* +1 for admin queue */ + ctrl->ctrl.queue_count++; /* +1 for admin queue */ ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; ret = -ENOMEM; - ctrl->queues = kcalloc(ctrl->queue_count, sizeof(struct nvme_fc_queue), - GFP_KERNEL); + ctrl->queues = kcalloc(ctrl->ctrl.queue_count, + sizeof(struct nvme_fc_queue), GFP_KERNEL); if (!ctrl->queues) goto out_free_ida; @@ -2808,6 +2761,9 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, nvme_uninit_ctrl(&ctrl->ctrl); nvme_put_ctrl(&ctrl->ctrl); + /* Remove core ctrl ref. */ + nvme_put_ctrl(&ctrl->ctrl); + /* as we're past the point where we transition to the ref * counting teardown path, if we return a bad pointer here, * the calling routine, thinking it's prior to the @@ -2966,20 +2922,7 @@ static struct nvmf_transport_ops nvme_fc_transport = { static int __init nvme_fc_init_module(void) { - int ret; - - nvme_fc_wq = create_workqueue("nvme_fc_wq"); - if (!nvme_fc_wq) - return -ENOMEM; - - ret = nvmf_register_transport(&nvme_fc_transport); - if (ret) - goto err; - - return 0; -err: - destroy_workqueue(nvme_fc_wq); - return ret; + return nvmf_register_transport(&nvme_fc_transport); } static void __exit nvme_fc_exit_module(void) @@ -2990,8 +2933,6 @@ static void __exit nvme_fc_exit_module(void) nvmf_unregister_transport(&nvme_fc_transport); - destroy_workqueue(nvme_fc_wq); - ida_destroy(&nvme_fc_local_port_cnt); ida_destroy(&nvme_fc_ctrl_cnt); } diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index f5df78ed1e10..be8541335e31 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -242,7 +242,7 @@ static inline void _nvme_nvm_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64); BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960); BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 16); - BUILD_BUG_ON(sizeof(struct nvme_nvm_id) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_nvm_id) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_nvm_bb_tbl) != 64); } @@ -480,7 +480,7 @@ static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns, rqd->bio->bi_iter.bi_sector)); } -static void nvme_nvm_end_io(struct request *rq, int error) +static void nvme_nvm_end_io(struct request *rq, blk_status_t status) { struct nvm_rq *rqd = rq->end_io_data; @@ -509,7 +509,7 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0, NVME_QID_ANY); if (IS_ERR(rq)) { kfree(cmd); - return -ENOMEM; + return PTR_ERR(rq); } rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; @@ -571,13 +571,6 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .max_phys_sect = 64, }; -static void nvme_nvm_end_user_vio(struct request *rq, int error) -{ - struct completion *waiting = rq->end_io_data; - - complete(waiting); -} - static int nvme_nvm_submit_user_cmd(struct request_queue *q, struct nvme_ns *ns, struct nvme_nvm_command *vcmd, @@ -608,7 +601,6 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, rq->timeout = timeout ? timeout : ADMIN_TIMEOUT; rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; - rq->end_io_data = &wait; if (ppa_buf && ppa_len) { ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma); @@ -662,9 +654,7 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, } submit: - blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_user_vio); - - wait_for_completion_io(&wait); + blk_execute_rq(q, NULL, rq, 0); if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) ret = -EINTR; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9d6a070d4391..8f2a168ddc01 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -27,12 +27,11 @@ extern unsigned char nvme_io_timeout; extern unsigned char admin_timeout; #define ADMIN_TIMEOUT (admin_timeout * HZ) -extern unsigned char shutdown_timeout; -#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) - #define NVME_DEFAULT_KATO 5 #define NVME_KATO_GRACE 10 +extern struct workqueue_struct *nvme_wq; + enum { NVME_NS_LBA = 0, NVME_NS_LIGHTNVM = 1, @@ -131,6 +130,7 @@ struct nvme_ctrl { struct device *device; /* char device */ struct list_head node; struct ida ns_ida; + struct work_struct reset_work; struct opal_dev *opal_dev; @@ -138,15 +138,20 @@ struct nvme_ctrl { char serial[20]; char model[40]; char firmware_rev[8]; + char subnqn[NVMF_NQN_SIZE]; u16 cntlid; u32 ctrl_config; + u32 queue_count; + u64 cap; u32 page_size; u32 max_hw_sectors; u16 oncs; u16 vid; u16 oacs; + u16 nssa; + u16 nr_streams; atomic_t abort_limit; u8 event_limit; u8 vwc; @@ -165,6 +170,10 @@ struct nvme_ctrl { /* Power saving configuration */ u64 ps_max_latency_us; + bool apst_enabled; + + u32 hmpre; + u32 hmmin; /* Fabrics only */ u16 sqsize; @@ -172,12 +181,10 @@ struct nvme_ctrl { u32 iorcsz; u16 icdoff; u16 maxcmd; + int nr_reconnects; struct nvmf_ctrl_options *opts; }; -/* - * An NVM Express namespace is equivalent to a SCSI LUN - */ struct nvme_ns { struct list_head list; @@ -189,14 +196,18 @@ struct nvme_ns { int instance; u8 eui[8]; - u8 uuid[16]; + u8 nguid[16]; + uuid_t uuid; unsigned ns_id; int lba_shift; u16 ms; + u16 sgs; + u32 sws; bool ext; u8 pi_type; unsigned long flags; + u16 noiob; #define NVME_NS_REMOVING 0 #define NVME_NS_DEAD 1 @@ -214,11 +225,9 @@ struct nvme_ctrl_ops { int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); - int (*reset_ctrl)(struct nvme_ctrl *ctrl); void (*free_ctrl)(struct nvme_ctrl *ctrl); void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx); int (*delete_ctrl)(struct nvme_ctrl *ctrl); - const char *(*get_subsysnqn)(struct nvme_ctrl *ctrl); int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); }; @@ -271,6 +280,8 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, const struct nvme_ctrl_ops *ops, unsigned long quirks); void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); +void nvme_start_ctrl(struct nvme_ctrl *ctrl); +void nvme_stop_ctrl(struct nvme_ctrl *ctrl); void nvme_put_ctrl(struct nvme_ctrl *ctrl); int nvme_init_identify(struct nvme_ctrl *ctrl); @@ -296,7 +307,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags, int qid); -int nvme_setup_cmd(struct nvme_ns *ns, struct request *req, +blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, struct nvme_command *cmd); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); @@ -310,23 +321,10 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, u32 meta_seed, u32 *result, unsigned timeout); -int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id); -int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, - struct nvme_id_ns **id); -int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log); -int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, - void *buffer, size_t buflen, u32 *result); -int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, - void *buffer, size_t buflen, u32 *result); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); - -struct sg_io_hdr; - -int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr); -int nvme_sg_io32(struct nvme_ns *ns, unsigned long arg); -int nvme_sg_get_version_num(int __user *ip); +int nvme_reset_ctrl(struct nvme_ctrl *ctrl); #ifdef CONFIG_NVM int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 951042a375d6..d10d2f279d19 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -17,28 +17,15 @@ #include <linux/blkdev.h> #include <linux/blk-mq.h> #include <linux/blk-mq-pci.h> -#include <linux/cpu.h> -#include <linux/delay.h> #include <linux/dmi.h> -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/genhd.h> -#include <linux/hdreg.h> -#include <linux/idr.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/io.h> -#include <linux/kdev_t.h> -#include <linux/kernel.h> #include <linux/mm.h> #include <linux/module.h> -#include <linux/moduleparam.h> #include <linux/mutex.h> #include <linux/pci.h> #include <linux/poison.h> -#include <linux/ptrace.h> -#include <linux/sched.h> -#include <linux/slab.h> #include <linux/t10-pi.h> #include <linux/timer.h> #include <linux/types.h> @@ -48,8 +35,6 @@ #include "nvme.h" -#define NVME_Q_DEPTH 1024 -#define NVME_AQ_DEPTH 256 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) @@ -66,12 +51,24 @@ static bool use_cmb_sqes = true; module_param(use_cmb_sqes, bool, 0644); MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes"); -static struct workqueue_struct *nvme_workq; +static unsigned int max_host_mem_size_mb = 128; +module_param(max_host_mem_size_mb, uint, 0444); +MODULE_PARM_DESC(max_host_mem_size_mb, + "Maximum Host Memory Buffer (HMB) size per controller (in MiB)"); + +static int io_queue_depth_set(const char *val, const struct kernel_param *kp); +static const struct kernel_param_ops io_queue_depth_ops = { + .set = io_queue_depth_set, + .get = param_get_int, +}; + +static int io_queue_depth = 1024; +module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); +MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); struct nvme_dev; struct nvme_queue; -static int nvme_reset(struct nvme_dev *dev); static void nvme_process_cq(struct nvme_queue *nvmeq); static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); @@ -86,15 +83,13 @@ struct nvme_dev { struct device *dev; struct dma_pool *prp_page_pool; struct dma_pool *prp_small_pool; - unsigned queue_count; unsigned online_queues; unsigned max_qid; int q_depth; u32 db_stride; void __iomem *bar; - struct work_struct reset_work; + unsigned long bar_mapped_size; struct work_struct remove_work; - struct timer_list watchdog_timer; struct mutex shutdown_lock; bool subsystem; void __iomem *cmb; @@ -104,12 +99,31 @@ struct nvme_dev { u32 cmbloc; struct nvme_ctrl ctrl; struct completion ioq_wait; + + /* shadow doorbell buffer support: */ u32 *dbbuf_dbs; dma_addr_t dbbuf_dbs_dma_addr; u32 *dbbuf_eis; dma_addr_t dbbuf_eis_dma_addr; + + /* host memory buffer support: */ + u64 host_mem_size; + u32 nr_host_mem_descs; + struct nvme_host_mem_buf_desc *host_mem_descs; + void **host_mem_desc_bufs; }; +static int io_queue_depth_set(const char *val, const struct kernel_param *kp) +{ + int n = 0, ret; + + ret = kstrtoint(val, 10, &n); + if (ret != 0 || n < 2) + return -EINVAL; + + return param_set_int(val, kp); +} + static inline unsigned int sq_idx(unsigned int qid, u32 stride) { return qid * 2 * stride; @@ -185,8 +199,8 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64); BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64); BUILD_BUG_ON(sizeof(struct nvme_command) != 64); - BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); - BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); @@ -350,19 +364,6 @@ static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_i nvmeq->tags = NULL; } -static int nvme_admin_init_request(struct blk_mq_tag_set *set, - struct request *req, unsigned int hctx_idx, - unsigned int numa_node) -{ - struct nvme_dev *dev = set->driver_data; - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = dev->queues[0]; - - BUG_ON(!nvmeq); - iod->nvmeq = nvmeq; - return 0; -} - static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { @@ -382,7 +383,8 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, { struct nvme_dev *dev = set->driver_data; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; + int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0; + struct nvme_queue *nvmeq = dev->queues[queue_idx]; BUG_ON(!nvmeq); iod->nvmeq = nvmeq; @@ -427,7 +429,7 @@ static __le64 **iod_list(struct request *req) return (__le64 **)(iod->sg + blk_rq_nr_phys_segments(req)); } -static int nvme_init_iod(struct request *rq, struct nvme_dev *dev) +static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) { struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); int nseg = blk_rq_nr_phys_segments(rq); @@ -436,7 +438,7 @@ static int nvme_init_iod(struct request *rq, struct nvme_dev *dev) if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC); if (!iod->sg) - return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_STS_RESOURCE; } else { iod->sg = iod->inline_sg; } @@ -446,7 +448,7 @@ static int nvme_init_iod(struct request *rq, struct nvme_dev *dev) iod->nents = 0; iod->length = size; - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } static void nvme_free_iod(struct nvme_dev *dev, struct request *req) @@ -616,21 +618,21 @@ static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req) return true; } -static int nvme_map_data(struct nvme_dev *dev, struct request *req, +static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct nvme_command *cmnd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct request_queue *q = req->q; enum dma_data_direction dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; - int ret = BLK_MQ_RQ_QUEUE_ERROR; + blk_status_t ret = BLK_STS_IOERR; sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); iod->nents = blk_rq_map_sg(q, req, iod->sg); if (!iod->nents) goto out; - ret = BLK_MQ_RQ_QUEUE_BUSY; + ret = BLK_STS_RESOURCE; if (!dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir, DMA_ATTR_NO_WARN)) goto out; @@ -638,7 +640,7 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req, if (!nvme_setup_prps(dev, req)) goto out_unmap; - ret = BLK_MQ_RQ_QUEUE_ERROR; + ret = BLK_STS_IOERR; if (blk_integrity_rq(req)) { if (blk_rq_count_integrity_sg(q, req->bio) != 1) goto out_unmap; @@ -658,7 +660,7 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req, cmnd->rw.dptr.prp2 = cpu_to_le64(iod->first_dma); if (blk_integrity_rq(req)) cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg)); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; out_unmap: dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); @@ -688,7 +690,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) /* * NOTE: ns is NULL when called on the admin queue. */ -static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct nvme_ns *ns = hctx->queue->queuedata; @@ -696,47 +698,34 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_dev *dev = nvmeq->dev; struct request *req = bd->rq; struct nvme_command cmnd; - int ret = BLK_MQ_RQ_QUEUE_OK; - - /* - * If formated with metadata, require the block layer provide a buffer - * unless this namespace is formated such that the metadata can be - * stripped/generated by the controller with PRACT=1. - */ - if (ns && ns->ms && !blk_integrity_rq(req)) { - if (!(ns->pi_type && ns->ms == 8) && - !blk_rq_is_passthrough(req)) { - blk_mq_end_request(req, -EFAULT); - return BLK_MQ_RQ_QUEUE_OK; - } - } + blk_status_t ret; ret = nvme_setup_cmd(ns, req, &cmnd); - if (ret != BLK_MQ_RQ_QUEUE_OK) + if (ret) return ret; ret = nvme_init_iod(req, dev); - if (ret != BLK_MQ_RQ_QUEUE_OK) + if (ret) goto out_free_cmd; - if (blk_rq_nr_phys_segments(req)) + if (blk_rq_nr_phys_segments(req)) { ret = nvme_map_data(dev, req, &cmnd); - - if (ret != BLK_MQ_RQ_QUEUE_OK) - goto out_cleanup_iod; + if (ret) + goto out_cleanup_iod; + } blk_mq_start_request(req); spin_lock_irq(&nvmeq->q_lock); if (unlikely(nvmeq->cq_vector < 0)) { - ret = BLK_MQ_RQ_QUEUE_ERROR; + ret = BLK_STS_IOERR; spin_unlock_irq(&nvmeq->q_lock); goto out_cleanup_iod; } __nvme_submit_cmd(nvmeq, &cmnd); nvme_process_cq(nvmeq); spin_unlock_irq(&nvmeq->q_lock); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; out_cleanup_iod: nvme_free_iod(dev, req); out_free_cmd: @@ -759,65 +748,75 @@ static inline bool nvme_cqe_valid(struct nvme_queue *nvmeq, u16 head, return (le16_to_cpu(nvmeq->cqes[head].status) & 1) == phase; } -static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) +static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq) { - u16 head, phase; - - head = nvmeq->cq_head; - phase = nvmeq->cq_phase; - - while (nvme_cqe_valid(nvmeq, head, phase)) { - struct nvme_completion cqe = nvmeq->cqes[head]; - struct request *req; - - if (++head == nvmeq->q_depth) { - head = 0; - phase = !phase; - } + u16 head = nvmeq->cq_head; - if (tag && *tag == cqe.command_id) - *tag = -1; - - if (unlikely(cqe.command_id >= nvmeq->q_depth)) { - dev_warn(nvmeq->dev->ctrl.device, - "invalid id %d completed on queue %d\n", - cqe.command_id, le16_to_cpu(cqe.sq_id)); - continue; - } + if (likely(nvmeq->cq_vector >= 0)) { + if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db, + nvmeq->dbbuf_cq_ei)) + writel(head, nvmeq->q_db + nvmeq->dev->db_stride); + } +} - /* - * AEN requests are special as they don't time out and can - * survive any kind of queue freeze and often don't respond to - * aborts. We don't even bother to allocate a struct request - * for them but rather special case them here. - */ - if (unlikely(nvmeq->qid == 0 && - cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) { - nvme_complete_async_event(&nvmeq->dev->ctrl, - cqe.status, &cqe.result); - continue; - } +static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, + struct nvme_completion *cqe) +{ + struct request *req; - req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id); - nvme_end_request(req, cqe.status, cqe.result); + if (unlikely(cqe->command_id >= nvmeq->q_depth)) { + dev_warn(nvmeq->dev->ctrl.device, + "invalid id %d completed on queue %d\n", + cqe->command_id, le16_to_cpu(cqe->sq_id)); + return; } - if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. + */ + if (unlikely(nvmeq->qid == 0 && + cqe->command_id >= NVME_AQ_BLKMQ_DEPTH)) { + nvme_complete_async_event(&nvmeq->dev->ctrl, + cqe->status, &cqe->result); return; + } - if (likely(nvmeq->cq_vector >= 0)) - if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db, - nvmeq->dbbuf_cq_ei)) - writel(head, nvmeq->q_db + nvmeq->dev->db_stride); - nvmeq->cq_head = head; - nvmeq->cq_phase = phase; + req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id); + nvme_end_request(req, cqe->status, cqe->result); +} + +static inline bool nvme_read_cqe(struct nvme_queue *nvmeq, + struct nvme_completion *cqe) +{ + if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) { + *cqe = nvmeq->cqes[nvmeq->cq_head]; - nvmeq->cqe_seen = 1; + if (++nvmeq->cq_head == nvmeq->q_depth) { + nvmeq->cq_head = 0; + nvmeq->cq_phase = !nvmeq->cq_phase; + } + return true; + } + return false; } static void nvme_process_cq(struct nvme_queue *nvmeq) { - __nvme_process_cq(nvmeq, NULL); + struct nvme_completion cqe; + int consumed = 0; + + while (nvme_read_cqe(nvmeq, &cqe)) { + nvme_handle_cqe(nvmeq, &cqe); + consumed++; + } + + if (consumed) { + nvme_ring_cq_doorbell(nvmeq); + nvmeq->cqe_seen = 1; + } } static irqreturn_t nvme_irq(int irq, void *data) @@ -842,16 +841,28 @@ static irqreturn_t nvme_irq_check(int irq, void *data) static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag) { - if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) { - spin_lock_irq(&nvmeq->q_lock); - __nvme_process_cq(nvmeq, &tag); - spin_unlock_irq(&nvmeq->q_lock); + struct nvme_completion cqe; + int found = 0, consumed = 0; - if (tag == -1) - return 1; - } + if (!nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) + return 0; - return 0; + spin_lock_irq(&nvmeq->q_lock); + while (nvme_read_cqe(nvmeq, &cqe)) { + nvme_handle_cqe(nvmeq, &cqe); + consumed++; + + if (tag == cqe.command_id) { + found = 1; + break; + } + } + + if (consumed) + nvme_ring_cq_doorbell(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); + + return found; } static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) @@ -939,7 +950,7 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } -static void abort_endio(struct request *req, int error) +static void abort_endio(struct request *req, blk_status_t error) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = iod->nvmeq; @@ -950,6 +961,51 @@ static void abort_endio(struct request *req, int error) blk_mq_free_request(req); } +static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) +{ + + /* If true, indicates loss of adapter communication, possibly by a + * NVMe Subsystem reset. + */ + bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); + + /* If there is a reset ongoing, we shouldn't reset again. */ + if (dev->ctrl.state == NVME_CTRL_RESETTING) + return false; + + /* We shouldn't reset unless the controller is on fatal error state + * _or_ if we lost the communication with it. + */ + if (!(csts & NVME_CSTS_CFS) && !nssro) + return false; + + /* If PCI error recovery process is happening, we cannot reset or + * the recovery mechanism will surely fail. + */ + if (pci_channel_offline(to_pci_dev(dev->dev))) + return false; + + return true; +} + +static void nvme_warn_reset(struct nvme_dev *dev, u32 csts) +{ + /* Read a config register to help see what died. */ + u16 pci_status; + int result; + + result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS, + &pci_status); + if (result == PCIBIOS_SUCCESSFUL) + dev_warn(dev->ctrl.device, + "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n", + csts, pci_status); + else + dev_warn(dev->ctrl.device, + "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n", + csts, result); +} + static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -957,6 +1013,17 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) struct nvme_dev *dev = nvmeq->dev; struct request *abort_req; struct nvme_command cmd; + u32 csts = readl(dev->bar + NVME_REG_CSTS); + + /* + * Reset immediately if the controller is failed + */ + if (nvme_should_reset(dev, csts)) { + nvme_warn_reset(dev, csts); + nvme_dev_disable(dev, false); + nvme_reset_ctrl(&dev->ctrl); + return BLK_EH_HANDLED; + } /* * Did we miss an interrupt? @@ -993,7 +1060,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) "I/O %d QID %d timeout, reset controller\n", req->tag, nvmeq->qid); nvme_dev_disable(dev, false); - nvme_reset(dev); + nvme_reset_ctrl(&dev->ctrl); /* * Mark the request as handled, since the inline shutdown @@ -1051,9 +1118,9 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) { int i; - for (i = dev->queue_count - 1; i >= lowest; i--) { + for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) { struct nvme_queue *nvmeq = dev->queues[i]; - dev->queue_count--; + dev->ctrl.queue_count--; dev->queues[i] = NULL; nvme_free_queue(nvmeq); } @@ -1078,7 +1145,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) spin_unlock_irq(&nvmeq->q_lock); if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) - blk_mq_stop_hw_queues(nvmeq->dev->ctrl.admin_q); + blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q); pci_free_irq(to_pci_dev(nvmeq->dev->dev), vector, nvmeq); @@ -1097,8 +1164,7 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) if (shutdown) nvme_shutdown_ctrl(&dev->ctrl); else - nvme_disable_ctrl(&dev->ctrl, lo_hi_readq( - dev->bar + NVME_REG_CAP)); + nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); spin_lock_irq(&nvmeq->q_lock); nvme_process_cq(nvmeq); @@ -1173,7 +1239,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->qid = qid; nvmeq->cq_vector = -1; dev->queues[qid] = nvmeq; - dev->queue_count++; + dev->ctrl.queue_count++; return nvmeq; @@ -1247,7 +1313,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = { .complete = nvme_pci_complete_rq, .init_hctx = nvme_admin_init_hctx, .exit_hctx = nvme_admin_exit_hctx, - .init_request = nvme_admin_init_request, + .init_request = nvme_init_request, .timeout = nvme_timeout, }; @@ -1269,7 +1335,7 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev) * user requests may be waiting on a stopped queue. Start the * queue to flush these to completion. */ - blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true); + blk_mq_unquiesce_queue(dev->ctrl.admin_q); blk_cleanup_queue(dev->ctrl.admin_q); blk_mq_free_tag_set(&dev->admin_tagset); } @@ -1306,26 +1372,55 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) return -ENODEV; } } else - blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true); + blk_mq_unquiesce_queue(dev->ctrl.admin_q); return 0; } -static int nvme_configure_admin_queue(struct nvme_dev *dev) +static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) +{ + return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride); +} + +static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size) +{ + struct pci_dev *pdev = to_pci_dev(dev->dev); + + if (size <= dev->bar_mapped_size) + return 0; + if (size > pci_resource_len(pdev, 0)) + return -ENOMEM; + if (dev->bar) + iounmap(dev->bar); + dev->bar = ioremap(pci_resource_start(pdev, 0), size); + if (!dev->bar) { + dev->bar_mapped_size = 0; + return -ENOMEM; + } + dev->bar_mapped_size = size; + dev->dbs = dev->bar + NVME_REG_DBS; + + return 0; +} + +static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) { int result; u32 aqa; - u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP); struct nvme_queue *nvmeq; + result = nvme_remap_bar(dev, db_bar_size(dev, 0)); + if (result < 0) + return result; + dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ? - NVME_CAP_NSSRC(cap) : 0; + NVME_CAP_NSSRC(dev->ctrl.cap) : 0; if (dev->subsystem && (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); - result = nvme_disable_ctrl(&dev->ctrl, cap); + result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); if (result < 0) return result; @@ -1344,7 +1439,7 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ); lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ); - result = nvme_enable_ctrl(&dev->ctrl, cap); + result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap); if (result) return result; @@ -1358,72 +1453,12 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) -{ - - /* If true, indicates loss of adapter communication, possibly by a - * NVMe Subsystem reset. - */ - bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); - - /* If there is a reset ongoing, we shouldn't reset again. */ - if (dev->ctrl.state == NVME_CTRL_RESETTING) - return false; - - /* We shouldn't reset unless the controller is on fatal error state - * _or_ if we lost the communication with it. - */ - if (!(csts & NVME_CSTS_CFS) && !nssro) - return false; - - /* If PCI error recovery process is happening, we cannot reset or - * the recovery mechanism will surely fail. - */ - if (pci_channel_offline(to_pci_dev(dev->dev))) - return false; - - return true; -} - -static void nvme_warn_reset(struct nvme_dev *dev, u32 csts) -{ - /* Read a config register to help see what died. */ - u16 pci_status; - int result; - - result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS, - &pci_status); - if (result == PCIBIOS_SUCCESSFUL) - dev_warn(dev->ctrl.device, - "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n", - csts, pci_status); - else - dev_warn(dev->ctrl.device, - "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n", - csts, result); -} - -static void nvme_watchdog_timer(unsigned long data) -{ - struct nvme_dev *dev = (struct nvme_dev *)data; - u32 csts = readl(dev->bar + NVME_REG_CSTS); - - /* Skip controllers under certain specific conditions. */ - if (nvme_should_reset(dev, csts)) { - if (!nvme_reset(dev)) - nvme_warn_reset(dev, csts); - return; - } - - mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ)); -} - static int nvme_create_io_queues(struct nvme_dev *dev) { unsigned i, max; int ret = 0; - for (i = dev->queue_count; i <= dev->max_qid; i++) { + for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { /* vector == qid - 1, match nvme_create_queue */ if (!nvme_alloc_queue(dev, i, dev->q_depth, pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) { @@ -1432,7 +1467,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev) } } - max = min(dev->max_qid, dev->queue_count - 1); + max = min(dev->max_qid, dev->ctrl.queue_count - 1); for (i = dev->online_queues; i <= max; i++) { ret = nvme_create_queue(dev->queues[i], i); if (ret) @@ -1514,18 +1549,171 @@ static inline void nvme_release_cmb(struct nvme_dev *dev) } } -static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) +static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits) +{ + size_t len = dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs); + struct nvme_command c; + u64 dma_addr; + int ret; + + dma_addr = dma_map_single(dev->dev, dev->host_mem_descs, len, + DMA_TO_DEVICE); + if (dma_mapping_error(dev->dev, dma_addr)) + return -ENOMEM; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_set_features; + c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF); + c.features.dword11 = cpu_to_le32(bits); + c.features.dword12 = cpu_to_le32(dev->host_mem_size >> + ilog2(dev->ctrl.page_size)); + c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr)); + c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr)); + c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs); + + ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); + if (ret) { + dev_warn(dev->ctrl.device, + "failed to set host mem (err %d, flags %#x).\n", + ret, bits); + } + dma_unmap_single(dev->dev, dma_addr, len, DMA_TO_DEVICE); + return ret; +} + +static void nvme_free_host_mem(struct nvme_dev *dev) +{ + int i; + + for (i = 0; i < dev->nr_host_mem_descs; i++) { + struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i]; + size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size; + + dma_free_coherent(dev->dev, size, dev->host_mem_desc_bufs[i], + le64_to_cpu(desc->addr)); + } + + kfree(dev->host_mem_desc_bufs); + dev->host_mem_desc_bufs = NULL; + kfree(dev->host_mem_descs); + dev->host_mem_descs = NULL; +} + +static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) +{ + struct nvme_host_mem_buf_desc *descs; + u32 chunk_size, max_entries; + int i = 0; + void **bufs; + u64 size = 0, tmp; + + /* start big and work our way down */ + chunk_size = min(preferred, (u64)PAGE_SIZE << MAX_ORDER); +retry: + tmp = (preferred + chunk_size - 1); + do_div(tmp, chunk_size); + max_entries = tmp; + descs = kcalloc(max_entries, sizeof(*descs), GFP_KERNEL); + if (!descs) + goto out; + + bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL); + if (!bufs) + goto out_free_descs; + + for (size = 0; size < preferred; size += chunk_size) { + u32 len = min_t(u64, chunk_size, preferred - size); + dma_addr_t dma_addr; + + bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL, + DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN); + if (!bufs[i]) + break; + + descs[i].addr = cpu_to_le64(dma_addr); + descs[i].size = cpu_to_le32(len / dev->ctrl.page_size); + i++; + } + + if (!size || (min && size < min)) { + dev_warn(dev->ctrl.device, + "failed to allocate host memory buffer.\n"); + goto out_free_bufs; + } + + dev_info(dev->ctrl.device, + "allocated %lld MiB host memory buffer.\n", + size >> ilog2(SZ_1M)); + dev->nr_host_mem_descs = i; + dev->host_mem_size = size; + dev->host_mem_descs = descs; + dev->host_mem_desc_bufs = bufs; + return 0; + +out_free_bufs: + while (--i >= 0) { + size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size; + + dma_free_coherent(dev->dev, size, bufs[i], + le64_to_cpu(descs[i].addr)); + } + + kfree(bufs); +out_free_descs: + kfree(descs); +out: + /* try a smaller chunk size if we failed early */ + if (chunk_size >= PAGE_SIZE * 2 && (i == 0 || size < min)) { + chunk_size /= 2; + goto retry; + } + dev->host_mem_descs = NULL; + return -ENOMEM; +} + +static void nvme_setup_host_mem(struct nvme_dev *dev) { - return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); + u64 max = (u64)max_host_mem_size_mb * SZ_1M; + u64 preferred = (u64)dev->ctrl.hmpre * 4096; + u64 min = (u64)dev->ctrl.hmmin * 4096; + u32 enable_bits = NVME_HOST_MEM_ENABLE; + + preferred = min(preferred, max); + if (min > max) { + dev_warn(dev->ctrl.device, + "min host memory (%lld MiB) above limit (%d MiB).\n", + min >> ilog2(SZ_1M), max_host_mem_size_mb); + nvme_free_host_mem(dev); + return; + } + + /* + * If we already have a buffer allocated check if we can reuse it. + */ + if (dev->host_mem_descs) { + if (dev->host_mem_size >= min) + enable_bits |= NVME_HOST_MEM_RETURN; + else + nvme_free_host_mem(dev); + } + + if (!dev->host_mem_descs) { + if (nvme_alloc_host_mem(dev, min, preferred)) + return; + } + + if (nvme_set_host_mem(dev, enable_bits)) + nvme_free_host_mem(dev); } static int nvme_setup_io_queues(struct nvme_dev *dev) { struct nvme_queue *adminq = dev->queues[0]; struct pci_dev *pdev = to_pci_dev(dev->dev); - int result, nr_io_queues, size; + int result, nr_io_queues; + unsigned long size; - nr_io_queues = num_online_cpus(); + nr_io_queues = num_present_cpus(); result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); if (result < 0) return result; @@ -1542,20 +1730,15 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) nvme_release_cmb(dev); } - size = db_bar_size(dev, nr_io_queues); - if (size > 8192) { - iounmap(dev->bar); - do { - dev->bar = ioremap(pci_resource_start(pdev, 0), size); - if (dev->bar) - break; - if (!--nr_io_queues) - return -ENOMEM; - size = db_bar_size(dev, nr_io_queues); - } while (1); - dev->dbs = dev->bar + 4096; - adminq->q_db = dev->dbs; - } + do { + size = db_bar_size(dev, nr_io_queues); + result = nvme_remap_bar(dev, size); + if (!result) + break; + if (!--nr_io_queues) + return -ENOMEM; + } while (1); + adminq->q_db = dev->dbs; /* Deregister the admin queue's interrupt */ pci_free_irq(pdev, 0, adminq); @@ -1586,7 +1769,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) return nvme_create_io_queues(dev); } -static void nvme_del_queue_end(struct request *req, int error) +static void nvme_del_queue_end(struct request *req, blk_status_t error) { struct nvme_queue *nvmeq = req->end_io_data; @@ -1594,7 +1777,7 @@ static void nvme_del_queue_end(struct request *req, int error) complete(&nvmeq->dev->ioq_wait); } -static void nvme_del_cq_end(struct request *req, int error) +static void nvme_del_cq_end(struct request *req, blk_status_t error) { struct nvme_queue *nvmeq = req->end_io_data; @@ -1701,7 +1884,6 @@ static int nvme_dev_add(struct nvme_dev *dev) static int nvme_pci_enable(struct nvme_dev *dev) { - u64 cap; int result = -ENOMEM; struct pci_dev *pdev = to_pci_dev(dev->dev); @@ -1728,10 +1910,11 @@ static int nvme_pci_enable(struct nvme_dev *dev) if (result < 0) return result; - cap = lo_hi_readq(dev->bar + NVME_REG_CAP); + dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP); - dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); - dev->db_stride = 1 << NVME_CAP_STRIDE(cap); + dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1, + io_queue_depth); + dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); dev->dbs = dev->bar + 4096; /* @@ -1743,6 +1926,12 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev_warn(dev->ctrl.device, "detected Apple NVMe controller, " "set queue depth=%u to work around controller resets\n", dev->q_depth); + } else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG && + (pdev->device == 0xa821 || pdev->device == 0xa822) && + NVME_CAP_MQES(dev->ctrl.cap) == 0) { + dev->q_depth = 64; + dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, " + "set queue depth=%u\n", dev->q_depth); } /* @@ -1799,13 +1988,12 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) bool dead = true; struct pci_dev *pdev = to_pci_dev(dev->dev); - del_timer_sync(&dev->watchdog_timer); - mutex_lock(&dev->shutdown_lock); if (pci_is_enabled(pdev)) { u32 csts = readl(dev->bar + NVME_REG_CSTS); - if (dev->ctrl.state == NVME_CTRL_LIVE) + if (dev->ctrl.state == NVME_CTRL_LIVE || + dev->ctrl.state == NVME_CTRL_RESETTING) nvme_start_freeze(&dev->ctrl); dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) || pdev->error_state != pci_channel_io_normal); @@ -1815,12 +2003,24 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) * Give the controller a chance to complete all entered requests if * doing a safe shutdown. */ - if (!dead && shutdown) - nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); + if (!dead) { + if (shutdown) + nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); + + /* + * If the controller is still alive tell it to stop using the + * host memory buffer. In theory the shutdown / reset should + * make sure that it doesn't access the host memoery anymore, + * but I'd rather be safe than sorry.. + */ + if (dev->host_mem_descs) + nvme_set_host_mem(dev, 0); + + } nvme_stop_queues(&dev->ctrl); queues = dev->online_queues - 1; - for (i = dev->queue_count - 1; i > 0; i--) + for (i = dev->ctrl.queue_count - 1; i > 0; i--) nvme_suspend_queue(dev->queues[i]); if (dead) { @@ -1828,7 +2028,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) * probe, before the admin queue is configured. Thus, * queue_count can be 0 here. */ - if (dev->queue_count) + if (dev->ctrl.queue_count) nvme_suspend_queue(dev->queues[0]); } else { nvme_disable_io_queues(dev, queues); @@ -1899,7 +2099,8 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) static void nvme_reset_work(struct work_struct *work) { - struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); + struct nvme_dev *dev = + container_of(work, struct nvme_dev, ctrl.reset_work); bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); int result = -ENODEV; @@ -1917,7 +2118,7 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; - result = nvme_configure_admin_queue(dev); + result = nvme_pci_configure_admin_queue(dev); if (result) goto out; @@ -1948,22 +2149,14 @@ static void nvme_reset_work(struct work_struct *work) "unable to allocate dma for dbbuf\n"); } + if (dev->ctrl.hmpre) + nvme_setup_host_mem(dev); + result = nvme_setup_io_queues(dev); if (result) goto out; /* - * A controller that can not execute IO typically requires user - * intervention to correct. For such degraded controllers, the driver - * should not submit commands the user did not request, so skip - * registering for asynchronous event notification on this condition. - */ - if (dev->online_queues > 1) - nvme_queue_async_events(&dev->ctrl); - - mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ)); - - /* * Keep the controller around but remove all namespaces if we don't have * any working I/O queue. */ @@ -1983,8 +2176,7 @@ static void nvme_reset_work(struct work_struct *work) goto out; } - if (dev->online_queues > 1) - nvme_queue_scan(&dev->ctrl); + nvme_start_ctrl(&dev->ctrl); return; out: @@ -2002,17 +2194,6 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work) nvme_put_ctrl(&dev->ctrl); } -static int nvme_reset(struct nvme_dev *dev) -{ - if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q)) - return -ENODEV; - if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) - return -EBUSY; - if (!queue_work(nvme_workq, &dev->reset_work)) - return -EBUSY; - return 0; -} - static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) { *val = readl(to_nvme_dev(ctrl)->bar + off); @@ -2031,16 +2212,6 @@ static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) return 0; } -static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl) -{ - struct nvme_dev *dev = to_nvme_dev(ctrl); - int ret = nvme_reset(dev); - - if (!ret) - flush_work(&dev->reset_work); - return ret; -} - static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .name = "pcie", .module = THIS_MODULE, @@ -2048,7 +2219,6 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .reg_read32 = nvme_pci_reg_read32, .reg_write32 = nvme_pci_reg_write32, .reg_read64 = nvme_pci_reg_read64, - .reset_ctrl = nvme_pci_reset_ctrl, .free_ctrl = nvme_pci_free_ctrl, .submit_async_event = nvme_pci_submit_async_event, }; @@ -2060,8 +2230,7 @@ static int nvme_dev_map(struct nvme_dev *dev) if (pci_request_mem_regions(pdev, "nvme")) return -ENODEV; - dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); - if (!dev->bar) + if (nvme_remap_bar(dev, NVME_REG_DBS + 4096)) goto release; return 0; @@ -2115,10 +2284,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto free; - INIT_WORK(&dev->reset_work, nvme_reset_work); + INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); - setup_timer(&dev->watchdog_timer, nvme_watchdog_timer, - (unsigned long)dev); mutex_init(&dev->shutdown_lock); init_completion(&dev->ioq_wait); @@ -2136,7 +2303,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING); dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); - queue_work(nvme_workq, &dev->reset_work); + queue_work(nvme_wq, &dev->ctrl.reset_work); return 0; release_pools: @@ -2150,14 +2317,16 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) return result; } -static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) +static void nvme_reset_prepare(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); + nvme_dev_disable(dev, false); +} - if (prepare) - nvme_dev_disable(dev, false); - else - nvme_reset(dev); +static void nvme_reset_done(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + nvme_reset_ctrl(&dev->ctrl); } static void nvme_shutdown(struct pci_dev *pdev) @@ -2177,7 +2346,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); - cancel_work_sync(&dev->reset_work); + cancel_work_sync(&dev->ctrl.reset_work); pci_set_drvdata(pdev, NULL); if (!pci_device_is_present(pdev)) { @@ -2185,11 +2354,14 @@ static void nvme_remove(struct pci_dev *pdev) nvme_dev_disable(dev, false); } - flush_work(&dev->reset_work); - nvme_uninit_ctrl(&dev->ctrl); + flush_work(&dev->ctrl.reset_work); + nvme_stop_ctrl(&dev->ctrl); + nvme_remove_namespaces(&dev->ctrl); nvme_dev_disable(dev, true); + nvme_free_host_mem(dev); nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); + nvme_uninit_ctrl(&dev->ctrl); nvme_release_prp_pools(dev); nvme_dev_unmap(dev); nvme_put_ctrl(&dev->ctrl); @@ -2228,7 +2400,7 @@ static int nvme_resume(struct device *dev) struct pci_dev *pdev = to_pci_dev(dev); struct nvme_dev *ndev = pci_get_drvdata(pdev); - nvme_reset(ndev); + nvme_reset_ctrl(&ndev->ctrl); return 0; } #endif @@ -2267,7 +2439,7 @@ static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev) dev_info(dev->ctrl.device, "restart after slot reset\n"); pci_restore_state(pdev); - nvme_reset(dev); + nvme_reset_ctrl(&dev->ctrl); return PCI_ERS_RESULT_RECOVERED; } @@ -2280,7 +2452,8 @@ static const struct pci_error_handlers nvme_err_handler = { .error_detected = nvme_error_detected, .slot_reset = nvme_slot_reset, .resume = nvme_error_resume, - .reset_notify = nvme_reset_notify, + .reset_prepare = nvme_reset_prepare, + .reset_done = nvme_reset_done, }; static const struct pci_device_id nvme_id_table[] = { @@ -2301,6 +2474,10 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE(0x1c5f, 0x0540), /* Memblaze Pblaze4 adapter */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x144d, 0xa821), /* Samsung PM1725 */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x144d, 0xa822), /* Samsung PM1725a */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, @@ -2323,22 +2500,12 @@ static struct pci_driver nvme_driver = { static int __init nvme_init(void) { - int result; - - nvme_workq = alloc_workqueue("nvme", WQ_UNBOUND | WQ_MEM_RECLAIM, 0); - if (!nvme_workq) - return -ENOMEM; - - result = pci_register_driver(&nvme_driver); - if (result) - destroy_workqueue(nvme_workq); - return result; + return pci_register_driver(&nvme_driver); } static void __exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); - destroy_workqueue(nvme_workq); _nvme_check_size(); } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 24397d306d53..da04df1af231 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -48,7 +48,7 @@ */ #define NVME_RDMA_NR_AEN_COMMANDS 1 #define NVME_RDMA_AQ_BLKMQ_DEPTH \ - (NVMF_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS) + (NVME_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS) struct nvme_rdma_device { struct ib_device *dev; @@ -80,15 +80,13 @@ struct nvme_rdma_request { }; enum nvme_rdma_queue_flags { - NVME_RDMA_Q_CONNECTED = (1 << 0), - NVME_RDMA_IB_QUEUE_ALLOCATED = (1 << 1), - NVME_RDMA_Q_DELETING = (1 << 2), - NVME_RDMA_Q_LIVE = (1 << 3), + NVME_RDMA_Q_LIVE = 0, + NVME_RDMA_Q_DELETING = 1, }; struct nvme_rdma_queue { struct nvme_rdma_qe *rsp_ring; - u8 sig_count; + atomic_t sig_count; int queue_size; size_t cmnd_capsule_len; struct nvme_rdma_ctrl *ctrl; @@ -103,17 +101,12 @@ struct nvme_rdma_queue { }; struct nvme_rdma_ctrl { - /* read and written in the hot path */ - spinlock_t lock; - /* read only in the hot path */ struct nvme_rdma_queue *queues; - u32 queue_count; /* other member variables */ struct blk_mq_tag_set tag_set; struct work_struct delete_work; - struct work_struct reset_work; struct work_struct err_work; struct nvme_rdma_qe async_event_sqe; @@ -125,7 +118,6 @@ struct nvme_rdma_ctrl { struct blk_mq_tag_set admin_tag_set; struct nvme_rdma_device *device; - u64 cap; u32 max_fr_pages; struct sockaddr_storage addr; @@ -145,8 +137,6 @@ static DEFINE_MUTEX(device_list_mutex); static LIST_HEAD(nvme_rdma_ctrl_list); static DEFINE_MUTEX(nvme_rdma_ctrl_mutex); -static struct workqueue_struct *nvme_rdma_wq; - /* * Disabling this option makes small I/O goes faster, but is fundamentally * unsafe. With it turned off we will have to register a global rkey that @@ -282,9 +272,6 @@ static int nvme_rdma_reinit_request(void *data, struct request *rq) struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); int ret = 0; - if (!req->mr->need_inval) - goto out; - ib_dereg_mr(req->mr); req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG, @@ -301,10 +288,12 @@ out: return ret; } -static void __nvme_rdma_exit_request(struct nvme_rdma_ctrl *ctrl, - struct request *rq, unsigned int queue_idx) +static void nvme_rdma_exit_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx) { + struct nvme_rdma_ctrl *ctrl = set->driver_data; struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; struct nvme_rdma_device *dev = queue->device; @@ -315,22 +304,13 @@ static void __nvme_rdma_exit_request(struct nvme_rdma_ctrl *ctrl, DMA_TO_DEVICE); } -static void nvme_rdma_exit_request(struct blk_mq_tag_set *set, - struct request *rq, unsigned int hctx_idx) -{ - return __nvme_rdma_exit_request(set->driver_data, rq, hctx_idx + 1); -} - -static void nvme_rdma_exit_admin_request(struct blk_mq_tag_set *set, - struct request *rq, unsigned int hctx_idx) -{ - return __nvme_rdma_exit_request(set->driver_data, rq, 0); -} - -static int __nvme_rdma_init_request(struct nvme_rdma_ctrl *ctrl, - struct request *rq, unsigned int queue_idx) +static int nvme_rdma_init_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx, + unsigned int numa_node) { + struct nvme_rdma_ctrl *ctrl = set->driver_data; struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; struct nvme_rdma_device *dev = queue->device; struct ib_device *ibdev = dev->dev; @@ -358,27 +338,13 @@ out_free_qe: return -ENOMEM; } -static int nvme_rdma_init_request(struct blk_mq_tag_set *set, - struct request *rq, unsigned int hctx_idx, - unsigned int numa_node) -{ - return __nvme_rdma_init_request(set->driver_data, rq, hctx_idx + 1); -} - -static int nvme_rdma_init_admin_request(struct blk_mq_tag_set *set, - struct request *rq, unsigned int hctx_idx, - unsigned int numa_node) -{ - return __nvme_rdma_init_request(set->driver_data, rq, 0); -} - static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { struct nvme_rdma_ctrl *ctrl = data; struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1]; - BUG_ON(hctx_idx >= ctrl->queue_count); + BUG_ON(hctx_idx >= ctrl->ctrl.queue_count); hctx->driver_data = queue; return 0; @@ -469,9 +435,6 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) struct nvme_rdma_device *dev; struct ib_device *ibdev; - if (!test_and_clear_bit(NVME_RDMA_IB_QUEUE_ALLOCATED, &queue->flags)) - return; - dev = queue->device; ibdev = dev->dev; rdma_destroy_qp(queue->cm_id); @@ -483,17 +446,21 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) nvme_rdma_dev_put(dev); } -static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue, - struct nvme_rdma_device *dev) +static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) { - struct ib_device *ibdev = dev->dev; + struct ib_device *ibdev; const int send_wr_factor = 3; /* MR, SEND, INV */ const int cq_factor = send_wr_factor + 1; /* + RECV */ int comp_vector, idx = nvme_rdma_queue_idx(queue); - int ret; - queue->device = dev; + queue->device = nvme_rdma_find_get_device(queue->cm_id); + if (!queue->device) { + dev_err(queue->cm_id->device->dev.parent, + "no client data found!\n"); + return -ECONNREFUSED; + } + ibdev = queue->device->dev; /* * The admin queue is barely used once the controller is live, so don't @@ -506,12 +473,12 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue, /* +1 for ib_stop_cq */ - queue->ib_cq = ib_alloc_cq(dev->dev, queue, - cq_factor * queue->queue_size + 1, comp_vector, - IB_POLL_SOFTIRQ); + queue->ib_cq = ib_alloc_cq(ibdev, queue, + cq_factor * queue->queue_size + 1, + comp_vector, IB_POLL_SOFTIRQ); if (IS_ERR(queue->ib_cq)) { ret = PTR_ERR(queue->ib_cq); - goto out; + goto out_put_dev; } ret = nvme_rdma_create_qp(queue, send_wr_factor); @@ -524,7 +491,6 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue, ret = -ENOMEM; goto out_destroy_qp; } - set_bit(NVME_RDMA_IB_QUEUE_ALLOCATED, &queue->flags); return 0; @@ -532,7 +498,8 @@ out_destroy_qp: ib_destroy_qp(queue->qp); out_destroy_ib_cq: ib_free_cq(queue->ib_cq); -out: +out_put_dev: + nvme_rdma_dev_put(queue->device); return ret; } @@ -553,6 +520,7 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl, queue->cmnd_capsule_len = sizeof(struct nvme_command); queue->queue_size = queue_size; + atomic_set(&queue->sig_count, 0); queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue, RDMA_PS_TCP, IB_QPT_RC); @@ -583,12 +551,10 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl, } clear_bit(NVME_RDMA_Q_DELETING, &queue->flags); - set_bit(NVME_RDMA_Q_CONNECTED, &queue->flags); return 0; out_destroy_cm_id: - nvme_rdma_destroy_queue_ib(queue); rdma_destroy_id(queue->cm_id); return ret; } @@ -617,7 +583,7 @@ static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl) { int i; - for (i = 1; i < ctrl->queue_count; i++) + for (i = 1; i < ctrl->ctrl.queue_count; i++) nvme_rdma_stop_and_free_queue(&ctrl->queues[i]); } @@ -625,7 +591,7 @@ static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl) { int i, ret = 0; - for (i = 1; i < ctrl->queue_count; i++) { + for (i = 1; i < ctrl->ctrl.queue_count; i++) { ret = nvmf_connect_io_queue(&ctrl->ctrl, i); if (ret) { dev_info(ctrl->ctrl.device, @@ -653,14 +619,14 @@ static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl) if (ret) return ret; - ctrl->queue_count = nr_io_queues + 1; - if (ctrl->queue_count < 2) + ctrl->ctrl.queue_count = nr_io_queues + 1; + if (ctrl->ctrl.queue_count < 2) return 0; dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n", nr_io_queues); - for (i = 1; i < ctrl->queue_count; i++) { + for (i = 1; i < ctrl->ctrl.queue_count; i++) { ret = nvme_rdma_init_queue(ctrl, i, ctrl->ctrl.opts->queue_size); if (ret) { @@ -718,11 +684,11 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) if (nvmf_should_reconnect(&ctrl->ctrl)) { dev_info(ctrl->ctrl.device, "Reconnecting in %d seconds...\n", ctrl->ctrl.opts->reconnect_delay); - queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work, + queue_delayed_work(nvme_wq, &ctrl->reconnect_work, ctrl->ctrl.opts->reconnect_delay * HZ); } else { dev_info(ctrl->ctrl.device, "Removing controller...\n"); - queue_work(nvme_rdma_wq, &ctrl->delete_work); + queue_work(nvme_wq, &ctrl->delete_work); } } @@ -733,9 +699,9 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) bool changed; int ret; - ++ctrl->ctrl.opts->nr_reconnects; + ++ctrl->ctrl.nr_reconnects; - if (ctrl->queue_count > 1) { + if (ctrl->ctrl.queue_count > 1) { nvme_rdma_free_io_queues(ctrl); ret = blk_mq_reinit_tagset(&ctrl->tag_set); @@ -749,7 +715,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) if (ret) goto requeue; - ret = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH); + ret = nvme_rdma_init_queue(ctrl, 0, NVME_AQ_DEPTH); if (ret) goto requeue; @@ -759,13 +725,11 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags); - ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); + ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); if (ret) goto requeue; - nvme_start_keep_alive(&ctrl->ctrl); - - if (ctrl->queue_count > 1) { + if (ctrl->ctrl.queue_count > 1) { ret = nvme_rdma_init_io_queues(ctrl); if (ret) goto requeue; @@ -773,16 +737,16 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) ret = nvme_rdma_connect_io_queues(ctrl); if (ret) goto requeue; + + blk_mq_update_nr_hw_queues(&ctrl->tag_set, + ctrl->ctrl.queue_count - 1); } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); - ctrl->ctrl.opts->nr_reconnects = 0; + ctrl->ctrl.nr_reconnects = 0; - if (ctrl->queue_count > 1) { - nvme_queue_scan(&ctrl->ctrl); - nvme_queue_async_events(&ctrl->ctrl); - } + nvme_start_ctrl(&ctrl->ctrl); dev_info(ctrl->ctrl.device, "Successfully reconnected\n"); @@ -790,7 +754,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) requeue: dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", - ctrl->ctrl.opts->nr_reconnects); + ctrl->ctrl.nr_reconnects); nvme_rdma_reconnect_or_remove(ctrl); } @@ -800,19 +764,17 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) struct nvme_rdma_ctrl, err_work); int i; - nvme_stop_keep_alive(&ctrl->ctrl); + nvme_stop_ctrl(&ctrl->ctrl); - for (i = 0; i < ctrl->queue_count; i++) { - clear_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[i].flags); + for (i = 0; i < ctrl->ctrl.queue_count; i++) clear_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags); - } - if (ctrl->queue_count > 1) + if (ctrl->ctrl.queue_count > 1) nvme_stop_queues(&ctrl->ctrl); - blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); /* We must take care of fastfail/requeue all our inflight requests */ - if (ctrl->queue_count > 1) + if (ctrl->ctrl.queue_count > 1) blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, @@ -822,7 +784,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) * queues are not a live anymore, so restart the queues to fail fast * new IO */ - blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_start_queues(&ctrl->ctrl); nvme_rdma_reconnect_or_remove(ctrl); @@ -833,7 +795,7 @@ static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl) if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) return; - queue_work(nvme_rdma_wq, &ctrl->err_work); + queue_work(nvme_wq, &ctrl->err_work); } static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc, @@ -1040,17 +1002,16 @@ static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) nvme_rdma_wr_error(cq, wc, "SEND"); } -static inline int nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue) +/* + * We want to signal completion at least every queue depth/2. This returns the + * largest power of two that is not above half of (queue size + 1) to optimize + * (avoid divisions). + */ +static inline bool nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue) { - int sig_limit; + int limit = 1 << ilog2((queue->queue_size + 1) / 2); - /* - * We signal completion every queue depth/2 and also handle the - * degenerated case of a device with queue_depth=1, where we - * would need to signal every message. - */ - sig_limit = max(queue->queue_size / 2, 1); - return (++queue->sig_count % sig_limit) == 0; + return (atomic_inc_return(&queue->sig_count) & (limit - 1)) == 0; } static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, @@ -1278,21 +1239,11 @@ static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue, static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue) { - struct nvme_rdma_device *dev; int ret; - dev = nvme_rdma_find_get_device(queue->cm_id); - if (!dev) { - dev_err(queue->cm_id->device->dev.parent, - "no client data found!\n"); - return -ECONNREFUSED; - } - - ret = nvme_rdma_create_queue_ib(queue, dev); - if (ret) { - nvme_rdma_dev_put(dev); - goto out; - } + ret = nvme_rdma_create_queue_ib(queue); + if (ret) + return ret; ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS); if (ret) { @@ -1306,7 +1257,6 @@ static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue) out_destroy_queue: nvme_rdma_destroy_queue_ib(queue); -out: return ret; } @@ -1334,8 +1284,8 @@ static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue) * specified by the Fabrics standard. */ if (priv.qid == 0) { - priv.hrqsize = cpu_to_le16(NVMF_AQ_DEPTH); - priv.hsqsize = cpu_to_le16(NVMF_AQ_DEPTH - 1); + priv.hrqsize = cpu_to_le16(NVME_AQ_DEPTH); + priv.hsqsize = cpu_to_le16(NVME_AQ_DEPTH - 1); } else { /* * current interpretation of the fabrics spec @@ -1383,12 +1333,14 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, complete(&queue->cm_done); return 0; case RDMA_CM_EVENT_REJECTED: + nvme_rdma_destroy_queue_ib(queue); cm_error = nvme_rdma_conn_rejected(queue, ev); break; - case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_ROUTE_ERROR: case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: + nvme_rdma_destroy_queue_ib(queue); + case RDMA_CM_EVENT_ADDR_ERROR: dev_dbg(queue->ctrl->ctrl.device, "CM error event %d\n", ev->event); cm_error = -ECONNRESET; @@ -1435,8 +1387,8 @@ nvme_rdma_timeout(struct request *rq, bool reserved) /* * We cannot accept any other command until the Connect command has completed. */ -static inline int nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue, - struct request *rq) +static inline blk_status_t +nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue, struct request *rq) { if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) { struct nvme_command *cmd = nvme_req(rq)->cmd; @@ -1452,16 +1404,15 @@ static inline int nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue, * failover. */ if (queue->ctrl->ctrl.state == NVME_CTRL_RECONNECTING) - return -EIO; - else - return -EAGAIN; + return BLK_STS_IOERR; + return BLK_STS_RESOURCE; /* try again later */ } } return 0; } -static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct nvme_ns *ns = hctx->queue->queuedata; @@ -1472,28 +1423,29 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_command *c = sqe->data; bool flush = false; struct ib_device *dev; - int ret; + blk_status_t ret; + int err; WARN_ON_ONCE(rq->tag < 0); ret = nvme_rdma_queue_is_ready(queue, rq); if (unlikely(ret)) - goto err; + return ret; dev = queue->device->dev; ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(struct nvme_command), DMA_TO_DEVICE); ret = nvme_setup_cmd(ns, rq, c); - if (ret != BLK_MQ_RQ_QUEUE_OK) + if (ret) return ret; blk_mq_start_request(rq); - ret = nvme_rdma_map_data(queue, rq, c); - if (ret < 0) { + err = nvme_rdma_map_data(queue, rq, c); + if (err < 0) { dev_err(queue->ctrl->ctrl.device, - "Failed to map data (%d)\n", ret); + "Failed to map data (%d)\n", err); nvme_cleanup_cmd(rq); goto err; } @@ -1503,17 +1455,18 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, if (req_op(rq) == REQ_OP_FLUSH) flush = true; - ret = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, + err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge, req->mr->need_inval ? &req->reg_wr.wr : NULL, flush); - if (ret) { + if (err) { nvme_rdma_unmap_data(queue, rq); goto err; } - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; err: - return (ret == -ENOMEM || ret == -EAGAIN) ? - BLK_MQ_RQ_QUEUE_BUSY : BLK_MQ_RQ_QUEUE_ERROR; + if (err == -ENOMEM || err == -EAGAIN) + return BLK_STS_RESOURCE; + return BLK_STS_IOERR; } static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) @@ -1523,7 +1476,6 @@ static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) struct ib_wc wc; int found = 0; - ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); while (ib_poll_cq(cq, 1, &wc) > 0) { struct ib_cqe *cqe = wc.wr_cqe; @@ -1560,8 +1512,8 @@ static const struct blk_mq_ops nvme_rdma_mq_ops = { static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { .queue_rq = nvme_rdma_queue_rq, .complete = nvme_rdma_complete_rq, - .init_request = nvme_rdma_init_admin_request, - .exit_request = nvme_rdma_exit_admin_request, + .init_request = nvme_rdma_init_request, + .exit_request = nvme_rdma_exit_request, .reinit_request = nvme_rdma_reinit_request, .init_hctx = nvme_rdma_init_admin_hctx, .timeout = nvme_rdma_timeout, @@ -1571,7 +1523,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) { int error; - error = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH); + error = nvme_rdma_init_queue(ctrl, 0, NVME_AQ_DEPTH); if (error) return error; @@ -1615,7 +1567,8 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags); - error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap); + error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, + &ctrl->ctrl.cap); if (error) { dev_err(ctrl->ctrl.device, "prop_get NVME_REG_CAP failed\n"); @@ -1623,9 +1576,9 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) } ctrl->ctrl.sqsize = - min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->ctrl.sqsize); + min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); - error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); + error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); if (error) goto out_cleanup_queue; @@ -1642,8 +1595,6 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) if (error) goto out_cleanup_queue; - nvme_start_keep_alive(&ctrl->ctrl); - return 0; out_cleanup_queue: @@ -1661,32 +1612,34 @@ out_free_queue: static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl) { - nvme_stop_keep_alive(&ctrl->ctrl); cancel_work_sync(&ctrl->err_work); cancel_delayed_work_sync(&ctrl->reconnect_work); - if (ctrl->queue_count > 1) { + if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); nvme_rdma_free_io_queues(ctrl); } - if (test_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[0].flags)) + if (test_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags)) nvme_shutdown_ctrl(&ctrl->ctrl); - blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, &ctrl->ctrl); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_rdma_destroy_admin_queue(ctrl); } static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) { - nvme_uninit_ctrl(&ctrl->ctrl); + nvme_stop_ctrl(&ctrl->ctrl); + nvme_remove_namespaces(&ctrl->ctrl); if (shutdown) nvme_rdma_shutdown_ctrl(ctrl); + nvme_uninit_ctrl(&ctrl->ctrl); if (ctrl->ctrl.tagset) { blk_cleanup_queue(ctrl->ctrl.connect_q); blk_mq_free_tag_set(&ctrl->tag_set); @@ -1709,7 +1662,7 @@ static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl) if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) return -EBUSY; - if (!queue_work(nvme_rdma_wq, &ctrl->delete_work)) + if (!queue_work(nvme_wq, &ctrl->delete_work)) return -EBUSY; return 0; @@ -1743,11 +1696,12 @@ static void nvme_rdma_remove_ctrl_work(struct work_struct *work) static void nvme_rdma_reset_ctrl_work(struct work_struct *work) { - struct nvme_rdma_ctrl *ctrl = container_of(work, - struct nvme_rdma_ctrl, reset_work); + struct nvme_rdma_ctrl *ctrl = + container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work); int ret; bool changed; + nvme_stop_ctrl(&ctrl->ctrl); nvme_rdma_shutdown_ctrl(ctrl); ret = nvme_rdma_configure_admin_queue(ctrl); @@ -1757,7 +1711,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) goto del_dead_ctrl; } - if (ctrl->queue_count > 1) { + if (ctrl->ctrl.queue_count > 1) { ret = blk_mq_reinit_tagset(&ctrl->tag_set); if (ret) goto del_dead_ctrl; @@ -1769,38 +1723,22 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) ret = nvme_rdma_connect_io_queues(ctrl); if (ret) goto del_dead_ctrl; + + blk_mq_update_nr_hw_queues(&ctrl->tag_set, + ctrl->ctrl.queue_count - 1); } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); - if (ctrl->queue_count > 1) { - nvme_start_queues(&ctrl->ctrl); - nvme_queue_scan(&ctrl->ctrl); - nvme_queue_async_events(&ctrl->ctrl); - } + nvme_start_ctrl(&ctrl->ctrl); return; del_dead_ctrl: /* Deleting this dead controller... */ dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); - WARN_ON(!queue_work(nvme_rdma_wq, &ctrl->delete_work)); -} - -static int nvme_rdma_reset_ctrl(struct nvme_ctrl *nctrl) -{ - struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); - - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) - return -EBUSY; - - if (!queue_work(nvme_rdma_wq, &ctrl->reset_work)) - return -EBUSY; - - flush_work(&ctrl->reset_work); - - return 0; + WARN_ON(!queue_work(nvme_wq, &ctrl->delete_work)); } static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { @@ -1810,11 +1748,9 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, - .reset_ctrl = nvme_rdma_reset_ctrl, .free_ctrl = nvme_rdma_free_ctrl, .submit_async_event = nvme_rdma_submit_async_event, .delete_ctrl = nvme_rdma_del_ctrl, - .get_subsysnqn = nvmf_get_subsysnqn, .get_address = nvmf_get_address, }; @@ -1843,7 +1779,7 @@ static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl) ctrl->tag_set.cmd_size = sizeof(struct nvme_rdma_request) + SG_CHUNK_SIZE * sizeof(struct scatterlist); ctrl->tag_set.driver_data = ctrl; - ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1; + ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1; ctrl->tag_set.timeout = NVME_IO_TIMEOUT; ret = blk_mq_alloc_tag_set(&ctrl->tag_set); @@ -1919,15 +1855,14 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, nvme_rdma_reconnect_ctrl_work); INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work); - INIT_WORK(&ctrl->reset_work, nvme_rdma_reset_ctrl_work); - spin_lock_init(&ctrl->lock); + INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); - ctrl->queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ + ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; ret = -ENOMEM; - ctrl->queues = kcalloc(ctrl->queue_count, sizeof(*ctrl->queues), + ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues), GFP_KERNEL); if (!ctrl->queues) goto out_uninit_ctrl; @@ -1939,12 +1874,14 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, /* sanity check icdoff */ if (ctrl->ctrl.icdoff) { dev_err(ctrl->ctrl.device, "icdoff is not supported!\n"); + ret = -EINVAL; goto out_remove_admin_queue; } /* sanity check keyed sgls */ if (!(ctrl->ctrl.sgls & (1 << 20))) { dev_err(ctrl->ctrl.device, "Mandatory keyed sgls are not support\n"); + ret = -EINVAL; goto out_remove_admin_queue; } @@ -1982,15 +1919,11 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); mutex_unlock(&nvme_rdma_ctrl_mutex); - if (opts->nr_io_queues) { - nvme_queue_scan(&ctrl->ctrl); - nvme_queue_async_events(&ctrl->ctrl); - } + nvme_start_ctrl(&ctrl->ctrl); return &ctrl->ctrl; out_remove_admin_queue: - nvme_stop_keep_alive(&ctrl->ctrl); nvme_rdma_destroy_admin_queue(ctrl); out_kfree_queues: kfree(ctrl->queues); @@ -2033,7 +1966,7 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) } mutex_unlock(&nvme_rdma_ctrl_mutex); - flush_workqueue(nvme_rdma_wq); + flush_workqueue(nvme_wq); } static struct ib_client nvme_rdma_ib_client = { @@ -2046,13 +1979,9 @@ static int __init nvme_rdma_init_module(void) { int ret; - nvme_rdma_wq = create_workqueue("nvme_rdma_wq"); - if (!nvme_rdma_wq) - return -ENOMEM; - ret = ib_register_client(&nvme_rdma_ib_client); if (ret) - goto err_destroy_wq; + return ret; ret = nvmf_register_transport(&nvme_rdma_transport); if (ret) @@ -2062,8 +1991,6 @@ static int __init nvme_rdma_init_module(void) err_unreg_client: ib_unregister_client(&nvme_rdma_ib_client); -err_destroy_wq: - destroy_workqueue(nvme_rdma_wq); return ret; } @@ -2071,7 +1998,6 @@ static void __exit nvme_rdma_cleanup_module(void) { nvmf_unregister_transport(&nvme_rdma_transport); ib_unregister_client(&nvme_rdma_ib_client); - destroy_workqueue(nvme_rdma_wq); } module_init(nvme_rdma_init_module); diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c deleted file mode 100644 index 1f7671e631dd..000000000000 --- a/drivers/nvme/host/scsi.c +++ /dev/null @@ -1,2460 +0,0 @@ -/* - * NVM Express device driver - * Copyright (c) 2011-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - */ - -/* - * Refer to the SCSI-NVMe Translation spec for details on how - * each command is translated. - */ - -#include <linux/bio.h> -#include <linux/bitops.h> -#include <linux/blkdev.h> -#include <linux/compat.h> -#include <linux/delay.h> -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/genhd.h> -#include <linux/idr.h> -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/io.h> -#include <linux/kdev_t.h> -#include <linux/kthread.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/pci.h> -#include <linux/poison.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/types.h> -#include <asm/unaligned.h> -#include <scsi/sg.h> -#include <scsi/scsi.h> -#include <scsi/scsi_request.h> - -#include "nvme.h" - -static int sg_version_num = 30534; /* 2 digits for each component */ - -/* VPD Page Codes */ -#define VPD_SUPPORTED_PAGES 0x00 -#define VPD_SERIAL_NUMBER 0x80 -#define VPD_DEVICE_IDENTIFIERS 0x83 -#define VPD_EXTENDED_INQUIRY 0x86 -#define VPD_BLOCK_LIMITS 0xB0 -#define VPD_BLOCK_DEV_CHARACTERISTICS 0xB1 - -/* format unit paramter list offsets */ -#define FORMAT_UNIT_SHORT_PARM_LIST_LEN 4 -#define FORMAT_UNIT_LONG_PARM_LIST_LEN 8 -#define FORMAT_UNIT_PROT_INT_OFFSET 3 -#define FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET 0 -#define FORMAT_UNIT_PROT_FIELD_USAGE_MASK 0x07 - -/* Misc. defines */ -#define FIXED_SENSE_DATA 0x70 -#define DESC_FORMAT_SENSE_DATA 0x72 -#define FIXED_SENSE_DATA_ADD_LENGTH 10 -#define LUN_ENTRY_SIZE 8 -#define LUN_DATA_HEADER_SIZE 8 -#define ALL_LUNS_RETURNED 0x02 -#define ALL_WELL_KNOWN_LUNS_RETURNED 0x01 -#define RESTRICTED_LUNS_RETURNED 0x00 -#define DOWNLOAD_SAVE_ACTIVATE 0x05 -#define DOWNLOAD_SAVE_DEFER_ACTIVATE 0x0E -#define ACTIVATE_DEFERRED_MICROCODE 0x0F -#define FORMAT_UNIT_IMMED_MASK 0x2 -#define FORMAT_UNIT_IMMED_OFFSET 1 -#define KELVIN_TEMP_FACTOR 273 -#define FIXED_FMT_SENSE_DATA_SIZE 18 -#define DESC_FMT_SENSE_DATA_SIZE 8 - -/* SCSI/NVMe defines and bit masks */ -#define INQ_STANDARD_INQUIRY_PAGE 0x00 -#define INQ_SUPPORTED_VPD_PAGES_PAGE 0x00 -#define INQ_UNIT_SERIAL_NUMBER_PAGE 0x80 -#define INQ_DEVICE_IDENTIFICATION_PAGE 0x83 -#define INQ_EXTENDED_INQUIRY_DATA_PAGE 0x86 -#define INQ_BDEV_LIMITS_PAGE 0xB0 -#define INQ_BDEV_CHARACTERISTICS_PAGE 0xB1 -#define INQ_SERIAL_NUMBER_LENGTH 0x14 -#define INQ_NUM_SUPPORTED_VPD_PAGES 6 -#define VERSION_SPC_4 0x06 -#define ACA_UNSUPPORTED 0 -#define STANDARD_INQUIRY_LENGTH 36 -#define ADDITIONAL_STD_INQ_LENGTH 31 -#define EXTENDED_INQUIRY_DATA_PAGE_LENGTH 0x3C -#define RESERVED_FIELD 0 - -/* Mode Sense/Select defines */ -#define MODE_PAGE_INFO_EXCEP 0x1C -#define MODE_PAGE_CACHING 0x08 -#define MODE_PAGE_CONTROL 0x0A -#define MODE_PAGE_POWER_CONDITION 0x1A -#define MODE_PAGE_RETURN_ALL 0x3F -#define MODE_PAGE_BLK_DES_LEN 0x08 -#define MODE_PAGE_LLBAA_BLK_DES_LEN 0x10 -#define MODE_PAGE_CACHING_LEN 0x14 -#define MODE_PAGE_CONTROL_LEN 0x0C -#define MODE_PAGE_POW_CND_LEN 0x28 -#define MODE_PAGE_INF_EXC_LEN 0x0C -#define MODE_PAGE_ALL_LEN 0x54 -#define MODE_SENSE6_MPH_SIZE 4 -#define MODE_SENSE_PAGE_CONTROL_MASK 0xC0 -#define MODE_SENSE_PAGE_CODE_OFFSET 2 -#define MODE_SENSE_PAGE_CODE_MASK 0x3F -#define MODE_SENSE_LLBAA_MASK 0x10 -#define MODE_SENSE_LLBAA_SHIFT 4 -#define MODE_SENSE_DBD_MASK 8 -#define MODE_SENSE_DBD_SHIFT 3 -#define MODE_SENSE10_MPH_SIZE 8 -#define MODE_SELECT_CDB_PAGE_FORMAT_MASK 0x10 -#define MODE_SELECT_CDB_SAVE_PAGES_MASK 0x1 -#define MODE_SELECT_6_BD_OFFSET 3 -#define MODE_SELECT_10_BD_OFFSET 6 -#define MODE_SELECT_10_LLBAA_OFFSET 4 -#define MODE_SELECT_10_LLBAA_MASK 1 -#define MODE_SELECT_6_MPH_SIZE 4 -#define MODE_SELECT_10_MPH_SIZE 8 -#define CACHING_MODE_PAGE_WCE_MASK 0x04 -#define MODE_SENSE_BLK_DESC_ENABLED 0 -#define MODE_SENSE_BLK_DESC_COUNT 1 -#define MODE_SELECT_PAGE_CODE_MASK 0x3F -#define SHORT_DESC_BLOCK 8 -#define LONG_DESC_BLOCK 16 -#define MODE_PAGE_POW_CND_LEN_FIELD 0x26 -#define MODE_PAGE_INF_EXC_LEN_FIELD 0x0A -#define MODE_PAGE_CACHING_LEN_FIELD 0x12 -#define MODE_PAGE_CONTROL_LEN_FIELD 0x0A -#define MODE_SENSE_PC_CURRENT_VALUES 0 - -/* Log Sense defines */ -#define LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE 0x00 -#define LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH 0x07 -#define LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE 0x2F -#define LOG_PAGE_TEMPERATURE_PAGE 0x0D -#define LOG_SENSE_CDB_SP_NOT_ENABLED 0 -#define LOG_SENSE_CDB_PC_MASK 0xC0 -#define LOG_SENSE_CDB_PC_SHIFT 6 -#define LOG_SENSE_CDB_PC_CUMULATIVE_VALUES 1 -#define LOG_SENSE_CDB_PAGE_CODE_MASK 0x3F -#define REMAINING_INFO_EXCP_PAGE_LENGTH 0x8 -#define LOG_INFO_EXCP_PAGE_LENGTH 0xC -#define REMAINING_TEMP_PAGE_LENGTH 0xC -#define LOG_TEMP_PAGE_LENGTH 0x10 -#define LOG_TEMP_UNKNOWN 0xFF -#define SUPPORTED_LOG_PAGES_PAGE_LENGTH 0x3 - -/* Read Capacity defines */ -#define READ_CAP_10_RESP_SIZE 8 -#define READ_CAP_16_RESP_SIZE 32 - -/* NVMe Namespace and Command Defines */ -#define BYTES_TO_DWORDS 4 -#define NVME_MAX_FIRMWARE_SLOT 7 - -/* Report LUNs defines */ -#define REPORT_LUNS_FIRST_LUN_OFFSET 8 - -/* SCSI ADDITIONAL SENSE Codes */ - -#define SCSI_ASC_NO_SENSE 0x00 -#define SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT 0x03 -#define SCSI_ASC_LUN_NOT_READY 0x04 -#define SCSI_ASC_WARNING 0x0B -#define SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED 0x10 -#define SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED 0x10 -#define SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED 0x10 -#define SCSI_ASC_UNRECOVERED_READ_ERROR 0x11 -#define SCSI_ASC_MISCOMPARE_DURING_VERIFY 0x1D -#define SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID 0x20 -#define SCSI_ASC_ILLEGAL_COMMAND 0x20 -#define SCSI_ASC_ILLEGAL_BLOCK 0x21 -#define SCSI_ASC_INVALID_CDB 0x24 -#define SCSI_ASC_INVALID_LUN 0x25 -#define SCSI_ASC_INVALID_PARAMETER 0x26 -#define SCSI_ASC_FORMAT_COMMAND_FAILED 0x31 -#define SCSI_ASC_INTERNAL_TARGET_FAILURE 0x44 - -/* SCSI ADDITIONAL SENSE Code Qualifiers */ - -#define SCSI_ASCQ_CAUSE_NOT_REPORTABLE 0x00 -#define SCSI_ASCQ_FORMAT_COMMAND_FAILED 0x01 -#define SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED 0x01 -#define SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED 0x02 -#define SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED 0x03 -#define SCSI_ASCQ_FORMAT_IN_PROGRESS 0x04 -#define SCSI_ASCQ_POWER_LOSS_EXPECTED 0x08 -#define SCSI_ASCQ_INVALID_LUN_ID 0x09 - -/* copied from drivers/usb/gadget/function/storage_common.h */ -static inline u32 get_unaligned_be24(u8 *buf) -{ - return 0xffffff & (u32) get_unaligned_be32(buf - 1); -} - -/* Struct to gather data that needs to be extracted from a SCSI CDB. - Not conforming to any particular CDB variant, but compatible with all. */ - -struct nvme_trans_io_cdb { - u8 fua; - u8 prot_info; - u64 lba; - u32 xfer_len; -}; - - -/* Internal Helper Functions */ - - -/* Copy data to userspace memory */ - -static int nvme_trans_copy_to_user(struct sg_io_hdr *hdr, void *from, - unsigned long n) -{ - int i; - void *index = from; - size_t remaining = n; - size_t xfer_len; - - if (hdr->iovec_count > 0) { - struct sg_iovec sgl; - - for (i = 0; i < hdr->iovec_count; i++) { - if (copy_from_user(&sgl, hdr->dxferp + - i * sizeof(struct sg_iovec), - sizeof(struct sg_iovec))) - return -EFAULT; - xfer_len = min(remaining, sgl.iov_len); - if (copy_to_user(sgl.iov_base, index, xfer_len)) - return -EFAULT; - - index += xfer_len; - remaining -= xfer_len; - if (remaining == 0) - break; - } - return 0; - } - - if (copy_to_user(hdr->dxferp, from, n)) - return -EFAULT; - return 0; -} - -/* Copy data from userspace memory */ - -static int nvme_trans_copy_from_user(struct sg_io_hdr *hdr, void *to, - unsigned long n) -{ - int i; - void *index = to; - size_t remaining = n; - size_t xfer_len; - - if (hdr->iovec_count > 0) { - struct sg_iovec sgl; - - for (i = 0; i < hdr->iovec_count; i++) { - if (copy_from_user(&sgl, hdr->dxferp + - i * sizeof(struct sg_iovec), - sizeof(struct sg_iovec))) - return -EFAULT; - xfer_len = min(remaining, sgl.iov_len); - if (copy_from_user(index, sgl.iov_base, xfer_len)) - return -EFAULT; - index += xfer_len; - remaining -= xfer_len; - if (remaining == 0) - break; - } - return 0; - } - - if (copy_from_user(to, hdr->dxferp, n)) - return -EFAULT; - return 0; -} - -/* Status/Sense Buffer Writeback */ - -static int nvme_trans_completion(struct sg_io_hdr *hdr, u8 status, u8 sense_key, - u8 asc, u8 ascq) -{ - u8 xfer_len; - u8 resp[DESC_FMT_SENSE_DATA_SIZE]; - - if (scsi_status_is_good(status)) { - hdr->status = SAM_STAT_GOOD; - hdr->masked_status = GOOD; - hdr->host_status = DID_OK; - hdr->driver_status = DRIVER_OK; - hdr->sb_len_wr = 0; - } else { - hdr->status = status; - hdr->masked_status = status >> 1; - hdr->host_status = DID_OK; - hdr->driver_status = DRIVER_OK; - - memset(resp, 0, DESC_FMT_SENSE_DATA_SIZE); - resp[0] = DESC_FORMAT_SENSE_DATA; - resp[1] = sense_key; - resp[2] = asc; - resp[3] = ascq; - - xfer_len = min_t(u8, hdr->mx_sb_len, DESC_FMT_SENSE_DATA_SIZE); - hdr->sb_len_wr = xfer_len; - if (copy_to_user(hdr->sbp, resp, xfer_len) > 0) - return -EFAULT; - } - - return 0; -} - -/* - * Take a status code from a lowlevel routine, and if it was a positive NVMe - * error code update the sense data based on it. In either case the passed - * in value is returned again, unless an -EFAULT from copy_to_user overrides - * it. - */ -static int nvme_trans_status_code(struct sg_io_hdr *hdr, int nvme_sc) -{ - u8 status, sense_key, asc, ascq; - int res; - - /* For non-nvme (Linux) errors, simply return the error code */ - if (nvme_sc < 0) - return nvme_sc; - - /* Mask DNR, More, and reserved fields */ - switch (nvme_sc & 0x7FF) { - /* Generic Command Status */ - case NVME_SC_SUCCESS: - status = SAM_STAT_GOOD; - sense_key = NO_SENSE; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_INVALID_OPCODE: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_ILLEGAL_COMMAND; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_INVALID_FIELD: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_INVALID_CDB; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_DATA_XFER_ERROR: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_POWER_LOSS: - status = SAM_STAT_TASK_ABORTED; - sense_key = ABORTED_COMMAND; - asc = SCSI_ASC_WARNING; - ascq = SCSI_ASCQ_POWER_LOSS_EXPECTED; - break; - case NVME_SC_INTERNAL: - status = SAM_STAT_CHECK_CONDITION; - sense_key = HARDWARE_ERROR; - asc = SCSI_ASC_INTERNAL_TARGET_FAILURE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_ABORT_REQ: - status = SAM_STAT_TASK_ABORTED; - sense_key = ABORTED_COMMAND; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_ABORT_QUEUE: - status = SAM_STAT_TASK_ABORTED; - sense_key = ABORTED_COMMAND; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_FUSED_FAIL: - status = SAM_STAT_TASK_ABORTED; - sense_key = ABORTED_COMMAND; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_FUSED_MISSING: - status = SAM_STAT_TASK_ABORTED; - sense_key = ABORTED_COMMAND; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_INVALID_NS: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID; - ascq = SCSI_ASCQ_INVALID_LUN_ID; - break; - case NVME_SC_LBA_RANGE: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_ILLEGAL_BLOCK; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_CAP_EXCEEDED: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_NS_NOT_READY: - status = SAM_STAT_CHECK_CONDITION; - sense_key = NOT_READY; - asc = SCSI_ASC_LUN_NOT_READY; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - - /* Command Specific Status */ - case NVME_SC_INVALID_FORMAT: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_FORMAT_COMMAND_FAILED; - ascq = SCSI_ASCQ_FORMAT_COMMAND_FAILED; - break; - case NVME_SC_BAD_ATTRIBUTES: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_INVALID_CDB; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - - /* Media Errors */ - case NVME_SC_WRITE_FAULT: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_READ_ERROR: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_UNRECOVERED_READ_ERROR; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_GUARD_CHECK: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED; - ascq = SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED; - break; - case NVME_SC_APPTAG_CHECK: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED; - ascq = SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED; - break; - case NVME_SC_REFTAG_CHECK: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED; - ascq = SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED; - break; - case NVME_SC_COMPARE_FAILED: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MISCOMPARE; - asc = SCSI_ASC_MISCOMPARE_DURING_VERIFY; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_ACCESS_DENIED: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID; - ascq = SCSI_ASCQ_INVALID_LUN_ID; - break; - - /* Unspecified/Default */ - case NVME_SC_CMDID_CONFLICT: - case NVME_SC_CMD_SEQ_ERROR: - case NVME_SC_CQ_INVALID: - case NVME_SC_QID_INVALID: - case NVME_SC_QUEUE_SIZE: - case NVME_SC_ABORT_LIMIT: - case NVME_SC_ABORT_MISSING: - case NVME_SC_ASYNC_LIMIT: - case NVME_SC_FIRMWARE_SLOT: - case NVME_SC_FIRMWARE_IMAGE: - case NVME_SC_INVALID_VECTOR: - case NVME_SC_INVALID_LOG_PAGE: - default: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - } - - res = nvme_trans_completion(hdr, status, sense_key, asc, ascq); - return res ? res : nvme_sc; -} - -/* INQUIRY Helper Functions */ - -static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *inq_response, - int alloc_len) -{ - struct nvme_ctrl *ctrl = ns->ctrl; - struct nvme_id_ns *id_ns; - int res; - int nvme_sc; - int xfer_len; - u8 resp_data_format = 0x02; - u8 protect; - u8 cmdque = 0x01 << 1; - u8 fw_offset = sizeof(ctrl->firmware_rev); - - /* nvme ns identify - use DPS value for PROTECT field */ - nvme_sc = nvme_identify_ns(ctrl, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - if (id_ns->dps) - protect = 0x01; - else - protect = 0; - kfree(id_ns); - - memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); - inq_response[2] = VERSION_SPC_4; - inq_response[3] = resp_data_format; /*normaca=0 | hisup=0 */ - inq_response[4] = ADDITIONAL_STD_INQ_LENGTH; - inq_response[5] = protect; /* sccs=0 | acc=0 | tpgs=0 | pc3=0 */ - inq_response[7] = cmdque; /* wbus16=0 | sync=0 | vs=0 */ - strncpy(&inq_response[8], "NVMe ", 8); - strncpy(&inq_response[16], ctrl->model, 16); - - while (ctrl->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4) - fw_offset--; - fw_offset -= 4; - strncpy(&inq_response[32], ctrl->firmware_rev + fw_offset, 4); - - xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); - return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); -} - -static int nvme_trans_supported_vpd_pages(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *inq_response, - int alloc_len) -{ - int xfer_len; - - memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); - inq_response[1] = INQ_SUPPORTED_VPD_PAGES_PAGE; /* Page Code */ - inq_response[3] = INQ_NUM_SUPPORTED_VPD_PAGES; /* Page Length */ - inq_response[4] = INQ_SUPPORTED_VPD_PAGES_PAGE; - inq_response[5] = INQ_UNIT_SERIAL_NUMBER_PAGE; - inq_response[6] = INQ_DEVICE_IDENTIFICATION_PAGE; - inq_response[7] = INQ_EXTENDED_INQUIRY_DATA_PAGE; - inq_response[8] = INQ_BDEV_CHARACTERISTICS_PAGE; - inq_response[9] = INQ_BDEV_LIMITS_PAGE; - - xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); - return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); -} - -static int nvme_trans_unit_serial_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *inq_response, - int alloc_len) -{ - int xfer_len; - - memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); - inq_response[1] = INQ_UNIT_SERIAL_NUMBER_PAGE; /* Page Code */ - inq_response[3] = INQ_SERIAL_NUMBER_LENGTH; /* Page Length */ - strncpy(&inq_response[4], ns->ctrl->serial, INQ_SERIAL_NUMBER_LENGTH); - - xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); - return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); -} - -static int nvme_fill_device_id_eui64(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *inq_response, int alloc_len) -{ - struct nvme_id_ns *id_ns; - int nvme_sc, res; - size_t len; - void *eui; - - nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - eui = id_ns->eui64; - len = sizeof(id_ns->eui64); - - if (ns->ctrl->vs >= NVME_VS(1, 2, 0)) { - if (bitmap_empty(eui, len * 8)) { - eui = id_ns->nguid; - len = sizeof(id_ns->nguid); - } - } - - if (bitmap_empty(eui, len * 8)) { - res = -EOPNOTSUPP; - goto out_free_id; - } - - memset(inq_response, 0, alloc_len); - inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; - inq_response[3] = 4 + len; /* Page Length */ - - /* Designation Descriptor start */ - inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ - inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */ - inq_response[6] = 0x00; /* Rsvd */ - inq_response[7] = len; /* Designator Length */ - memcpy(&inq_response[8], eui, len); - - res = nvme_trans_copy_to_user(hdr, inq_response, alloc_len); -out_free_id: - kfree(id_ns); - return res; -} - -static int nvme_fill_device_id_scsi_string(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len) -{ - struct nvme_ctrl *ctrl = ns->ctrl; - struct nvme_id_ctrl *id_ctrl; - int nvme_sc, res; - - if (alloc_len < 72) { - return nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - - nvme_sc = nvme_identify_ctrl(ctrl, &id_ctrl); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - memset(inq_response, 0, alloc_len); - inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; - inq_response[3] = 0x48; /* Page Length */ - - /* Designation Descriptor start */ - inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */ - inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */ - inq_response[6] = 0x00; /* Rsvd */ - inq_response[7] = 0x44; /* Designator Length */ - - sprintf(&inq_response[8], "%04x", le16_to_cpu(id_ctrl->vid)); - memcpy(&inq_response[12], ctrl->model, sizeof(ctrl->model)); - sprintf(&inq_response[52], "%04x", cpu_to_be32(ns->ns_id)); - memcpy(&inq_response[56], ctrl->serial, sizeof(ctrl->serial)); - - res = nvme_trans_copy_to_user(hdr, inq_response, alloc_len); - kfree(id_ctrl); - return res; -} - -static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *resp, int alloc_len) -{ - int res; - - if (ns->ctrl->vs >= NVME_VS(1, 1, 0)) { - res = nvme_fill_device_id_eui64(ns, hdr, resp, alloc_len); - if (res != -EOPNOTSUPP) - return res; - } - - return nvme_fill_device_id_scsi_string(ns, hdr, resp, alloc_len); -} - -static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, - int alloc_len) -{ - u8 *inq_response; - int res; - int nvme_sc; - struct nvme_ctrl *ctrl = ns->ctrl; - struct nvme_id_ctrl *id_ctrl; - struct nvme_id_ns *id_ns; - int xfer_len; - u8 microcode = 0x80; - u8 spt; - u8 spt_lut[8] = {0, 0, 2, 1, 4, 6, 5, 7}; - u8 grd_chk, app_chk, ref_chk, protect; - u8 uask_sup = 0x20; - u8 v_sup; - u8 luiclr = 0x01; - - inq_response = kmalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); - if (inq_response == NULL) - return -ENOMEM; - - nvme_sc = nvme_identify_ns(ctrl, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_free_inq; - - spt = spt_lut[id_ns->dpc & 0x07] << 3; - if (id_ns->dps) - protect = 0x01; - else - protect = 0; - kfree(id_ns); - - grd_chk = protect << 2; - app_chk = protect << 1; - ref_chk = protect; - - nvme_sc = nvme_identify_ctrl(ctrl, &id_ctrl); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_free_inq; - - v_sup = id_ctrl->vwc; - kfree(id_ctrl); - - memset(inq_response, 0, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); - inq_response[1] = INQ_EXTENDED_INQUIRY_DATA_PAGE; /* Page Code */ - inq_response[2] = 0x00; /* Page Length MSB */ - inq_response[3] = 0x3C; /* Page Length LSB */ - inq_response[4] = microcode | spt | grd_chk | app_chk | ref_chk; - inq_response[5] = uask_sup; - inq_response[6] = v_sup; - inq_response[7] = luiclr; - inq_response[8] = 0; - inq_response[9] = 0; - - xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); - res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); - - out_free_inq: - kfree(inq_response); - return res; -} - -static int nvme_trans_bdev_limits_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *inq_response, int alloc_len) -{ - __be32 max_sectors = cpu_to_be32( - nvme_block_nr(ns, queue_max_hw_sectors(ns->queue))); - __be32 max_discard = cpu_to_be32(ns->queue->limits.max_discard_sectors); - __be32 discard_desc_count = cpu_to_be32(0x100); - - memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); - inq_response[1] = VPD_BLOCK_LIMITS; - inq_response[3] = 0x3c; /* Page Length */ - memcpy(&inq_response[8], &max_sectors, sizeof(u32)); - memcpy(&inq_response[20], &max_discard, sizeof(u32)); - - if (max_discard) - memcpy(&inq_response[24], &discard_desc_count, sizeof(u32)); - - return nvme_trans_copy_to_user(hdr, inq_response, 0x3c); -} - -static int nvme_trans_bdev_char_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, - int alloc_len) -{ - u8 *inq_response; - int res; - int xfer_len; - - inq_response = kzalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); - if (inq_response == NULL) { - res = -ENOMEM; - goto out_mem; - } - - inq_response[1] = INQ_BDEV_CHARACTERISTICS_PAGE; /* Page Code */ - inq_response[2] = 0x00; /* Page Length MSB */ - inq_response[3] = 0x3C; /* Page Length LSB */ - inq_response[4] = 0x00; /* Medium Rotation Rate MSB */ - inq_response[5] = 0x01; /* Medium Rotation Rate LSB */ - inq_response[6] = 0x00; /* Form Factor */ - - xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); - res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); - - kfree(inq_response); - out_mem: - return res; -} - -/* LOG SENSE Helper Functions */ - -static int nvme_trans_log_supp_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, - int alloc_len) -{ - int res; - int xfer_len; - u8 *log_response; - - log_response = kzalloc(LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH, GFP_KERNEL); - if (log_response == NULL) { - res = -ENOMEM; - goto out_mem; - } - - log_response[0] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE; - /* Subpage=0x00, Page Length MSB=0 */ - log_response[3] = SUPPORTED_LOG_PAGES_PAGE_LENGTH; - log_response[4] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE; - log_response[5] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE; - log_response[6] = LOG_PAGE_TEMPERATURE_PAGE; - - xfer_len = min(alloc_len, LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH); - res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); - - kfree(log_response); - out_mem: - return res; -} - -static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, - struct sg_io_hdr *hdr, int alloc_len) -{ - int res; - int xfer_len; - u8 *log_response; - struct nvme_smart_log *smart_log; - u8 temp_c; - u16 temp_k; - - log_response = kzalloc(LOG_INFO_EXCP_PAGE_LENGTH, GFP_KERNEL); - if (log_response == NULL) - return -ENOMEM; - - res = nvme_get_log_page(ns->ctrl, &smart_log); - if (res < 0) - goto out_free_response; - - if (res != NVME_SC_SUCCESS) { - temp_c = LOG_TEMP_UNKNOWN; - } else { - temp_k = (smart_log->temperature[1] << 8) + - (smart_log->temperature[0]); - temp_c = temp_k - KELVIN_TEMP_FACTOR; - } - kfree(smart_log); - - log_response[0] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE; - /* Subpage=0x00, Page Length MSB=0 */ - log_response[3] = REMAINING_INFO_EXCP_PAGE_LENGTH; - /* Informational Exceptions Log Parameter 1 Start */ - /* Parameter Code=0x0000 bytes 4,5 */ - log_response[6] = 0x23; /* DU=0, TSD=1, ETC=0, TMC=0, FMT_AND_LNK=11b */ - log_response[7] = 0x04; /* PARAMETER LENGTH */ - /* Add sense Code and qualifier = 0x00 each */ - /* Use Temperature from NVMe Get Log Page, convert to C from K */ - log_response[10] = temp_c; - - xfer_len = min(alloc_len, LOG_INFO_EXCP_PAGE_LENGTH); - res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); - - out_free_response: - kfree(log_response); - return res; -} - -static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, - int alloc_len) -{ - int res; - int xfer_len; - u8 *log_response; - struct nvme_smart_log *smart_log; - u32 feature_resp; - u8 temp_c_cur, temp_c_thresh; - u16 temp_k; - - log_response = kzalloc(LOG_TEMP_PAGE_LENGTH, GFP_KERNEL); - if (log_response == NULL) - return -ENOMEM; - - res = nvme_get_log_page(ns->ctrl, &smart_log); - if (res < 0) - goto out_free_response; - - if (res != NVME_SC_SUCCESS) { - temp_c_cur = LOG_TEMP_UNKNOWN; - } else { - temp_k = (smart_log->temperature[1] << 8) + - (smart_log->temperature[0]); - temp_c_cur = temp_k - KELVIN_TEMP_FACTOR; - } - kfree(smart_log); - - /* Get Features for Temp Threshold */ - res = nvme_get_features(ns->ctrl, NVME_FEAT_TEMP_THRESH, 0, NULL, 0, - &feature_resp); - if (res != NVME_SC_SUCCESS) - temp_c_thresh = LOG_TEMP_UNKNOWN; - else - temp_c_thresh = (feature_resp & 0xFFFF) - KELVIN_TEMP_FACTOR; - - log_response[0] = LOG_PAGE_TEMPERATURE_PAGE; - /* Subpage=0x00, Page Length MSB=0 */ - log_response[3] = REMAINING_TEMP_PAGE_LENGTH; - /* Temperature Log Parameter 1 (Temperature) Start */ - /* Parameter Code = 0x0000 */ - log_response[6] = 0x01; /* Format and Linking = 01b */ - log_response[7] = 0x02; /* Parameter Length */ - /* Use Temperature from NVMe Get Log Page, convert to C from K */ - log_response[9] = temp_c_cur; - /* Temperature Log Parameter 2 (Reference Temperature) Start */ - log_response[11] = 0x01; /* Parameter Code = 0x0001 */ - log_response[12] = 0x01; /* Format and Linking = 01b */ - log_response[13] = 0x02; /* Parameter Length */ - /* Use Temperature Thresh from NVMe Get Log Page, convert to C from K */ - log_response[15] = temp_c_thresh; - - xfer_len = min(alloc_len, LOG_TEMP_PAGE_LENGTH); - res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); - - out_free_response: - kfree(log_response); - return res; -} - -/* MODE SENSE Helper Functions */ - -static int nvme_trans_fill_mode_parm_hdr(u8 *resp, int len, u8 cdb10, u8 llbaa, - u16 mode_data_length, u16 blk_desc_len) -{ - /* Quick check to make sure I don't stomp on my own memory... */ - if ((cdb10 && len < 8) || (!cdb10 && len < 4)) - return -EINVAL; - - if (cdb10) { - resp[0] = (mode_data_length & 0xFF00) >> 8; - resp[1] = (mode_data_length & 0x00FF); - resp[3] = 0x10 /* DPOFUA */; - resp[4] = llbaa; - resp[5] = RESERVED_FIELD; - resp[6] = (blk_desc_len & 0xFF00) >> 8; - resp[7] = (blk_desc_len & 0x00FF); - } else { - resp[0] = (mode_data_length & 0x00FF); - resp[2] = 0x10 /* DPOFUA */; - resp[3] = (blk_desc_len & 0x00FF); - } - - return 0; -} - -static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *resp, int len, u8 llbaa) -{ - int res; - int nvme_sc; - struct nvme_id_ns *id_ns; - u8 flbas; - u32 lba_length; - - if (llbaa == 0 && len < MODE_PAGE_BLK_DES_LEN) - return -EINVAL; - else if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN) - return -EINVAL; - - nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - flbas = (id_ns->flbas) & 0x0F; - lba_length = (1 << (id_ns->lbaf[flbas].ds)); - - if (llbaa == 0) { - __be32 tmp_cap = cpu_to_be32(le64_to_cpu(id_ns->ncap)); - /* Byte 4 is reserved */ - __be32 tmp_len = cpu_to_be32(lba_length & 0x00FFFFFF); - - memcpy(resp, &tmp_cap, sizeof(u32)); - memcpy(&resp[4], &tmp_len, sizeof(u32)); - } else { - __be64 tmp_cap = cpu_to_be64(le64_to_cpu(id_ns->ncap)); - __be32 tmp_len = cpu_to_be32(lba_length); - - memcpy(resp, &tmp_cap, sizeof(u64)); - /* Bytes 8, 9, 10, 11 are reserved */ - memcpy(&resp[12], &tmp_len, sizeof(u32)); - } - - kfree(id_ns); - return res; -} - -static int nvme_trans_fill_control_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *resp, - int len) -{ - if (len < MODE_PAGE_CONTROL_LEN) - return -EINVAL; - - resp[0] = MODE_PAGE_CONTROL; - resp[1] = MODE_PAGE_CONTROL_LEN_FIELD; - resp[2] = 0x0E; /* TST=000b, TMF_ONLY=0, DPICZ=1, - * D_SENSE=1, GLTSD=1, RLEC=0 */ - resp[3] = 0x12; /* Q_ALGO_MODIFIER=1h, NUAR=0, QERR=01b */ - /* Byte 4: VS=0, RAC=0, UA_INT=0, SWP=0 */ - resp[5] = 0x40; /* ATO=0, TAS=1, ATMPE=0, RWWP=0, AUTOLOAD=0 */ - /* resp[6] and [7] are obsolete, thus zero */ - resp[8] = 0xFF; /* Busy timeout period = 0xffff */ - resp[9] = 0xFF; - /* Bytes 10,11: Extended selftest completion time = 0x0000 */ - - return 0; -} - -static int nvme_trans_fill_caching_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, - u8 *resp, int len) -{ - int res = 0; - int nvme_sc; - u32 feature_resp; - u8 vwc; - - if (len < MODE_PAGE_CACHING_LEN) - return -EINVAL; - - nvme_sc = nvme_get_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, 0, NULL, 0, - &feature_resp); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - vwc = feature_resp & 0x00000001; - - resp[0] = MODE_PAGE_CACHING; - resp[1] = MODE_PAGE_CACHING_LEN_FIELD; - resp[2] = vwc << 2; - return 0; -} - -static int nvme_trans_fill_pow_cnd_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *resp, - int len) -{ - if (len < MODE_PAGE_POW_CND_LEN) - return -EINVAL; - - resp[0] = MODE_PAGE_POWER_CONDITION; - resp[1] = MODE_PAGE_POW_CND_LEN_FIELD; - /* All other bytes are zero */ - - return 0; -} - -static int nvme_trans_fill_inf_exc_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *resp, - int len) -{ - if (len < MODE_PAGE_INF_EXC_LEN) - return -EINVAL; - - resp[0] = MODE_PAGE_INFO_EXCEP; - resp[1] = MODE_PAGE_INF_EXC_LEN_FIELD; - resp[2] = 0x88; - /* All other bytes are zero */ - - return 0; -} - -static int nvme_trans_fill_all_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *resp, int len) -{ - int res; - u16 mode_pages_offset_1 = 0; - u16 mode_pages_offset_2, mode_pages_offset_3, mode_pages_offset_4; - - mode_pages_offset_2 = mode_pages_offset_1 + MODE_PAGE_CACHING_LEN; - mode_pages_offset_3 = mode_pages_offset_2 + MODE_PAGE_CONTROL_LEN; - mode_pages_offset_4 = mode_pages_offset_3 + MODE_PAGE_POW_CND_LEN; - - res = nvme_trans_fill_caching_page(ns, hdr, &resp[mode_pages_offset_1], - MODE_PAGE_CACHING_LEN); - if (res) - return res; - res = nvme_trans_fill_control_page(ns, hdr, &resp[mode_pages_offset_2], - MODE_PAGE_CONTROL_LEN); - if (res) - return res; - res = nvme_trans_fill_pow_cnd_page(ns, hdr, &resp[mode_pages_offset_3], - MODE_PAGE_POW_CND_LEN); - if (res) - return res; - return nvme_trans_fill_inf_exc_page(ns, hdr, &resp[mode_pages_offset_4], - MODE_PAGE_INF_EXC_LEN); -} - -static inline int nvme_trans_get_blk_desc_len(u8 dbd, u8 llbaa) -{ - if (dbd == MODE_SENSE_BLK_DESC_ENABLED) { - /* SPC-4: len = 8 x Num_of_descriptors if llbaa = 0, 16x if 1 */ - return 8 * (llbaa + 1) * MODE_SENSE_BLK_DESC_COUNT; - } else { - return 0; - } -} - -static int nvme_trans_mode_page_create(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *cmd, - u16 alloc_len, u8 cdb10, - int (*mode_page_fill_func) - (struct nvme_ns *, - struct sg_io_hdr *hdr, u8 *, int), - u16 mode_pages_tot_len) -{ - int res; - int xfer_len; - u8 *response; - u8 dbd, llbaa; - u16 resp_size; - int mph_size; - u16 mode_pages_offset_1; - u16 blk_desc_len, blk_desc_offset, mode_data_length; - - dbd = (cmd[1] & MODE_SENSE_DBD_MASK) >> MODE_SENSE_DBD_SHIFT; - llbaa = (cmd[1] & MODE_SENSE_LLBAA_MASK) >> MODE_SENSE_LLBAA_SHIFT; - mph_size = cdb10 ? MODE_SENSE10_MPH_SIZE : MODE_SENSE6_MPH_SIZE; - - blk_desc_len = nvme_trans_get_blk_desc_len(dbd, llbaa); - - resp_size = mph_size + blk_desc_len + mode_pages_tot_len; - /* Refer spc4r34 Table 440 for calculation of Mode data Length field */ - mode_data_length = 3 + (3 * cdb10) + blk_desc_len + mode_pages_tot_len; - - blk_desc_offset = mph_size; - mode_pages_offset_1 = blk_desc_offset + blk_desc_len; - - response = kzalloc(resp_size, GFP_KERNEL); - if (response == NULL) { - res = -ENOMEM; - goto out_mem; - } - - res = nvme_trans_fill_mode_parm_hdr(&response[0], mph_size, cdb10, - llbaa, mode_data_length, blk_desc_len); - if (res) - goto out_free; - if (blk_desc_len > 0) { - res = nvme_trans_fill_blk_desc(ns, hdr, - &response[blk_desc_offset], - blk_desc_len, llbaa); - if (res) - goto out_free; - } - res = mode_page_fill_func(ns, hdr, &response[mode_pages_offset_1], - mode_pages_tot_len); - if (res) - goto out_free; - - xfer_len = min(alloc_len, resp_size); - res = nvme_trans_copy_to_user(hdr, response, xfer_len); - - out_free: - kfree(response); - out_mem: - return res; -} - -/* Read Capacity Helper Functions */ - -static void nvme_trans_fill_read_cap(u8 *response, struct nvme_id_ns *id_ns, - u8 cdb16) -{ - u8 flbas; - u32 lba_length; - u64 rlba; - u8 prot_en; - u8 p_type_lut[4] = {0, 0, 1, 2}; - __be64 tmp_rlba; - __be32 tmp_rlba_32; - __be32 tmp_len; - - flbas = (id_ns->flbas) & 0x0F; - lba_length = (1 << (id_ns->lbaf[flbas].ds)); - rlba = le64_to_cpup(&id_ns->nsze) - 1; - (id_ns->dps) ? (prot_en = 0x01) : (prot_en = 0); - - if (!cdb16) { - if (rlba > 0xFFFFFFFF) - rlba = 0xFFFFFFFF; - tmp_rlba_32 = cpu_to_be32(rlba); - tmp_len = cpu_to_be32(lba_length); - memcpy(response, &tmp_rlba_32, sizeof(u32)); - memcpy(&response[4], &tmp_len, sizeof(u32)); - } else { - tmp_rlba = cpu_to_be64(rlba); - tmp_len = cpu_to_be32(lba_length); - memcpy(response, &tmp_rlba, sizeof(u64)); - memcpy(&response[8], &tmp_len, sizeof(u32)); - response[12] = (p_type_lut[id_ns->dps & 0x3] << 1) | prot_en; - /* P_I_Exponent = 0x0 | LBPPBE = 0x0 */ - /* LBPME = 0 | LBPRZ = 0 | LALBA = 0x00 */ - /* Bytes 16-31 - Reserved */ - } -} - -/* Start Stop Unit Helper Functions */ - -static int nvme_trans_send_activate_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 buffer_id) -{ - struct nvme_command c; - int nvme_sc; - - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_admin_activate_fw; - c.common.cdw10[0] = cpu_to_le32(buffer_id | NVME_FWACT_REPL_ACTV); - - nvme_sc = nvme_submit_sync_cmd(ns->queue, &c, NULL, 0); - return nvme_trans_status_code(hdr, nvme_sc); -} - -static int nvme_trans_send_download_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 opcode, u32 tot_len, u32 offset, - u8 buffer_id) -{ - int nvme_sc; - struct nvme_command c; - - if (hdr->iovec_count > 0) { - /* Assuming SGL is not allowed for this command */ - return nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_admin_download_fw; - c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1); - c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS); - - nvme_sc = nvme_submit_user_cmd(ns->ctrl->admin_q, &c, - hdr->dxferp, tot_len, NULL, 0); - return nvme_trans_status_code(hdr, nvme_sc); -} - -/* Mode Select Helper Functions */ - -static inline void nvme_trans_modesel_get_bd_len(u8 *parm_list, u8 cdb10, - u16 *bd_len, u8 *llbaa) -{ - if (cdb10) { - /* 10 Byte CDB */ - *bd_len = (parm_list[MODE_SELECT_10_BD_OFFSET] << 8) + - parm_list[MODE_SELECT_10_BD_OFFSET + 1]; - *llbaa = parm_list[MODE_SELECT_10_LLBAA_OFFSET] & - MODE_SELECT_10_LLBAA_MASK; - } else { - /* 6 Byte CDB */ - *bd_len = parm_list[MODE_SELECT_6_BD_OFFSET]; - } -} - -static void nvme_trans_modesel_save_bd(struct nvme_ns *ns, u8 *parm_list, - u16 idx, u16 bd_len, u8 llbaa) -{ - /* Store block descriptor info if a FORMAT UNIT comes later */ - /* TODO Saving 1st BD info; what to do if multiple BD received? */ - if (llbaa == 0) { - /* Standard Block Descriptor - spc4r34 7.5.5.1 */ - ns->mode_select_num_blocks = - (parm_list[idx + 1] << 16) + - (parm_list[idx + 2] << 8) + - (parm_list[idx + 3]); - - ns->mode_select_block_len = - (parm_list[idx + 5] << 16) + - (parm_list[idx + 6] << 8) + - (parm_list[idx + 7]); - } else { - /* Long LBA Block Descriptor - sbc3r27 6.4.2.3 */ - ns->mode_select_num_blocks = - (((u64)parm_list[idx + 0]) << 56) + - (((u64)parm_list[idx + 1]) << 48) + - (((u64)parm_list[idx + 2]) << 40) + - (((u64)parm_list[idx + 3]) << 32) + - (((u64)parm_list[idx + 4]) << 24) + - (((u64)parm_list[idx + 5]) << 16) + - (((u64)parm_list[idx + 6]) << 8) + - ((u64)parm_list[idx + 7]); - - ns->mode_select_block_len = - (parm_list[idx + 12] << 24) + - (parm_list[idx + 13] << 16) + - (parm_list[idx + 14] << 8) + - (parm_list[idx + 15]); - } -} - -static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *mode_page, u8 page_code) -{ - int res = 0; - int nvme_sc; - unsigned dword11; - - switch (page_code) { - case MODE_PAGE_CACHING: - dword11 = ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 1 : 0); - nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, - dword11, NULL, 0, NULL); - res = nvme_trans_status_code(hdr, nvme_sc); - break; - case MODE_PAGE_CONTROL: - break; - case MODE_PAGE_POWER_CONDITION: - /* Verify the OS is not trying to set timers */ - if ((mode_page[2] & 0x01) != 0 || (mode_page[3] & 0x0F) != 0) { - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_PARAMETER, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - break; - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - - return res; -} - -static int nvme_trans_modesel_data(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd, u16 parm_list_len, u8 pf, - u8 sp, u8 cdb10) -{ - int res; - u8 *parm_list; - u16 bd_len; - u8 llbaa = 0; - u16 index, saved_index; - u8 page_code; - u16 mp_size; - - /* Get parm list from data-in/out buffer */ - parm_list = kmalloc(parm_list_len, GFP_KERNEL); - if (parm_list == NULL) { - res = -ENOMEM; - goto out; - } - - res = nvme_trans_copy_from_user(hdr, parm_list, parm_list_len); - if (res) - goto out_mem; - - nvme_trans_modesel_get_bd_len(parm_list, cdb10, &bd_len, &llbaa); - index = (cdb10) ? (MODE_SELECT_10_MPH_SIZE) : (MODE_SELECT_6_MPH_SIZE); - - if (bd_len != 0) { - /* Block Descriptors present, parse */ - nvme_trans_modesel_save_bd(ns, parm_list, index, bd_len, llbaa); - index += bd_len; - } - saved_index = index; - - /* Multiple mode pages may be present; iterate through all */ - /* In 1st Iteration, don't do NVME Command, only check for CDB errors */ - do { - page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK; - mp_size = parm_list[index + 1] + 2; - if ((page_code != MODE_PAGE_CACHING) && - (page_code != MODE_PAGE_CONTROL) && - (page_code != MODE_PAGE_POWER_CONDITION)) { - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out_mem; - } - index += mp_size; - } while (index < parm_list_len); - - /* In 2nd Iteration, do the NVME Commands */ - index = saved_index; - do { - page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK; - mp_size = parm_list[index + 1] + 2; - res = nvme_trans_modesel_get_mp(ns, hdr, &parm_list[index], - page_code); - if (res) - break; - index += mp_size; - } while (index < parm_list_len); - - out_mem: - kfree(parm_list); - out: - return res; -} - -/* Format Unit Helper Functions */ - -static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns, - struct sg_io_hdr *hdr) -{ - int res = 0; - int nvme_sc; - u8 flbas; - - /* - * SCSI Expects a MODE SELECT would have been issued prior to - * a FORMAT UNIT, and the block size and number would be used - * from the block descriptor in it. If a MODE SELECT had not - * been issued, FORMAT shall use the current values for both. - */ - - if (ns->mode_select_num_blocks == 0 || ns->mode_select_block_len == 0) { - struct nvme_id_ns *id_ns; - - nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - if (ns->mode_select_num_blocks == 0) - ns->mode_select_num_blocks = le64_to_cpu(id_ns->ncap); - if (ns->mode_select_block_len == 0) { - flbas = (id_ns->flbas) & 0x0F; - ns->mode_select_block_len = - (1 << (id_ns->lbaf[flbas].ds)); - } - - kfree(id_ns); - } - - return 0; -} - -static int nvme_trans_fmt_get_parm_header(struct sg_io_hdr *hdr, u8 len, - u8 format_prot_info, u8 *nvme_pf_code) -{ - int res; - u8 *parm_list; - u8 pf_usage, pf_code; - - parm_list = kmalloc(len, GFP_KERNEL); - if (parm_list == NULL) { - res = -ENOMEM; - goto out; - } - res = nvme_trans_copy_from_user(hdr, parm_list, len); - if (res) - goto out_mem; - - if ((parm_list[FORMAT_UNIT_IMMED_OFFSET] & - FORMAT_UNIT_IMMED_MASK) != 0) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out_mem; - } - - if (len == FORMAT_UNIT_LONG_PARM_LIST_LEN && - (parm_list[FORMAT_UNIT_PROT_INT_OFFSET] & 0x0F) != 0) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out_mem; - } - pf_usage = parm_list[FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET] & - FORMAT_UNIT_PROT_FIELD_USAGE_MASK; - pf_code = (pf_usage << 2) | format_prot_info; - switch (pf_code) { - case 0: - *nvme_pf_code = 0; - break; - case 2: - *nvme_pf_code = 1; - break; - case 3: - *nvme_pf_code = 2; - break; - case 7: - *nvme_pf_code = 3; - break; - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - - out_mem: - kfree(parm_list); - out: - return res; -} - -static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 prot_info) -{ - int res; - int nvme_sc; - struct nvme_id_ns *id_ns; - u8 i; - u8 nlbaf; - u8 selected_lbaf = 0xFF; - u32 cdw10 = 0; - struct nvme_command c; - - /* Loop thru LBAF's in id_ns to match reqd lbaf, put in cdw10 */ - nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - nlbaf = id_ns->nlbaf; - - for (i = 0; i < nlbaf; i++) { - if (ns->mode_select_block_len == (1 << (id_ns->lbaf[i].ds))) { - selected_lbaf = i; - break; - } - } - if (selected_lbaf > 0x0F) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - if (ns->mode_select_num_blocks != le64_to_cpu(id_ns->ncap)) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - - cdw10 |= prot_info << 5; - cdw10 |= selected_lbaf & 0x0F; - memset(&c, 0, sizeof(c)); - c.format.opcode = nvme_admin_format_nvm; - c.format.nsid = cpu_to_le32(ns->ns_id); - c.format.cdw10 = cpu_to_le32(cdw10); - - nvme_sc = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, NULL, 0); - res = nvme_trans_status_code(hdr, nvme_sc); - - kfree(id_ns); - return res; -} - -static inline u32 nvme_trans_io_get_num_cmds(struct sg_io_hdr *hdr, - struct nvme_trans_io_cdb *cdb_info, - u32 max_blocks) -{ - /* If using iovecs, send one nvme command per vector */ - if (hdr->iovec_count > 0) - return hdr->iovec_count; - else if (cdb_info->xfer_len > max_blocks) - return ((cdb_info->xfer_len - 1) / max_blocks) + 1; - else - return 1; -} - -static u16 nvme_trans_io_get_control(struct nvme_ns *ns, - struct nvme_trans_io_cdb *cdb_info) -{ - u16 control = 0; - - /* When Protection information support is added, implement here */ - - if (cdb_info->fua > 0) - control |= NVME_RW_FUA; - - return control; -} - -static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, - struct nvme_trans_io_cdb *cdb_info, u8 is_write) -{ - int nvme_sc = NVME_SC_SUCCESS; - u32 num_cmds; - u64 unit_len; - u64 unit_num_blocks; /* Number of blocks to xfer in each nvme cmd */ - u32 retcode; - u32 i = 0; - u64 nvme_offset = 0; - void __user *next_mapping_addr; - struct nvme_command c; - u8 opcode = (is_write ? nvme_cmd_write : nvme_cmd_read); - u16 control; - u32 max_blocks = queue_max_hw_sectors(ns->queue) >> (ns->lba_shift - 9); - - num_cmds = nvme_trans_io_get_num_cmds(hdr, cdb_info, max_blocks); - - /* - * This loop handles two cases. - * First, when an SGL is used in the form of an iovec list: - * - Use iov_base as the next mapping address for the nvme command_id - * - Use iov_len as the data transfer length for the command. - * Second, when we have a single buffer - * - If larger than max_blocks, split into chunks, offset - * each nvme command accordingly. - */ - for (i = 0; i < num_cmds; i++) { - memset(&c, 0, sizeof(c)); - if (hdr->iovec_count > 0) { - struct sg_iovec sgl; - - retcode = copy_from_user(&sgl, hdr->dxferp + - i * sizeof(struct sg_iovec), - sizeof(struct sg_iovec)); - if (retcode) - return -EFAULT; - unit_len = sgl.iov_len; - unit_num_blocks = unit_len >> ns->lba_shift; - next_mapping_addr = sgl.iov_base; - } else { - unit_num_blocks = min((u64)max_blocks, - (cdb_info->xfer_len - nvme_offset)); - unit_len = unit_num_blocks << ns->lba_shift; - next_mapping_addr = hdr->dxferp + - ((1 << ns->lba_shift) * nvme_offset); - } - - c.rw.opcode = opcode; - c.rw.nsid = cpu_to_le32(ns->ns_id); - c.rw.slba = cpu_to_le64(cdb_info->lba + nvme_offset); - c.rw.length = cpu_to_le16(unit_num_blocks - 1); - control = nvme_trans_io_get_control(ns, cdb_info); - c.rw.control = cpu_to_le16(control); - - if (get_capacity(ns->disk) - unit_num_blocks < - cdb_info->lba + nvme_offset) { - nvme_sc = NVME_SC_LBA_RANGE; - break; - } - nvme_sc = nvme_submit_user_cmd(ns->queue, &c, - next_mapping_addr, unit_len, NULL, 0); - if (nvme_sc) - break; - - nvme_offset += unit_num_blocks; - } - - return nvme_trans_status_code(hdr, nvme_sc); -} - - -/* SCSI Command Translation Functions */ - -static int nvme_trans_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 is_write, - u8 *cmd) -{ - int res = 0; - struct nvme_trans_io_cdb cdb_info = { 0, }; - u8 opcode = cmd[0]; - u64 xfer_bytes; - u64 sum_iov_len = 0; - struct sg_iovec sgl; - int i; - size_t not_copied; - - /* - * The FUA and WPROTECT fields are not supported in 6-byte CDBs, - * but always in the same place for all others. - */ - switch (opcode) { - case WRITE_6: - case READ_6: - break; - default: - cdb_info.fua = cmd[1] & 0x8; - cdb_info.prot_info = (cmd[1] & 0xe0) >> 5; - if (cdb_info.prot_info && !ns->pi_type) { - return nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - } - - switch (opcode) { - case WRITE_6: - case READ_6: - cdb_info.lba = get_unaligned_be24(&cmd[1]); - cdb_info.xfer_len = cmd[4]; - if (cdb_info.xfer_len == 0) - cdb_info.xfer_len = 256; - break; - case WRITE_10: - case READ_10: - cdb_info.lba = get_unaligned_be32(&cmd[2]); - cdb_info.xfer_len = get_unaligned_be16(&cmd[7]); - break; - case WRITE_12: - case READ_12: - cdb_info.lba = get_unaligned_be32(&cmd[2]); - cdb_info.xfer_len = get_unaligned_be32(&cmd[6]); - break; - case WRITE_16: - case READ_16: - cdb_info.lba = get_unaligned_be64(&cmd[2]); - cdb_info.xfer_len = get_unaligned_be32(&cmd[10]); - break; - default: - /* Will never really reach here */ - res = -EIO; - goto out; - } - - /* Calculate total length of transfer (in bytes) */ - if (hdr->iovec_count > 0) { - for (i = 0; i < hdr->iovec_count; i++) { - not_copied = copy_from_user(&sgl, hdr->dxferp + - i * sizeof(struct sg_iovec), - sizeof(struct sg_iovec)); - if (not_copied) - return -EFAULT; - sum_iov_len += sgl.iov_len; - /* IO vector sizes should be multiples of block size */ - if (sgl.iov_len % (1 << ns->lba_shift) != 0) { - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_PARAMETER, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - } - } else { - sum_iov_len = hdr->dxfer_len; - } - - /* As Per sg ioctl howto, if the lengths differ, use the lower one */ - xfer_bytes = min(((u64)hdr->dxfer_len), sum_iov_len); - - /* If block count and actual data buffer size dont match, error out */ - if (xfer_bytes != (cdb_info.xfer_len << ns->lba_shift)) { - res = -EINVAL; - goto out; - } - - /* Check for 0 length transfer - it is not illegal */ - if (cdb_info.xfer_len == 0) - goto out; - - /* Send NVMe IO Command(s) */ - res = nvme_trans_do_nvme_io(ns, hdr, &cdb_info, is_write); - if (res) - goto out; - - out: - return res; -} - -static int nvme_trans_inquiry(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res = 0; - u8 evpd; - u8 page_code; - int alloc_len; - u8 *inq_response; - - evpd = cmd[1] & 0x01; - page_code = cmd[2]; - alloc_len = get_unaligned_be16(&cmd[3]); - - inq_response = kmalloc(max(alloc_len, STANDARD_INQUIRY_LENGTH), - GFP_KERNEL); - if (inq_response == NULL) { - res = -ENOMEM; - goto out_mem; - } - - if (evpd == 0) { - if (page_code == INQ_STANDARD_INQUIRY_PAGE) { - res = nvme_trans_standard_inquiry_page(ns, hdr, - inq_response, alloc_len); - } else { - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - } else { - switch (page_code) { - case VPD_SUPPORTED_PAGES: - res = nvme_trans_supported_vpd_pages(ns, hdr, - inq_response, alloc_len); - break; - case VPD_SERIAL_NUMBER: - res = nvme_trans_unit_serial_page(ns, hdr, inq_response, - alloc_len); - break; - case VPD_DEVICE_IDENTIFIERS: - res = nvme_trans_device_id_page(ns, hdr, inq_response, - alloc_len); - break; - case VPD_EXTENDED_INQUIRY: - res = nvme_trans_ext_inq_page(ns, hdr, alloc_len); - break; - case VPD_BLOCK_LIMITS: - res = nvme_trans_bdev_limits_page(ns, hdr, inq_response, - alloc_len); - break; - case VPD_BLOCK_DEV_CHARACTERISTICS: - res = nvme_trans_bdev_char_page(ns, hdr, alloc_len); - break; - default: - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - } - kfree(inq_response); - out_mem: - return res; -} - -static int nvme_trans_log_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res; - u16 alloc_len; - u8 pc; - u8 page_code; - - if (cmd[1] != LOG_SENSE_CDB_SP_NOT_ENABLED) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - - page_code = cmd[2] & LOG_SENSE_CDB_PAGE_CODE_MASK; - pc = (cmd[2] & LOG_SENSE_CDB_PC_MASK) >> LOG_SENSE_CDB_PC_SHIFT; - if (pc != LOG_SENSE_CDB_PC_CUMULATIVE_VALUES) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - alloc_len = get_unaligned_be16(&cmd[7]); - switch (page_code) { - case LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE: - res = nvme_trans_log_supp_pages(ns, hdr, alloc_len); - break; - case LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE: - res = nvme_trans_log_info_exceptions(ns, hdr, alloc_len); - break; - case LOG_PAGE_TEMPERATURE_PAGE: - res = nvme_trans_log_temperature(ns, hdr, alloc_len); - break; - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - - out: - return res; -} - -static int nvme_trans_mode_select(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - u8 cdb10 = 0; - u16 parm_list_len; - u8 page_format; - u8 save_pages; - - page_format = cmd[1] & MODE_SELECT_CDB_PAGE_FORMAT_MASK; - save_pages = cmd[1] & MODE_SELECT_CDB_SAVE_PAGES_MASK; - - if (cmd[0] == MODE_SELECT) { - parm_list_len = cmd[4]; - } else { - parm_list_len = cmd[7]; - cdb10 = 1; - } - - if (parm_list_len != 0) { - /* - * According to SPC-4 r24, a paramter list length field of 0 - * shall not be considered an error - */ - return nvme_trans_modesel_data(ns, hdr, cmd, parm_list_len, - page_format, save_pages, cdb10); - } - - return 0; -} - -static int nvme_trans_mode_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res = 0; - u16 alloc_len; - u8 cdb10 = 0; - - if (cmd[0] == MODE_SENSE) { - alloc_len = cmd[4]; - } else { - alloc_len = get_unaligned_be16(&cmd[7]); - cdb10 = 1; - } - - if ((cmd[2] & MODE_SENSE_PAGE_CONTROL_MASK) != - MODE_SENSE_PC_CURRENT_VALUES) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - - switch (cmd[2] & MODE_SENSE_PAGE_CODE_MASK) { - case MODE_PAGE_CACHING: - res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, - cdb10, - &nvme_trans_fill_caching_page, - MODE_PAGE_CACHING_LEN); - break; - case MODE_PAGE_CONTROL: - res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, - cdb10, - &nvme_trans_fill_control_page, - MODE_PAGE_CONTROL_LEN); - break; - case MODE_PAGE_POWER_CONDITION: - res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, - cdb10, - &nvme_trans_fill_pow_cnd_page, - MODE_PAGE_POW_CND_LEN); - break; - case MODE_PAGE_INFO_EXCEP: - res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, - cdb10, - &nvme_trans_fill_inf_exc_page, - MODE_PAGE_INF_EXC_LEN); - break; - case MODE_PAGE_RETURN_ALL: - res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, - cdb10, - &nvme_trans_fill_all_pages, - MODE_PAGE_ALL_LEN); - break; - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - - out: - return res; -} - -static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd, u8 cdb16) -{ - int res; - int nvme_sc; - u32 alloc_len; - u32 resp_size; - u32 xfer_len; - struct nvme_id_ns *id_ns; - u8 *response; - - if (cdb16) { - alloc_len = get_unaligned_be32(&cmd[10]); - resp_size = READ_CAP_16_RESP_SIZE; - } else { - alloc_len = READ_CAP_10_RESP_SIZE; - resp_size = READ_CAP_10_RESP_SIZE; - } - - nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - response = kzalloc(resp_size, GFP_KERNEL); - if (response == NULL) { - res = -ENOMEM; - goto out_free_id; - } - nvme_trans_fill_read_cap(response, id_ns, cdb16); - - xfer_len = min(alloc_len, resp_size); - res = nvme_trans_copy_to_user(hdr, response, xfer_len); - - kfree(response); - out_free_id: - kfree(id_ns); - return res; -} - -static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res; - int nvme_sc; - u32 alloc_len, xfer_len, resp_size; - u8 *response; - struct nvme_id_ctrl *id_ctrl; - u32 ll_length, lun_id; - u8 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET; - __be32 tmp_len; - - switch (cmd[2]) { - default: - return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - case ALL_LUNS_RETURNED: - case ALL_WELL_KNOWN_LUNS_RETURNED: - case RESTRICTED_LUNS_RETURNED: - nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - ll_length = le32_to_cpu(id_ctrl->nn) * LUN_ENTRY_SIZE; - resp_size = ll_length + LUN_DATA_HEADER_SIZE; - - alloc_len = get_unaligned_be32(&cmd[6]); - if (alloc_len < resp_size) { - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out_free_id; - } - - response = kzalloc(resp_size, GFP_KERNEL); - if (response == NULL) { - res = -ENOMEM; - goto out_free_id; - } - - /* The first LUN ID will always be 0 per the SAM spec */ - for (lun_id = 0; lun_id < le32_to_cpu(id_ctrl->nn); lun_id++) { - /* - * Set the LUN Id and then increment to the next LUN - * location in the parameter data. - */ - __be64 tmp_id = cpu_to_be64(lun_id); - memcpy(&response[lun_id_offset], &tmp_id, sizeof(u64)); - lun_id_offset += LUN_ENTRY_SIZE; - } - tmp_len = cpu_to_be32(ll_length); - memcpy(response, &tmp_len, sizeof(u32)); - } - - xfer_len = min(alloc_len, resp_size); - res = nvme_trans_copy_to_user(hdr, response, xfer_len); - - kfree(response); - out_free_id: - kfree(id_ctrl); - return res; -} - -static int nvme_trans_request_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res; - u8 alloc_len, xfer_len, resp_size; - u8 desc_format; - u8 *response; - - desc_format = cmd[1] & 0x01; - alloc_len = cmd[4]; - - resp_size = ((desc_format) ? (DESC_FMT_SENSE_DATA_SIZE) : - (FIXED_FMT_SENSE_DATA_SIZE)); - response = kzalloc(resp_size, GFP_KERNEL); - if (response == NULL) { - res = -ENOMEM; - goto out; - } - - if (desc_format) { - /* Descriptor Format Sense Data */ - response[0] = DESC_FORMAT_SENSE_DATA; - response[1] = NO_SENSE; - /* TODO How is LOW POWER CONDITION ON handled? (byte 2) */ - response[2] = SCSI_ASC_NO_SENSE; - response[3] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - /* SDAT_OVFL = 0 | Additional Sense Length = 0 */ - } else { - /* Fixed Format Sense Data */ - response[0] = FIXED_SENSE_DATA; - /* Byte 1 = Obsolete */ - response[2] = NO_SENSE; /* FM, EOM, ILI, SDAT_OVFL = 0 */ - /* Bytes 3-6 - Information - set to zero */ - response[7] = FIXED_SENSE_DATA_ADD_LENGTH; - /* Bytes 8-11 - Cmd Specific Information - set to zero */ - response[12] = SCSI_ASC_NO_SENSE; - response[13] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - /* Byte 14 = Field Replaceable Unit Code = 0 */ - /* Bytes 15-17 - SKSV=0; Sense Key Specific = 0 */ - } - - xfer_len = min(alloc_len, resp_size); - res = nvme_trans_copy_to_user(hdr, response, xfer_len); - - kfree(response); - out: - return res; -} - -static int nvme_trans_synchronize_cache(struct nvme_ns *ns, - struct sg_io_hdr *hdr) -{ - int nvme_sc; - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_cmd_flush; - c.common.nsid = cpu_to_le32(ns->ns_id); - - nvme_sc = nvme_submit_sync_cmd(ns->queue, &c, NULL, 0); - return nvme_trans_status_code(hdr, nvme_sc); -} - -static int nvme_trans_format_unit(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res; - u8 parm_hdr_len = 0; - u8 nvme_pf_code = 0; - u8 format_prot_info, long_list, format_data; - - format_prot_info = (cmd[1] & 0xc0) >> 6; - long_list = cmd[1] & 0x20; - format_data = cmd[1] & 0x10; - - if (format_data != 0) { - if (format_prot_info != 0) { - if (long_list == 0) - parm_hdr_len = FORMAT_UNIT_SHORT_PARM_LIST_LEN; - else - parm_hdr_len = FORMAT_UNIT_LONG_PARM_LIST_LEN; - } - } else if (format_data == 0 && format_prot_info != 0) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - - /* Get parm header from data-in/out buffer */ - /* - * According to the translation spec, the only fields in the parameter - * list we are concerned with are in the header. So allocate only that. - */ - if (parm_hdr_len > 0) { - res = nvme_trans_fmt_get_parm_header(hdr, parm_hdr_len, - format_prot_info, &nvme_pf_code); - if (res) - goto out; - } - - /* Attempt to activate any previously downloaded firmware image */ - res = nvme_trans_send_activate_fw_cmd(ns, hdr, 0); - - /* Determine Block size and count and send format command */ - res = nvme_trans_fmt_set_blk_size_count(ns, hdr); - if (res) - goto out; - - res = nvme_trans_fmt_send_cmd(ns, hdr, nvme_pf_code); - - out: - return res; -} - -static int nvme_trans_test_unit_ready(struct nvme_ns *ns, - struct sg_io_hdr *hdr, - u8 *cmd) -{ - if (nvme_ctrl_ready(ns->ctrl)) - return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - NOT_READY, SCSI_ASC_LUN_NOT_READY, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - else - return nvme_trans_completion(hdr, SAM_STAT_GOOD, NO_SENSE, 0, 0); -} - -static int nvme_trans_write_buffer(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res = 0; - u32 buffer_offset, parm_list_length; - u8 buffer_id, mode; - - parm_list_length = get_unaligned_be24(&cmd[6]); - if (parm_list_length % BYTES_TO_DWORDS != 0) { - /* NVMe expects Firmware file to be a whole number of DWORDS */ - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - buffer_id = cmd[2]; - if (buffer_id > NVME_MAX_FIRMWARE_SLOT) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - mode = cmd[1] & 0x1f; - buffer_offset = get_unaligned_be24(&cmd[3]); - - switch (mode) { - case DOWNLOAD_SAVE_ACTIVATE: - res = nvme_trans_send_download_fw_cmd(ns, hdr, nvme_admin_download_fw, - parm_list_length, buffer_offset, - buffer_id); - if (res) - goto out; - res = nvme_trans_send_activate_fw_cmd(ns, hdr, buffer_id); - break; - case DOWNLOAD_SAVE_DEFER_ACTIVATE: - res = nvme_trans_send_download_fw_cmd(ns, hdr, nvme_admin_download_fw, - parm_list_length, buffer_offset, - buffer_id); - break; - case ACTIVATE_DEFERRED_MICROCODE: - res = nvme_trans_send_activate_fw_cmd(ns, hdr, buffer_id); - break; - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - - out: - return res; -} - -struct scsi_unmap_blk_desc { - __be64 slba; - __be32 nlb; - u32 resv; -}; - -struct scsi_unmap_parm_list { - __be16 unmap_data_len; - __be16 unmap_blk_desc_data_len; - u32 resv; - struct scsi_unmap_blk_desc desc[0]; -}; - -static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - struct scsi_unmap_parm_list *plist; - struct nvme_dsm_range *range; - struct nvme_command c; - int i, nvme_sc, res; - u16 ndesc, list_len; - - list_len = get_unaligned_be16(&cmd[7]); - if (!list_len) - return -EINVAL; - - plist = kmalloc(list_len, GFP_KERNEL); - if (!plist) - return -ENOMEM; - - res = nvme_trans_copy_from_user(hdr, plist, list_len); - if (res) - goto out; - - ndesc = be16_to_cpu(plist->unmap_blk_desc_data_len) >> 4; - if (!ndesc || ndesc > 256) { - res = -EINVAL; - goto out; - } - - range = kcalloc(ndesc, sizeof(*range), GFP_KERNEL); - if (!range) { - res = -ENOMEM; - goto out; - } - - for (i = 0; i < ndesc; i++) { - range[i].nlb = cpu_to_le32(be32_to_cpu(plist->desc[i].nlb)); - range[i].slba = cpu_to_le64(be64_to_cpu(plist->desc[i].slba)); - range[i].cattr = 0; - } - - memset(&c, 0, sizeof(c)); - c.dsm.opcode = nvme_cmd_dsm; - c.dsm.nsid = cpu_to_le32(ns->ns_id); - c.dsm.nr = cpu_to_le32(ndesc - 1); - c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - - nvme_sc = nvme_submit_sync_cmd(ns->queue, &c, range, - ndesc * sizeof(*range)); - res = nvme_trans_status_code(hdr, nvme_sc); - - kfree(range); - out: - kfree(plist); - return res; -} - -static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr) -{ - u8 cmd[16]; - int retcode; - unsigned int opcode; - - if (hdr->cmdp == NULL) - return -EMSGSIZE; - if (hdr->cmd_len > sizeof(cmd)) - return -EINVAL; - if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len)) - return -EFAULT; - - /* - * Prime the hdr with good status for scsi commands that don't require - * an nvme command for translation. - */ - retcode = nvme_trans_status_code(hdr, NVME_SC_SUCCESS); - if (retcode) - return retcode; - - opcode = cmd[0]; - - switch (opcode) { - case READ_6: - case READ_10: - case READ_12: - case READ_16: - retcode = nvme_trans_io(ns, hdr, 0, cmd); - break; - case WRITE_6: - case WRITE_10: - case WRITE_12: - case WRITE_16: - retcode = nvme_trans_io(ns, hdr, 1, cmd); - break; - case INQUIRY: - retcode = nvme_trans_inquiry(ns, hdr, cmd); - break; - case LOG_SENSE: - retcode = nvme_trans_log_sense(ns, hdr, cmd); - break; - case MODE_SELECT: - case MODE_SELECT_10: - retcode = nvme_trans_mode_select(ns, hdr, cmd); - break; - case MODE_SENSE: - case MODE_SENSE_10: - retcode = nvme_trans_mode_sense(ns, hdr, cmd); - break; - case READ_CAPACITY: - retcode = nvme_trans_read_capacity(ns, hdr, cmd, 0); - break; - case SERVICE_ACTION_IN_16: - switch (cmd[1]) { - case SAI_READ_CAPACITY_16: - retcode = nvme_trans_read_capacity(ns, hdr, cmd, 1); - break; - default: - goto out; - } - break; - case REPORT_LUNS: - retcode = nvme_trans_report_luns(ns, hdr, cmd); - break; - case REQUEST_SENSE: - retcode = nvme_trans_request_sense(ns, hdr, cmd); - break; - case SYNCHRONIZE_CACHE: - retcode = nvme_trans_synchronize_cache(ns, hdr); - break; - case FORMAT_UNIT: - retcode = nvme_trans_format_unit(ns, hdr, cmd); - break; - case TEST_UNIT_READY: - retcode = nvme_trans_test_unit_ready(ns, hdr, cmd); - break; - case WRITE_BUFFER: - retcode = nvme_trans_write_buffer(ns, hdr, cmd); - break; - case UNMAP: - retcode = nvme_trans_unmap(ns, hdr, cmd); - break; - default: - out: - retcode = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - return retcode; -} - -int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr) -{ - struct sg_io_hdr hdr; - int retcode; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&hdr, u_hdr, sizeof(hdr))) - return -EFAULT; - if (hdr.interface_id != 'S') - return -EINVAL; - - /* - * A positive return code means a NVMe status, which has been - * translated to sense data. - */ - retcode = nvme_scsi_translate(ns, &hdr); - if (retcode < 0) - return retcode; - if (copy_to_user(u_hdr, &hdr, sizeof(sg_io_hdr_t)) > 0) - return -EFAULT; - return 0; -} - -int nvme_sg_get_version_num(int __user *ip) -{ - return put_user(sg_version_num, ip); -} diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index ff1f97006322..35f930db3c02 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -336,7 +336,7 @@ out: static void nvmet_execute_identify_nslist(struct nvmet_req *req) { - static const int buf_size = 4096; + static const int buf_size = NVME_IDENTIFY_DATA_SIZE; struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvmet_ns *ns; u32 min_nsid = le32_to_cpu(req->cmd->identify.nsid); @@ -367,6 +367,64 @@ out: nvmet_req_complete(req, status); } +static u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len, + void *id, off_t *off) +{ + struct nvme_ns_id_desc desc = { + .nidt = type, + .nidl = len, + }; + u16 status; + + status = nvmet_copy_to_sgl(req, *off, &desc, sizeof(desc)); + if (status) + return status; + *off += sizeof(desc); + + status = nvmet_copy_to_sgl(req, *off, id, len); + if (status) + return status; + *off += len; + + return 0; +} + +static void nvmet_execute_identify_desclist(struct nvmet_req *req) +{ + struct nvmet_ns *ns; + u16 status = 0; + off_t off = 0; + + ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid); + if (!ns) { + status = NVME_SC_INVALID_NS | NVME_SC_DNR; + goto out; + } + + if (memchr_inv(&ns->uuid, 0, sizeof(ns->uuid))) { + status = nvmet_copy_ns_identifier(req, NVME_NIDT_UUID, + NVME_NIDT_UUID_LEN, + &ns->uuid, &off); + if (status) + goto out_put_ns; + } + if (memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) { + status = nvmet_copy_ns_identifier(req, NVME_NIDT_NGUID, + NVME_NIDT_NGUID_LEN, + &ns->nguid, &off); + if (status) + goto out_put_ns; + } + + if (sg_zero_buffer(req->sg, req->sg_cnt, NVME_IDENTIFY_DATA_SIZE - off, + off) != NVME_IDENTIFY_DATA_SIZE - off) + status = NVME_SC_INTERNAL | NVME_SC_DNR; +out_put_ns: + nvmet_put_namespace(ns); +out: + nvmet_req_complete(req, status); +} + /* * A "mimimum viable" abort implementation: the command is mandatory in the * spec, but we are not required to do any useful work. We couldn't really @@ -504,7 +562,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) } break; case nvme_admin_identify: - req->data_len = 4096; + req->data_len = NVME_IDENTIFY_DATA_SIZE; switch (cmd->identify.cns) { case NVME_ID_CNS_NS: req->execute = nvmet_execute_identify_ns; @@ -515,6 +573,9 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) case NVME_ID_CNS_NS_ACTIVE_LIST: req->execute = nvmet_execute_identify_nslist; return 0; + case NVME_ID_CNS_NS_DESC_LIST: + req->execute = nvmet_execute_identify_desclist; + return 0; } break; case nvme_admin_abort_cmd: diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index be8c800078e2..a358ecd93e11 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -305,11 +305,41 @@ out_unlock: CONFIGFS_ATTR(nvmet_ns_, device_path); +static ssize_t nvmet_ns_device_uuid_show(struct config_item *item, char *page) +{ + return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->uuid); +} + +static ssize_t nvmet_ns_device_uuid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + struct nvmet_subsys *subsys = ns->subsys; + int ret = 0; + + + mutex_lock(&subsys->lock); + if (ns->enabled) { + ret = -EBUSY; + goto out_unlock; + } + + + if (uuid_parse(page, &ns->uuid)) + ret = -EINVAL; + +out_unlock: + mutex_unlock(&subsys->lock); + return ret ? ret : count; +} + static ssize_t nvmet_ns_device_nguid_show(struct config_item *item, char *page) { return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->nguid); } +CONFIGFS_ATTR(nvmet_ns_, device_uuid); + static ssize_t nvmet_ns_device_nguid_store(struct config_item *item, const char *page, size_t count) { @@ -379,6 +409,7 @@ CONFIGFS_ATTR(nvmet_ns_, enable); static struct configfs_attribute *nvmet_ns_attrs[] = { &nvmet_ns_attr_device_path, &nvmet_ns_attr_device_nguid, + &nvmet_ns_attr_device_uuid, &nvmet_ns_attr_enable, NULL, }; @@ -619,8 +650,45 @@ out_unlock: CONFIGFS_ATTR(nvmet_subsys_, attr_allow_any_host); +static ssize_t nvmet_subsys_version_show(struct config_item *item, + char *page) +{ + struct nvmet_subsys *subsys = to_subsys(item); + + if (NVME_TERTIARY(subsys->ver)) + return snprintf(page, PAGE_SIZE, "%d.%d.%d\n", + (int)NVME_MAJOR(subsys->ver), + (int)NVME_MINOR(subsys->ver), + (int)NVME_TERTIARY(subsys->ver)); + else + return snprintf(page, PAGE_SIZE, "%d.%d\n", + (int)NVME_MAJOR(subsys->ver), + (int)NVME_MINOR(subsys->ver)); +} + +static ssize_t nvmet_subsys_version_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + int major, minor, tertiary = 0; + int ret; + + + ret = sscanf(page, "%d.%d.%d\n", &major, &minor, &tertiary); + if (ret != 2 && ret != 3) + return -EINVAL; + + down_write(&nvmet_config_sem); + subsys->ver = NVME_VS(major, minor, tertiary); + up_write(&nvmet_config_sem); + + return count; +} +CONFIGFS_ATTR(nvmet_subsys_, version); + static struct configfs_attribute *nvmet_subsys_attrs[] = { &nvmet_subsys_attr_attr_allow_any_host, + &nvmet_subsys_attr_version, NULL, }; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index eb9399ac97cf..b5b4ac103748 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -380,6 +380,7 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) ns->nsid = nsid; ns->subsys = subsys; + uuid_gen(&ns->uuid); return ns; } @@ -926,7 +927,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, if (!subsys) return NULL; - subsys->ver = NVME_VS(1, 2, 1); /* NVMe 1.2.1 */ + subsys->ver = NVME_VS(1, 3, 0); /* NVMe 1.3.0 */ switch (type) { case NVME_NQN_NVME: diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 1aaf597e81fc..8f3b57b4c97b 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -53,7 +53,7 @@ static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr, e->portid = port->disc_addr.portid; /* we support only dynamic controllers */ e->cntlid = cpu_to_le16(NVME_CNTLID_DYNAMIC); - e->asqsz = cpu_to_le16(NVMF_AQ_DEPTH); + e->asqsz = cpu_to_le16(NVME_AQ_DEPTH); e->subtype = type; memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE); memcpy(e->traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE); @@ -185,7 +185,7 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } case nvme_admin_identify: - req->data_len = 4096; + req->data_len = NVME_IDENTIFY_DATA_SIZE; switch (cmd->identify.cns) { case NVME_ID_CNS_CTRL: req->execute = diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 2006fae61980..1e6dcc241b3c 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1164,18 +1164,24 @@ nvmet_fc_ls_create_association(struct nvmet_fc_tgtport *tgtport, memset(acc, 0, sizeof(*acc)); - if (iod->rqstdatalen < sizeof(struct fcnvme_ls_cr_assoc_rqst)) + /* + * FC-NVME spec changes. There are initiators sending different + * lengths as padding sizes for Create Association Cmd descriptor + * was incorrect. + * Accept anything of "minimum" length. Assume format per 1.15 + * spec (with HOSTID reduced to 16 bytes), ignore how long the + * trailing pad length is. + */ + if (iod->rqstdatalen < FCNVME_LSDESC_CRA_RQST_MINLEN) ret = VERR_CR_ASSOC_LEN; - else if (rqst->desc_list_len != - fcnvme_lsdesc_len( - sizeof(struct fcnvme_ls_cr_assoc_rqst))) + else if (rqst->desc_list_len < + cpu_to_be32(FCNVME_LSDESC_CRA_RQST_MIN_LISTLEN)) ret = VERR_CR_ASSOC_RQST_LEN; else if (rqst->assoc_cmd.desc_tag != cpu_to_be32(FCNVME_LSDESC_CREATE_ASSOC_CMD)) ret = VERR_CR_ASSOC_CMD; - else if (rqst->assoc_cmd.desc_len != - fcnvme_lsdesc_len( - sizeof(struct fcnvme_lsdesc_cr_assoc_cmd))) + else if (rqst->assoc_cmd.desc_len < + cpu_to_be32(FCNVME_LSDESC_CRA_CMD_DESC_MIN_DESCLEN)) ret = VERR_CR_ASSOC_CMD_LEN; else if (!rqst->assoc_cmd.ersp_ratio || (be16_to_cpu(rqst->assoc_cmd.ersp_ratio) >= @@ -2096,20 +2102,22 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, /* clear any response payload */ memset(&fod->rspiubuf, 0, sizeof(fod->rspiubuf)); + fod->data_sg = NULL; + fod->data_sg_cnt = 0; + ret = nvmet_req_init(&fod->req, &fod->queue->nvme_cq, &fod->queue->nvme_sq, &nvmet_fc_tgt_fcp_ops); - if (!ret) { /* bad SQE content or invalid ctrl state */ - nvmet_fc_abort_op(tgtport, fod); + if (!ret) { + /* bad SQE content or invalid ctrl state */ + /* nvmet layer has already called op done to send rsp. */ return; } /* keep a running counter of tail position */ atomic_inc(&fod->queue->sqtail); - fod->data_sg = NULL; - fod->data_sg_cnt = 0; if (fod->total_length) { ret = nvmet_fc_alloc_tgt_pgs(fod); if (ret) { diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 294a6611fb24..1bb9d5b311b1 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -569,7 +569,6 @@ fcloop_tgt_fcp_abort(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *tgt_fcpreq) { struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq); - int active; /* * mark aborted only in case there were 2 threads in transport @@ -577,7 +576,6 @@ fcloop_tgt_fcp_abort(struct nvmet_fc_target_port *tgtport, * after the abort request */ spin_lock(&tfcp_req->reqlock); - active = tfcp_req->active; tfcp_req->aborted = true; spin_unlock(&tfcp_req->reqlock); diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index c77940d80fc8..3b4d47a6abdb 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -21,7 +21,7 @@ static void nvmet_bio_done(struct bio *bio) struct nvmet_req *req = bio->bi_private; nvmet_req_complete(req, - bio->bi_error ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); + bio->bi_status ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); if (bio != &req->inline_bio) bio_put(bio); @@ -85,7 +85,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) bio_set_op_attrs(bio, op, op_flags); bio_chain(bio, prev); - cookie = submit_bio(prev); + submit_bio(prev); } sector += sg->length >> 9; @@ -145,7 +145,7 @@ static void nvmet_execute_discard(struct nvmet_req *req) bio->bi_private = req; bio->bi_end_io = nvmet_bio_done; if (status) { - bio->bi_error = -EIO; + bio->bi_status = BLK_STS_IOERR; bio_endio(bio); } else { submit_bio(bio); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index e503cfff0337..717ed7ddb2f6 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -21,8 +21,6 @@ #include "../host/nvme.h" #include "../host/fabrics.h" -#define NVME_LOOP_AQ_DEPTH 256 - #define NVME_LOOP_MAX_SEGMENTS 256 /* @@ -31,7 +29,7 @@ */ #define NVME_LOOP_NR_AEN_COMMANDS 1 #define NVME_LOOP_AQ_BLKMQ_DEPTH \ - (NVME_LOOP_AQ_DEPTH - NVME_LOOP_NR_AEN_COMMANDS) + (NVME_AQ_DEPTH - NVME_LOOP_NR_AEN_COMMANDS) struct nvme_loop_iod { struct nvme_request nvme_req; @@ -45,21 +43,17 @@ struct nvme_loop_iod { }; struct nvme_loop_ctrl { - spinlock_t lock; struct nvme_loop_queue *queues; - u32 queue_count; struct blk_mq_tag_set admin_tag_set; struct list_head list; - u64 cap; struct blk_mq_tag_set tag_set; struct nvme_loop_iod async_event_iod; struct nvme_ctrl ctrl; struct nvmet_ctrl *target_ctrl; struct work_struct delete_work; - struct work_struct reset_work; }; static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl) @@ -151,7 +145,7 @@ nvme_loop_timeout(struct request *rq, bool reserved) struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(rq); /* queue error recovery */ - schedule_work(&iod->queue->ctrl->reset_work); + nvme_reset_ctrl(&iod->queue->ctrl->ctrl); /* fail with DNR on admin cmd timeout */ nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR; @@ -159,17 +153,17 @@ nvme_loop_timeout(struct request *rq, bool reserved) return BLK_EH_HANDLED; } -static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, +static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct nvme_ns *ns = hctx->queue->queuedata; struct nvme_loop_queue *queue = hctx->driver_data; struct request *req = bd->rq; struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); - int ret; + blk_status_t ret; ret = nvme_setup_cmd(ns, req, &iod->cmd); - if (ret != BLK_MQ_RQ_QUEUE_OK) + if (ret) return ret; iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; @@ -179,16 +173,15 @@ static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, nvme_cleanup_cmd(req); blk_mq_start_request(req); nvme_loop_queue_response(&iod->req); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } if (blk_rq_bytes(req)) { iod->sg_table.sgl = iod->first_sgl; - ret = sg_alloc_table_chained(&iod->sg_table, + if (sg_alloc_table_chained(&iod->sg_table, blk_rq_nr_phys_segments(req), - iod->sg_table.sgl); - if (ret) - return BLK_MQ_RQ_QUEUE_BUSY; + iod->sg_table.sgl)) + return BLK_STS_RESOURCE; iod->req.sg = iod->sg_table.sgl; iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); @@ -197,7 +190,7 @@ static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(req); schedule_work(&iod->work); - return BLK_MQ_RQ_QUEUE_OK; + return BLK_STS_OK; } static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx) @@ -234,15 +227,10 @@ static int nvme_loop_init_request(struct blk_mq_tag_set *set, struct request *req, unsigned int hctx_idx, unsigned int numa_node) { - return nvme_loop_init_iod(set->driver_data, blk_mq_rq_to_pdu(req), - hctx_idx + 1); -} + struct nvme_loop_ctrl *ctrl = set->driver_data; -static int nvme_loop_init_admin_request(struct blk_mq_tag_set *set, - struct request *req, unsigned int hctx_idx, - unsigned int numa_node) -{ - return nvme_loop_init_iod(set->driver_data, blk_mq_rq_to_pdu(req), 0); + return nvme_loop_init_iod(ctrl, blk_mq_rq_to_pdu(req), + (set == &ctrl->tag_set) ? hctx_idx + 1 : 0); } static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, @@ -251,7 +239,7 @@ static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, struct nvme_loop_ctrl *ctrl = data; struct nvme_loop_queue *queue = &ctrl->queues[hctx_idx + 1]; - BUG_ON(hctx_idx >= ctrl->queue_count); + BUG_ON(hctx_idx >= ctrl->ctrl.queue_count); hctx->driver_data = queue; return 0; @@ -280,7 +268,7 @@ static const struct blk_mq_ops nvme_loop_mq_ops = { static const struct blk_mq_ops nvme_loop_admin_mq_ops = { .queue_rq = nvme_loop_queue_rq, .complete = nvme_loop_complete_rq, - .init_request = nvme_loop_init_admin_request, + .init_request = nvme_loop_init_request, .init_hctx = nvme_loop_init_admin_hctx, .timeout = nvme_loop_timeout, }; @@ -317,7 +305,7 @@ static void nvme_loop_destroy_io_queues(struct nvme_loop_ctrl *ctrl) { int i; - for (i = 1; i < ctrl->queue_count; i++) + for (i = 1; i < ctrl->ctrl.queue_count; i++) nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); } @@ -340,7 +328,7 @@ static int nvme_loop_init_io_queues(struct nvme_loop_ctrl *ctrl) if (ret) goto out_destroy_queues; - ctrl->queue_count++; + ctrl->ctrl.queue_count++; } return 0; @@ -354,7 +342,7 @@ static int nvme_loop_connect_io_queues(struct nvme_loop_ctrl *ctrl) { int i, ret; - for (i = 1; i < ctrl->queue_count; i++) { + for (i = 1; i < ctrl->ctrl.queue_count; i++) { ret = nvmf_connect_io_queue(&ctrl->ctrl, i); if (ret) return ret; @@ -382,7 +370,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) error = nvmet_sq_init(&ctrl->queues[0].nvme_sq); if (error) return error; - ctrl->queue_count = 1; + ctrl->ctrl.queue_count = 1; error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); if (error) @@ -398,7 +386,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) if (error) goto out_cleanup_queue; - error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap); + error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap); if (error) { dev_err(ctrl->ctrl.device, "prop_get NVME_REG_CAP failed\n"); @@ -406,9 +394,9 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) } ctrl->ctrl.sqsize = - min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->ctrl.sqsize); + min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); - error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); + error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); if (error) goto out_cleanup_queue; @@ -419,8 +407,6 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) if (error) goto out_cleanup_queue; - nvme_start_keep_alive(&ctrl->ctrl); - return 0; out_cleanup_queue: @@ -434,9 +420,7 @@ out_free_sq: static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) { - nvme_stop_keep_alive(&ctrl->ctrl); - - if (ctrl->queue_count > 1) { + if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); @@ -446,9 +430,10 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) if (ctrl->ctrl.state == NVME_CTRL_LIVE) nvme_shutdown_ctrl(&ctrl->ctrl); - blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, &ctrl->ctrl); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_loop_destroy_admin_queue(ctrl); } @@ -457,8 +442,10 @@ static void nvme_loop_del_ctrl_work(struct work_struct *work) struct nvme_loop_ctrl *ctrl = container_of(work, struct nvme_loop_ctrl, delete_work); - nvme_uninit_ctrl(&ctrl->ctrl); + nvme_stop_ctrl(&ctrl->ctrl); + nvme_remove_namespaces(&ctrl->ctrl); nvme_loop_shutdown_ctrl(ctrl); + nvme_uninit_ctrl(&ctrl->ctrl); nvme_put_ctrl(&ctrl->ctrl); } @@ -467,7 +454,7 @@ static int __nvme_loop_del_ctrl(struct nvme_loop_ctrl *ctrl) if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) return -EBUSY; - if (!schedule_work(&ctrl->delete_work)) + if (!queue_work(nvme_wq, &ctrl->delete_work)) return -EBUSY; return 0; @@ -501,11 +488,12 @@ static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) static void nvme_loop_reset_ctrl_work(struct work_struct *work) { - struct nvme_loop_ctrl *ctrl = container_of(work, - struct nvme_loop_ctrl, reset_work); + struct nvme_loop_ctrl *ctrl = + container_of(work, struct nvme_loop_ctrl, ctrl.reset_work); bool changed; int ret; + nvme_stop_ctrl(&ctrl->ctrl); nvme_loop_shutdown_ctrl(ctrl); ret = nvme_loop_configure_admin_queue(ctrl); @@ -520,13 +508,13 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work) if (ret) goto out_destroy_io; + blk_mq_update_nr_hw_queues(&ctrl->tag_set, + ctrl->ctrl.queue_count - 1); + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); - nvme_queue_scan(&ctrl->ctrl); - nvme_queue_async_events(&ctrl->ctrl); - - nvme_start_queues(&ctrl->ctrl); + nvme_start_ctrl(&ctrl->ctrl); return; @@ -540,21 +528,6 @@ out_disable: nvme_put_ctrl(&ctrl->ctrl); } -static int nvme_loop_reset_ctrl(struct nvme_ctrl *nctrl) -{ - struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl); - - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) - return -EBUSY; - - if (!schedule_work(&ctrl->reset_work)) - return -EBUSY; - - flush_work(&ctrl->reset_work); - - return 0; -} - static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { .name = "loop", .module = THIS_MODULE, @@ -562,11 +535,9 @@ static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, - .reset_ctrl = nvme_loop_reset_ctrl, .free_ctrl = nvme_loop_free_ctrl, .submit_async_event = nvme_loop_submit_async_event, .delete_ctrl = nvme_loop_del_ctrl, - .get_subsysnqn = nvmf_get_subsysnqn, }; static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) @@ -586,7 +557,7 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) ctrl->tag_set.cmd_size = sizeof(struct nvme_loop_iod) + SG_CHUNK_SIZE * sizeof(struct scatterlist); ctrl->tag_set.driver_data = ctrl; - ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1; + ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1; ctrl->tag_set.timeout = NVME_IO_TIMEOUT; ctrl->ctrl.tagset = &ctrl->tag_set; @@ -629,15 +600,13 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, INIT_LIST_HEAD(&ctrl->list); INIT_WORK(&ctrl->delete_work, nvme_loop_del_ctrl_work); - INIT_WORK(&ctrl->reset_work, nvme_loop_reset_ctrl_work); + INIT_WORK(&ctrl->ctrl.reset_work, nvme_loop_reset_ctrl_work); ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops, 0 /* no quirks, we're perfect! */); if (ret) goto out_put_ctrl; - spin_lock_init(&ctrl->lock); - ret = -ENOMEM; ctrl->ctrl.sqsize = opts->queue_size - 1; @@ -680,10 +649,7 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, list_add_tail(&ctrl->list, &nvme_loop_ctrl_list); mutex_unlock(&nvme_loop_ctrl_mutex); - if (opts->nr_io_queues) { - nvme_queue_scan(&ctrl->ctrl); - nvme_queue_async_events(&ctrl->ctrl); - } + nvme_start_ctrl(&ctrl->ctrl); return &ctrl->ctrl; @@ -766,7 +732,7 @@ static void __exit nvme_loop_cleanup_module(void) __nvme_loop_del_ctrl(ctrl); mutex_unlock(&nvme_loop_ctrl_mutex); - flush_scheduled_work(); + flush_workqueue(nvme_wq); } module_init(nvme_loop_init_module); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index cfc5c7fb0ab7..747bbdb4f9c6 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -21,6 +21,7 @@ #include <linux/percpu-refcount.h> #include <linux/list.h> #include <linux/mutex.h> +#include <linux/uuid.h> #include <linux/nvme.h> #include <linux/configfs.h> #include <linux/rcupdate.h> @@ -46,6 +47,7 @@ struct nvmet_ns { u32 blksize_shift; loff_t size; u8 nguid[16]; + uuid_t uuid; bool enabled; struct nvmet_subsys *subsys; diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 9e45cde63376..56a4cba690b5 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1027,7 +1027,7 @@ nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn, queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1; queue->send_queue_size = le16_to_cpu(req->hrqsize); - if (!queue->host_qid && queue->recv_queue_size > NVMF_AQ_DEPTH) + if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH) return NVME_RDMA_CM_INVALID_HSQSIZE; /* XXX: Should we enforce some kind of max for IO queues? */ @@ -1307,53 +1307,44 @@ static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id, /** * nvme_rdma_device_removal() - Handle RDMA device removal + * @cm_id: rdma_cm id, used for nvmet port * @queue: nvmet rdma queue (cm id qp_context) - * @addr: nvmet address (cm_id context) * * DEVICE_REMOVAL event notifies us that the RDMA device is about - * to unplug so we should take care of destroying our RDMA resources. - * This event will be generated for each allocated cm_id. + * to unplug. Note that this event can be generated on a normal + * queue cm_id and/or a device bound listener cm_id (where in this + * case queue will be null). * - * Note that this event can be generated on a normal queue cm_id - * and/or a device bound listener cm_id (where in this case - * queue will be null). - * - * we claim ownership on destroying the cm_id. For queues we move - * the queue state to NVMET_RDMA_IN_DEVICE_REMOVAL and for port + * We registered an ib_client to handle device removal for queues, + * so we only need to handle the listening port cm_ids. In this case * we nullify the priv to prevent double cm_id destruction and destroying * the cm_id implicitely by returning a non-zero rc to the callout. */ static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id, struct nvmet_rdma_queue *queue) { - unsigned long flags; - - if (!queue) { - struct nvmet_port *port = cm_id->context; + struct nvmet_port *port; + if (queue) { /* - * This is a listener cm_id. Make sure that - * future remove_port won't invoke a double - * cm_id destroy. use atomic xchg to make sure - * we don't compete with remove_port. - */ - if (xchg(&port->priv, NULL) != cm_id) - return 0; - } else { - /* - * This is a queue cm_id. Make sure that - * release queue will not destroy the cm_id - * and schedule all ctrl queues removal (only - * if the queue is not disconnecting already). + * This is a queue cm_id. we have registered + * an ib_client to handle queues removal + * so don't interfear and just return. */ - spin_lock_irqsave(&queue->state_lock, flags); - if (queue->state != NVMET_RDMA_Q_DISCONNECTING) - queue->state = NVMET_RDMA_IN_DEVICE_REMOVAL; - spin_unlock_irqrestore(&queue->state_lock, flags); - nvmet_rdma_queue_disconnect(queue); - flush_scheduled_work(); + return 0; } + port = cm_id->context; + + /* + * This is a listener cm_id. Make sure that + * future remove_port won't invoke a double + * cm_id destroy. use atomic xchg to make sure + * we don't compete with remove_port. + */ + if (xchg(&port->priv, NULL) != cm_id) + return 0; + /* * We need to return 1 so that the core will destroy * it's own ID. What a great API design.. @@ -1519,9 +1510,51 @@ static struct nvmet_fabrics_ops nvmet_rdma_ops = { .delete_ctrl = nvmet_rdma_delete_ctrl, }; +static void nvmet_rdma_add_one(struct ib_device *ib_device) +{ +} + +static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data) +{ + struct nvmet_rdma_queue *queue; + + /* Device is being removed, delete all queues using this device */ + mutex_lock(&nvmet_rdma_queue_mutex); + list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) { + if (queue->dev->device != ib_device) + continue; + + pr_info("Removing queue %d\n", queue->idx); + __nvmet_rdma_queue_disconnect(queue); + } + mutex_unlock(&nvmet_rdma_queue_mutex); + + flush_scheduled_work(); +} + +static struct ib_client nvmet_rdma_ib_client = { + .name = "nvmet_rdma", + .add = nvmet_rdma_add_one, + .remove = nvmet_rdma_remove_one +}; + static int __init nvmet_rdma_init(void) { - return nvmet_register_transport(&nvmet_rdma_ops); + int ret; + + ret = ib_register_client(&nvmet_rdma_ib_client); + if (ret) + return ret; + + ret = nvmet_register_transport(&nvmet_rdma_ops); + if (ret) + goto err_ib_client; + + return 0; + +err_ib_client: + ib_unregister_client(&nvmet_rdma_ib_client); + return ret; } static void __exit nvmet_rdma_exit(void) @@ -1544,6 +1577,7 @@ static void __exit nvmet_rdma_exit(void) mutex_unlock(&nvmet_rdma_queue_mutex); flush_scheduled_work(); + ib_unregister_client(&nvmet_rdma_ib_client); ida_destroy(&nvmet_rdma_queue_ida); } |