diff options
Diffstat (limited to 'drivers/nvme/host')
-rw-r--r-- | drivers/nvme/host/Kconfig | 3 | ||||
-rw-r--r-- | drivers/nvme/host/Makefile | 2 | ||||
-rw-r--r-- | drivers/nvme/host/core.c | 1142 | ||||
-rw-r--r-- | drivers/nvme/host/fabrics.c | 66 | ||||
-rw-r--r-- | drivers/nvme/host/fabrics.h | 13 | ||||
-rw-r--r-- | drivers/nvme/host/fc.c | 55 | ||||
-rw-r--r-- | drivers/nvme/host/ioctl.c | 498 | ||||
-rw-r--r-- | drivers/nvme/host/lightnvm.c | 12 | ||||
-rw-r--r-- | drivers/nvme/host/multipath.c | 172 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 86 | ||||
-rw-r--r-- | drivers/nvme/host/pci.c | 33 | ||||
-rw-r--r-- | drivers/nvme/host/rdma.c | 16 | ||||
-rw-r--r-- | drivers/nvme/host/tcp.c | 25 | ||||
-rw-r--r-- | drivers/nvme/host/zns.c | 4 |
14 files changed, 1260 insertions, 867 deletions
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index a44d49d63968..494675aeaaad 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -71,7 +71,8 @@ config NVME_FC config NVME_TCP tristate "NVM Express over Fabrics TCP host driver" depends on INET - depends on BLK_DEV_NVME + depends on BLOCK + select NVME_CORE select NVME_FABRICS select CRYPTO select CRYPTO_CRC32C diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index d7f6a87687b8..cbc509784b2e 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o obj-$(CONFIG_NVME_FC) += nvme-fc.o obj-$(CONFIG_NVME_TCP) += nvme-tcp.o -nvme-core-y := core.o +nvme-core-y := core.o ioctl.o nvme-core-$(CONFIG_TRACING) += trace.o nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o nvme-core-$(CONFIG_NVM) += lightnvm.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0896e21642be..66973bb56305 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -89,6 +89,10 @@ static dev_t nvme_ctrl_base_chr_devt; static struct class *nvme_class; static struct class *nvme_subsys_class; +static DEFINE_IDA(nvme_ns_chr_minor_ida); +static dev_t nvme_ns_chr_devt; +static struct class *nvme_ns_chr_class; + static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid); @@ -112,7 +116,7 @@ static void nvme_set_queue_dying(struct nvme_ns *ns) set_capacity_and_notify(ns->disk, 0); } -static void nvme_queue_scan(struct nvme_ctrl *ctrl) +void nvme_queue_scan(struct nvme_ctrl *ctrl) { /* * Only new queue scan work when admin and IO queues are both alive @@ -179,7 +183,7 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_reset_ctrl); -static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) +int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) { int ret; @@ -549,7 +553,12 @@ static void nvme_free_ns_head(struct kref *ref) kfree(head); } -static void nvme_put_ns_head(struct nvme_ns_head *head) +bool nvme_tryget_ns_head(struct nvme_ns_head *head) +{ + return kref_get_unless_zero(&head->ref); +} + +void nvme_put_ns_head(struct nvme_ns_head *head) { kref_put(&head->ref, nvme_free_ns_head); } @@ -567,6 +576,11 @@ static void nvme_free_ns(struct kref *kref) kfree(ns); } +static inline bool nvme_get_ns(struct nvme_ns *ns) +{ + return kref_get_unless_zero(&ns->kref); +} + void nvme_put_ns(struct nvme_ns *ns) { kref_put(&ns->kref, nvme_free_ns); @@ -575,11 +589,9 @@ EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU); static inline void nvme_clear_nvme_request(struct request *req) { - if (!(req->rq_flags & RQF_DONTPREP)) { - nvme_req(req)->retries = 0; - nvme_req(req)->flags = 0; - req->rq_flags |= RQF_DONTPREP; - } + nvme_req(req)->retries = 0; + nvme_req(req)->flags = 0; + req->rq_flags |= RQF_DONTPREP; } static inline unsigned int nvme_req_op(struct nvme_command *cmd) @@ -595,9 +607,12 @@ static inline void nvme_init_request(struct request *req, else /* no queuedata implies admin queue */ req->timeout = NVME_ADMIN_TIMEOUT; + /* passthru commands should let the driver set the SGL flags */ + cmd->common.flags &= ~NVME_CMD_SGL_ALL; + req->cmd_flags |= REQ_FAILFAST_DRIVER; nvme_clear_nvme_request(req); - nvme_req(req)->cmd = cmd; + memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd)); } struct request *nvme_alloc_request(struct request_queue *q, @@ -624,6 +639,66 @@ static struct request *nvme_alloc_request_qid(struct request_queue *q, return req; } +/* + * For something we're not in a state to send to the device the default action + * is to busy it and retry it after the controller state is recovered. However, + * if the controller is deleting or if anything is marked for failfast or + * nvme multipath it is immediately failed. + * + * Note: commands used to initialize the controller will be marked for failfast. + * Note: nvme cli/ioctl commands are marked for failfast. + */ +blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl, + struct request *rq) +{ + if (ctrl->state != NVME_CTRL_DELETING_NOIO && + ctrl->state != NVME_CTRL_DEAD && + !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) && + !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) + return BLK_STS_RESOURCE; + return nvme_host_path_error(rq); +} +EXPORT_SYMBOL_GPL(nvme_fail_nonready_command); + +bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq, + bool queue_live) +{ + struct nvme_request *req = nvme_req(rq); + + /* + * currently we have a problem sending passthru commands + * on the admin_q if the controller is not LIVE because we can't + * make sure that they are going out after the admin connect, + * controller enable and/or other commands in the initialization + * sequence. until the controller will be LIVE, fail with + * BLK_STS_RESOURCE so that they will be rescheduled. + */ + if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD)) + return false; + + if (ctrl->ops->flags & NVME_F_FABRICS) { + /* + * Only allow commands on a live queue, except for the connect + * command, which is require to set the queue live in the + * appropinquate states. + */ + switch (ctrl->state) { + case NVME_CTRL_CONNECTING: + if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) && + req->cmd->fabrics.fctype == nvme_fabrics_type_connect) + return true; + break; + default: + break; + case NVME_CTRL_DEAD: + return false; + } + } + + return queue_live; +} +EXPORT_SYMBOL_GPL(__nvme_check_ready); + static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) { struct nvme_command c; @@ -726,14 +801,6 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9; } -static void nvme_setup_passthrough(struct request *req, - struct nvme_command *cmd) -{ - memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd)); - /* passthru commands should let the driver set the SGL flags */ - cmd->common.flags &= ~NVME_CMD_SGL_ALL; -} - static inline void nvme_setup_flush(struct nvme_ns *ns, struct nvme_command *cmnd) { @@ -888,18 +955,20 @@ void nvme_cleanup_cmd(struct request *req) } EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); -blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, - struct nvme_command *cmd) +blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req) { + struct nvme_command *cmd = nvme_req(req)->cmd; blk_status_t ret = BLK_STS_OK; - nvme_clear_nvme_request(req); + if (!(req->rq_flags & RQF_DONTPREP)) { + nvme_clear_nvme_request(req); + memset(cmd, 0, sizeof(*cmd)); + } - memset(cmd, 0, sizeof(*cmd)); switch (req_op(req)) { case REQ_OP_DRV_IN: case REQ_OP_DRV_OUT: - nvme_setup_passthrough(req, cmd); + /* these are setup prior to execution in nvme_init_request() */ break; case REQ_OP_FLUSH: nvme_setup_flush(ns, cmd); @@ -1020,40 +1089,6 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, } EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); -static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf, - unsigned len, u32 seed, bool write) -{ - struct bio_integrity_payload *bip; - int ret = -ENOMEM; - void *buf; - - buf = kmalloc(len, GFP_KERNEL); - if (!buf) - goto out; - - ret = -EFAULT; - if (write && copy_from_user(buf, ubuf, len)) - goto out_free_meta; - - bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); - if (IS_ERR(bip)) { - ret = PTR_ERR(bip); - goto out_free_meta; - } - - bip->bip_iter.bi_size = len; - bip->bip_iter.bi_sector = seed; - ret = bio_integrity_add_page(bio, virt_to_page(buf), len, - offset_in_page(buf)); - if (ret == len) - return buf; - ret = -ENOMEM; -out_free_meta: - kfree(buf); -out: - return ERR_PTR(ret); -} - static u32 nvme_known_admin_effects(u8 opcode) { switch (opcode) { @@ -1076,9 +1111,9 @@ u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) if (ns->head->effects) effects = le32_to_cpu(ns->head->effects->iocs[opcode]); if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) - dev_warn(ctrl->device, - "IO command:%02x has unhandled effects:%08x\n", - opcode, effects); + dev_warn_once(ctrl->device, + "IO command:%02x has unhandled effects:%08x\n", + opcode, effects); return 0; } @@ -1120,7 +1155,7 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) mutex_unlock(&ctrl->scan_lock); } if (effects & NVME_CMD_EFFECTS_CCC) - nvme_init_identify(ctrl); + nvme_init_ctrl_finish(ctrl); if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { nvme_queue_scan(ctrl); flush_work(&ctrl->scan_work); @@ -1137,68 +1172,20 @@ void nvme_execute_passthru_rq(struct request *rq) effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); blk_execute_rq(disk, rq, 0); - nvme_passthru_end(ctrl, effects); + if (effects) /* nothing to be done for zero cmd effects */ + nvme_passthru_end(ctrl, effects); } EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); -static int nvme_submit_user_cmd(struct request_queue *q, - struct nvme_command *cmd, void __user *ubuffer, - unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, u64 *result, unsigned timeout) +/* + * Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1: + * + * The host should send Keep Alive commands at half of the Keep Alive Timeout + * accounting for transport roundtrip times [..]. + */ +static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl) { - bool write = nvme_is_write(cmd); - struct nvme_ns *ns = q->queuedata; - struct block_device *bdev = ns ? ns->disk->part0 : NULL; - struct request *req; - struct bio *bio = NULL; - void *meta = NULL; - int ret; - - req = nvme_alloc_request(q, cmd, 0); - if (IS_ERR(req)) - return PTR_ERR(req); - - if (timeout) - req->timeout = timeout; - nvme_req(req)->flags |= NVME_REQ_USERCMD; - - if (ubuffer && bufflen) { - ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, - GFP_KERNEL); - if (ret) - goto out; - bio = req->bio; - if (bdev) - bio_set_dev(bio, bdev); - if (bdev && meta_buffer && meta_len) { - meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, - meta_seed, write); - if (IS_ERR(meta)) { - ret = PTR_ERR(meta); - goto out_unmap; - } - req->cmd_flags |= REQ_INTEGRITY; - } - } - - nvme_execute_passthru_rq(req); - if (nvme_req(req)->flags & NVME_REQ_CANCELLED) - ret = -EINTR; - else - ret = nvme_req(req)->status; - if (result) - *result = le64_to_cpu(nvme_req(req)->result.u64); - if (meta && !ret && !write) { - if (copy_to_user(meta_buffer, meta, meta_len)) - ret = -EFAULT; - } - kfree(meta); - out_unmap: - if (bio) - blk_rq_unmap_user(bio); - out: - blk_mq_free_request(req); - return ret; + queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ / 2); } static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) @@ -1223,7 +1210,7 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) startka = true; spin_unlock_irqrestore(&ctrl->lock, flags); if (startka) - queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); + nvme_queue_keep_alive_work(ctrl); } static void nvme_keep_alive_work(struct work_struct *work) @@ -1237,7 +1224,7 @@ static void nvme_keep_alive_work(struct work_struct *work) dev_dbg(ctrl->device, "reschedule traffic based keep-alive timer\n"); ctrl->comp_seen = false; - queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); + nvme_queue_keep_alive_work(ctrl); return; } @@ -1260,7 +1247,7 @@ static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) if (unlikely(ctrl->kato == 0)) return; - queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); + nvme_queue_keep_alive_work(ctrl); } void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) @@ -1536,170 +1523,6 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl) } /* - * Convert integer values from ioctl structures to user pointers, silently - * ignoring the upper bits in the compat case to match behaviour of 32-bit - * kernels. - */ -static void __user *nvme_to_user_ptr(uintptr_t ptrval) -{ - if (in_compat_syscall()) - ptrval = (compat_uptr_t)ptrval; - return (void __user *)ptrval; -} - -static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) -{ - struct nvme_user_io io; - struct nvme_command c; - unsigned length, meta_len; - void __user *metadata; - - if (copy_from_user(&io, uio, sizeof(io))) - return -EFAULT; - if (io.flags) - return -EINVAL; - - switch (io.opcode) { - case nvme_cmd_write: - case nvme_cmd_read: - case nvme_cmd_compare: - break; - default: - return -EINVAL; - } - - length = (io.nblocks + 1) << ns->lba_shift; - - if ((io.control & NVME_RW_PRINFO_PRACT) && - ns->ms == sizeof(struct t10_pi_tuple)) { - /* - * Protection information is stripped/inserted by the - * controller. - */ - if (nvme_to_user_ptr(io.metadata)) - return -EINVAL; - meta_len = 0; - metadata = NULL; - } else { - meta_len = (io.nblocks + 1) * ns->ms; - metadata = nvme_to_user_ptr(io.metadata); - } - - if (ns->features & NVME_NS_EXT_LBAS) { - length += meta_len; - meta_len = 0; - } else if (meta_len) { - if ((io.metadata & 3) || !io.metadata) - return -EINVAL; - } - - memset(&c, 0, sizeof(c)); - c.rw.opcode = io.opcode; - c.rw.flags = io.flags; - c.rw.nsid = cpu_to_le32(ns->head->ns_id); - c.rw.slba = cpu_to_le64(io.slba); - c.rw.length = cpu_to_le16(io.nblocks); - c.rw.control = cpu_to_le16(io.control); - c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); - c.rw.reftag = cpu_to_le32(io.reftag); - c.rw.apptag = cpu_to_le16(io.apptag); - c.rw.appmask = cpu_to_le16(io.appmask); - - return nvme_submit_user_cmd(ns->queue, &c, - nvme_to_user_ptr(io.addr), length, - metadata, meta_len, lower_32_bits(io.slba), NULL, 0); -} - -static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd __user *ucmd) -{ - struct nvme_passthru_cmd cmd; - struct nvme_command c; - unsigned timeout = 0; - u64 result; - int status; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&cmd, ucmd, sizeof(cmd))) - return -EFAULT; - if (cmd.flags) - return -EINVAL; - - memset(&c, 0, sizeof(c)); - c.common.opcode = cmd.opcode; - c.common.flags = cmd.flags; - c.common.nsid = cpu_to_le32(cmd.nsid); - c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); - c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); - c.common.cdw10 = cpu_to_le32(cmd.cdw10); - c.common.cdw11 = cpu_to_le32(cmd.cdw11); - c.common.cdw12 = cpu_to_le32(cmd.cdw12); - c.common.cdw13 = cpu_to_le32(cmd.cdw13); - c.common.cdw14 = cpu_to_le32(cmd.cdw14); - c.common.cdw15 = cpu_to_le32(cmd.cdw15); - - if (cmd.timeout_ms) - timeout = msecs_to_jiffies(cmd.timeout_ms); - - status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, - nvme_to_user_ptr(cmd.addr), cmd.data_len, - nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &result, timeout); - - if (status >= 0) { - if (put_user(result, &ucmd->result)) - return -EFAULT; - } - - return status; -} - -static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd64 __user *ucmd) -{ - struct nvme_passthru_cmd64 cmd; - struct nvme_command c; - unsigned timeout = 0; - int status; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&cmd, ucmd, sizeof(cmd))) - return -EFAULT; - if (cmd.flags) - return -EINVAL; - - memset(&c, 0, sizeof(c)); - c.common.opcode = cmd.opcode; - c.common.flags = cmd.flags; - c.common.nsid = cpu_to_le32(cmd.nsid); - c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); - c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); - c.common.cdw10 = cpu_to_le32(cmd.cdw10); - c.common.cdw11 = cpu_to_le32(cmd.cdw11); - c.common.cdw12 = cpu_to_le32(cmd.cdw12); - c.common.cdw13 = cpu_to_le32(cmd.cdw13); - c.common.cdw14 = cpu_to_le32(cmd.cdw14); - c.common.cdw15 = cpu_to_le32(cmd.cdw15); - - if (cmd.timeout_ms) - timeout = msecs_to_jiffies(cmd.timeout_ms); - - status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, - nvme_to_user_ptr(cmd.addr), cmd.data_len, - nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &cmd.result, timeout); - - if (status >= 0) { - if (put_user(cmd.result, &ucmd->result)) - return -EFAULT; - } - - return status; -} - -/* * Issue ioctl requests on the first available path. Note that unlike normal * block layer requests we will not retry failed request on another controller. */ @@ -1729,137 +1552,13 @@ void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) srcu_read_unlock(&head->srcu, idx); } -static bool is_ctrl_ioctl(unsigned int cmd) -{ - if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) - return true; - if (is_sed_ioctl(cmd)) - return true; - return false; -} - -static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp, - struct nvme_ns_head *head, - int srcu_idx) -{ - struct nvme_ctrl *ctrl = ns->ctrl; - int ret; - - nvme_get_ctrl(ns->ctrl); - nvme_put_ns_from_disk(head, srcu_idx); - - switch (cmd) { - case NVME_IOCTL_ADMIN_CMD: - ret = nvme_user_cmd(ctrl, NULL, argp); - break; - case NVME_IOCTL_ADMIN64_CMD: - ret = nvme_user_cmd64(ctrl, NULL, argp); - break; - default: - ret = sed_ioctl(ctrl->opal_dev, cmd, argp); - break; - } - nvme_put_ctrl(ctrl); - return ret; -} - -static int nvme_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - struct nvme_ns_head *head = NULL; - void __user *argp = (void __user *)arg; - struct nvme_ns *ns; - int srcu_idx, ret; - - ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); - if (unlikely(!ns)) - return -EWOULDBLOCK; - - /* - * Handle ioctls that apply to the controller instead of the namespace - * seperately and drop the ns SRCU reference early. This avoids a - * deadlock when deleting namespaces using the passthrough interface. - */ - if (is_ctrl_ioctl(cmd)) - return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); - - switch (cmd) { - case NVME_IOCTL_ID: - force_successful_syscall_return(); - ret = ns->head->ns_id; - break; - case NVME_IOCTL_IO_CMD: - ret = nvme_user_cmd(ns->ctrl, ns, argp); - break; - case NVME_IOCTL_SUBMIT_IO: - ret = nvme_submit_io(ns, argp); - break; - case NVME_IOCTL_IO64_CMD: - ret = nvme_user_cmd64(ns->ctrl, ns, argp); - break; - default: - if (ns->ndev) - ret = nvme_nvm_ioctl(ns, cmd, arg); - else - ret = -ENOTTY; - } - - nvme_put_ns_from_disk(head, srcu_idx); - return ret; -} - -#ifdef CONFIG_COMPAT -struct nvme_user_io32 { - __u8 opcode; - __u8 flags; - __u16 control; - __u16 nblocks; - __u16 rsvd; - __u64 metadata; - __u64 addr; - __u64 slba; - __u32 dsmgmt; - __u32 reftag; - __u16 apptag; - __u16 appmask; -} __attribute__((__packed__)); - -#define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32) - -static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - /* - * Corresponds to the difference of NVME_IOCTL_SUBMIT_IO - * between 32 bit programs and 64 bit kernel. - * The cause is that the results of sizeof(struct nvme_user_io), - * which is used to define NVME_IOCTL_SUBMIT_IO, - * are not same between 32 bit compiler and 64 bit compiler. - * NVME_IOCTL_SUBMIT_IO32 is for 64 bit kernel handling - * NVME_IOCTL_SUBMIT_IO issued from 32 bit programs. - * Other IOCTL numbers are same between 32 bit and 64 bit. - * So there is nothing to do regarding to other IOCTL numbers. - */ - if (cmd == NVME_IOCTL_SUBMIT_IO32) - return nvme_ioctl(bdev, mode, NVME_IOCTL_SUBMIT_IO, arg); - - return nvme_ioctl(bdev, mode, cmd, arg); -} -#else -#define nvme_compat_ioctl NULL -#endif /* CONFIG_COMPAT */ - -static int nvme_open(struct block_device *bdev, fmode_t mode) +static int nvme_ns_open(struct nvme_ns *ns) { - struct nvme_ns *ns = bdev->bd_disk->private_data; -#ifdef CONFIG_NVME_MULTIPATH /* should never be called due to GENHD_FL_HIDDEN */ - if (WARN_ON_ONCE(ns->head->disk)) + if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head))) goto fail; -#endif - if (!kref_get_unless_zero(&ns->kref)) + if (!nvme_get_ns(ns)) goto fail; if (!try_module_get(ns->ctrl->ops->module)) goto fail_put_ns; @@ -1872,15 +1571,24 @@ fail: return -ENXIO; } -static void nvme_release(struct gendisk *disk, fmode_t mode) +static void nvme_ns_release(struct nvme_ns *ns) { - struct nvme_ns *ns = disk->private_data; module_put(ns->ctrl->ops->module); nvme_put_ns(ns); } -static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int nvme_open(struct block_device *bdev, fmode_t mode) +{ + return nvme_ns_open(bdev->bd_disk->private_data); +} + +static void nvme_release(struct gendisk *disk, fmode_t mode) +{ + nvme_ns_release(disk->private_data); +} + +int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) { /* some standard values */ geo->heads = 1 << 6; @@ -1929,7 +1637,7 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) struct request_queue *queue = disk->queue; u32 size = queue_logical_block_size(queue); - if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) { + if (ctrl->max_discard_sectors == 0) { blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue); return; } @@ -1947,27 +1655,13 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue)) return; - blk_queue_max_discard_sectors(queue, UINT_MAX); - blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES); + blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors); + blk_queue_max_discard_segments(queue, ctrl->max_discard_segments); if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); } -/* - * Even though NVMe spec explicitly states that MDTS is not applicable to the - * write-zeroes, we are cautious and limit the size to the controllers - * max_hw_sectors value, which is based on the MDTS field and possibly other - * limiting factors. - */ -static void nvme_config_write_zeroes(struct request_queue *q, - struct nvme_ctrl *ctrl) -{ - if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) && - !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES)) - blk_queue_max_write_zeroes_sectors(q, ctrl->max_hw_sectors); -} - static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) { return !uuid_is_null(&ids->uuid) || @@ -2137,7 +1831,8 @@ static void nvme_update_disk_info(struct gendisk *disk, set_capacity_and_notify(disk, capacity); nvme_config_discard(disk, ns); - nvme_config_write_zeroes(disk->queue, ns->ctrl); + blk_queue_max_write_zeroes_sectors(disk->queue, + ns->ctrl->max_zeroes_sectors); set_disk_ro(disk, (id->nsattr & NVME_NS_ATTR_RO) || test_bit(NVME_NS_FORCE_RO, &ns->flags)); @@ -2206,11 +1901,10 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) if (blk_queue_is_zoned(ns->queue)) { ret = nvme_revalidate_zones(ns); if (ret && !nvme_first_scan(ns->disk)) - return ret; + goto out; } -#ifdef CONFIG_NVME_MULTIPATH - if (ns->head->disk) { + if (nvme_ns_head_multipath(ns->head)) { blk_mq_freeze_queue(ns->head->disk->queue); nvme_update_disk_info(ns->head->disk, ns, id); blk_stack_limits(&ns->head->disk->queue->limits, @@ -2218,11 +1912,19 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) blk_queue_update_readahead(ns->head->disk->queue); blk_mq_unfreeze_queue(ns->head->disk->queue); } -#endif return 0; out_unfreeze: blk_mq_unfreeze_queue(ns->disk->queue); +out: + /* + * If probing fails due an unsupported feature, hide the block device, + * but still allow other access. + */ + if (ret == -ENODEV) { + ns->disk->flags |= GENHD_FL_HIDDEN; + ret = 0; + } return ret; } @@ -2303,22 +2005,25 @@ static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, enum pr_type type, bool abort) { u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1); + return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); } static int nvme_pr_clear(struct block_device *bdev, u64 key) { u32 cdw10 = 1 | (key ? 1 << 3 : 0); + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register); } static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) { u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0); + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); } -static const struct pr_ops nvme_pr_ops = { +const struct pr_ops nvme_pr_ops = { .pr_register = nvme_pr_register, .pr_reserve = nvme_pr_reserve, .pr_release = nvme_pr_release, @@ -2351,7 +2056,6 @@ EXPORT_SYMBOL_GPL(nvme_sec_submit); static const struct block_device_operations nvme_bdev_ops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, - .compat_ioctl = nvme_compat_ioctl, .open = nvme_open, .release = nvme_release, .getgeo = nvme_getgeo, @@ -2359,34 +2063,6 @@ static const struct block_device_operations nvme_bdev_ops = { .pr_ops = &nvme_pr_ops, }; -#ifdef CONFIG_NVME_MULTIPATH -static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) -{ - struct nvme_ns_head *head = bdev->bd_disk->private_data; - - if (!kref_get_unless_zero(&head->ref)) - return -ENXIO; - return 0; -} - -static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) -{ - nvme_put_ns_head(disk->private_data); -} - -const struct block_device_operations nvme_ns_head_ops = { - .owner = THIS_MODULE, - .submit_bio = nvme_ns_head_submit_bio, - .open = nvme_ns_head_open, - .release = nvme_ns_head_release, - .ioctl = nvme_ioctl, - .compat_ioctl = nvme_compat_ioctl, - .getgeo = nvme_getgeo, - .report_zones = nvme_report_zones, - .pr_ops = &nvme_pr_ops, -}; -#endif /* CONFIG_NVME_MULTIPATH */ - static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) { unsigned long timeout = @@ -2541,28 +2217,28 @@ static int nvme_configure_acre(struct nvme_ctrl *ctrl) return ret; } +/* + * APST (Autonomous Power State Transition) lets us program a table of power + * state transitions that the controller will perform automatically. + * We configure it with a simple heuristic: we are willing to spend at most 2% + * of the time transitioning between power states. Therefore, when running in + * any given state, we will enter the next lower-power non-operational state + * after waiting 50 * (enlat + exlat) microseconds, as long as that state's exit + * latency is under the requested maximum latency. + * + * We will not autonomously enter any non-operational state for which the total + * latency exceeds ps_max_latency_us. + * + * Users can set ps_max_latency_us to zero to turn off APST. + */ static int nvme_configure_apst(struct nvme_ctrl *ctrl) { - /* - * APST (Autonomous Power State Transition) lets us program a - * table of power state transitions that the controller will - * perform automatically. We configure it with a simple - * heuristic: we are willing to spend at most 2% of the time - * transitioning between power states. Therefore, when running - * in any given state, we will enter the next lower-power - * non-operational state after waiting 50 * (enlat + exlat) - * microseconds, as long as that state's exit latency is under - * the requested maximum latency. - * - * We will not autonomously enter any non-operational state for - * which the total latency exceeds ps_max_latency_us. Users - * can set ps_max_latency_us to zero to turn off APST. - */ - - unsigned apste; struct nvme_feat_auto_pst *table; + unsigned apste = 0; u64 max_lat_us = 0; + __le64 target = 0; int max_ps = -1; + int state; int ret; /* @@ -2583,83 +2259,72 @@ static int nvme_configure_apst(struct nvme_ctrl *ctrl) if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) { /* Turn off APST. */ - apste = 0; dev_dbg(ctrl->device, "APST disabled\n"); - } else { - __le64 target = cpu_to_le64(0); - int state; - - /* - * Walk through all states from lowest- to highest-power. - * According to the spec, lower-numbered states use more - * power. NPSS, despite the name, is the index of the - * lowest-power state, not the number of states. - */ - for (state = (int)ctrl->npss; state >= 0; state--) { - u64 total_latency_us, exit_latency_us, transition_ms; - - if (target) - table->entries[state] = target; - - /* - * Don't allow transitions to the deepest state - * if it's quirked off. - */ - if (state == ctrl->npss && - (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) - continue; - - /* - * Is this state a useful non-operational state for - * higher-power states to autonomously transition to? - */ - if (!(ctrl->psd[state].flags & - NVME_PS_FLAGS_NON_OP_STATE)) - continue; - - exit_latency_us = - (u64)le32_to_cpu(ctrl->psd[state].exit_lat); - if (exit_latency_us > ctrl->ps_max_latency_us) - continue; + goto done; + } - total_latency_us = - exit_latency_us + - le32_to_cpu(ctrl->psd[state].entry_lat); + /* + * Walk through all states from lowest- to highest-power. + * According to the spec, lower-numbered states use more power. NPSS, + * despite the name, is the index of the lowest-power state, not the + * number of states. + */ + for (state = (int)ctrl->npss; state >= 0; state--) { + u64 total_latency_us, exit_latency_us, transition_ms; - /* - * This state is good. Use it as the APST idle - * target for higher power states. - */ - transition_ms = total_latency_us + 19; - do_div(transition_ms, 20); - if (transition_ms > (1 << 24) - 1) - transition_ms = (1 << 24) - 1; + if (target) + table->entries[state] = target; - target = cpu_to_le64((state << 3) | - (transition_ms << 8)); + /* + * Don't allow transitions to the deepest state if it's quirked + * off. + */ + if (state == ctrl->npss && + (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) + continue; - if (max_ps == -1) - max_ps = state; + /* + * Is this state a useful non-operational state for higher-power + * states to autonomously transition to? + */ + if (!(ctrl->psd[state].flags & NVME_PS_FLAGS_NON_OP_STATE)) + continue; - if (total_latency_us > max_lat_us) - max_lat_us = total_latency_us; - } + exit_latency_us = (u64)le32_to_cpu(ctrl->psd[state].exit_lat); + if (exit_latency_us > ctrl->ps_max_latency_us) + continue; - apste = 1; + total_latency_us = exit_latency_us + + le32_to_cpu(ctrl->psd[state].entry_lat); - if (max_ps == -1) { - dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n"); - } else { - dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n", - max_ps, max_lat_us, (int)sizeof(*table), table); - } + /* + * This state is good. Use it as the APST idle target for + * higher power states. + */ + transition_ms = total_latency_us + 19; + do_div(transition_ms, 20); + if (transition_ms > (1 << 24) - 1) + transition_ms = (1 << 24) - 1; + + target = cpu_to_le64((state << 3) | (transition_ms << 8)); + if (max_ps == -1) + max_ps = state; + if (total_latency_us > max_lat_us) + max_lat_us = total_latency_us; } + if (max_ps == -1) + dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n"); + else + dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n", + max_ps, max_lat_us, (int)sizeof(*table), table); + apste = 1; + +done: ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste, table, sizeof(*table), NULL); if (ret) dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret); - kfree(table); return ret; } @@ -2681,7 +2346,8 @@ static void nvme_set_latency_tolerance(struct device *dev, s32 val) if (ctrl->ps_max_latency_us != latency) { ctrl->ps_max_latency_us = latency; - nvme_configure_apst(ctrl); + if (ctrl->state == NVME_CTRL_LIVE) + nvme_configure_apst(ctrl); } } @@ -2854,8 +2520,8 @@ static ssize_t subsys_##field##_show(struct device *dev, \ { \ struct nvme_subsystem *subsys = \ container_of(dev, struct nvme_subsystem, dev); \ - return sprintf(buf, "%.*s\n", \ - (int)sizeof(subsys->field), subsys->field); \ + return sysfs_emit(buf, "%.*s\n", \ + (int)sizeof(subsys->field), subsys->field); \ } \ static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show); @@ -3038,28 +2704,74 @@ out: return 0; } -/* - * Initialize the cached copies of the Identify data and various controller - * register in our nvme_ctrl structure. This should be called as soon as - * the admin queue is fully up and running. - */ -int nvme_init_identify(struct nvme_ctrl *ctrl) +static inline u32 nvme_mps_to_sectors(struct nvme_ctrl *ctrl, u32 units) { - struct nvme_id_ctrl *id; - int ret, page_shift; - u32 max_hw_sectors; - bool prev_apst_enabled; + u32 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12, val; - ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); - if (ret) { - dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); - return ret; + if (check_shl_overflow(1U, units + page_shift - 9, &val)) + return UINT_MAX; + return val; +} + +static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) +{ + struct nvme_command c = { }; + struct nvme_id_ctrl_nvm *id; + int ret; + + if (ctrl->oncs & NVME_CTRL_ONCS_DSM) { + ctrl->max_discard_sectors = UINT_MAX; + ctrl->max_discard_segments = NVME_DSM_MAX_RANGES; + } else { + ctrl->max_discard_sectors = 0; + ctrl->max_discard_segments = 0; } - page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; - ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); - if (ctrl->vs >= NVME_VS(1, 1, 0)) - ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); + /* + * Even though NVMe spec explicitly states that MDTS is not applicable + * to the write-zeroes, we are cautious and limit the size to the + * controllers max_hw_sectors value, which is based on the MDTS field + * and possibly other limiting factors. + */ + if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) && + !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES)) + ctrl->max_zeroes_sectors = ctrl->max_hw_sectors; + else + ctrl->max_zeroes_sectors = 0; + + if (nvme_ctrl_limited_cns(ctrl)) + return 0; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return 0; + + c.identify.opcode = nvme_admin_identify; + c.identify.cns = NVME_ID_CNS_CS_CTRL; + c.identify.csi = NVME_CSI_NVM; + + ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id)); + if (ret) + goto free_data; + + if (id->dmrl) + ctrl->max_discard_segments = id->dmrl; + if (id->dmrsl) + ctrl->max_discard_sectors = le32_to_cpu(id->dmrsl); + if (id->wzsl) + ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl); + +free_data: + kfree(id); + return ret; +} + +static int nvme_init_identify(struct nvme_ctrl *ctrl) +{ + struct nvme_id_ctrl *id; + u32 max_hw_sectors; + bool prev_apst_enabled; + int ret; ret = nvme_identify_ctrl(ctrl, &id); if (ret) { @@ -3077,7 +2789,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->cntlid = le16_to_cpu(id->cntlid); if (!ctrl->identified) { - int i; + unsigned int i; ret = nvme_init_subsystem(ctrl, id); if (ret) @@ -3116,7 +2828,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) atomic_set(&ctrl->abort_limit, id->acl + 1); ctrl->vwc = id->vwc; if (id->mdts) - max_hw_sectors = 1 << (id->mdts + page_shift - 9); + max_hw_sectors = nvme_mps_to_sectors(ctrl, id->mdts); else max_hw_sectors = UINT_MAX; ctrl->max_hw_sectors = @@ -3189,21 +2901,52 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->hmmaxd = le16_to_cpu(id->hmmaxd); } - ret = nvme_mpath_init(ctrl, id); - kfree(id); - + ret = nvme_mpath_init_identify(ctrl, id); if (ret < 0) - return ret; + goto out_free; if (ctrl->apst_enabled && !prev_apst_enabled) dev_pm_qos_expose_latency_tolerance(ctrl->device); else if (!ctrl->apst_enabled && prev_apst_enabled) dev_pm_qos_hide_latency_tolerance(ctrl->device); +out_free: + kfree(id); + return ret; +} + +/* + * Initialize the cached copies of the Identify data and various controller + * register in our nvme_ctrl structure. This should be called as soon as + * the admin queue is fully up and running. + */ +int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) +{ + int ret; + + ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); + if (ret) { + dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); + return ret; + } + + ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); + + if (ctrl->vs >= NVME_VS(1, 1, 0)) + ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); + + ret = nvme_init_identify(ctrl); + if (ret) + return ret; + + ret = nvme_init_non_mdts_limits(ctrl); + if (ret < 0) + return ret; + ret = nvme_configure_apst(ctrl); if (ret < 0) return ret; - + ret = nvme_configure_timestamp(ctrl); if (ret < 0) return ret; @@ -3225,12 +2968,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->identified = true; return 0; - -out_free: - kfree(id); - return ret; } -EXPORT_SYMBOL_GPL(nvme_init_identify); +EXPORT_SYMBOL_GPL(nvme_init_ctrl_finish); static int nvme_dev_open(struct inode *inode, struct file *file) { @@ -3264,65 +3003,6 @@ static int nvme_dev_release(struct inode *inode, struct file *file) return 0; } -static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) -{ - struct nvme_ns *ns; - int ret; - - down_read(&ctrl->namespaces_rwsem); - if (list_empty(&ctrl->namespaces)) { - ret = -ENOTTY; - goto out_unlock; - } - - ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); - if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { - dev_warn(ctrl->device, - "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); - ret = -EINVAL; - goto out_unlock; - } - - dev_warn(ctrl->device, - "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); - kref_get(&ns->kref); - up_read(&ctrl->namespaces_rwsem); - - ret = nvme_user_cmd(ctrl, ns, argp); - nvme_put_ns(ns); - return ret; - -out_unlock: - up_read(&ctrl->namespaces_rwsem); - return ret; -} - -static long nvme_dev_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - struct nvme_ctrl *ctrl = file->private_data; - void __user *argp = (void __user *)arg; - - switch (cmd) { - case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ctrl, NULL, argp); - case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp); - case NVME_IOCTL_IO_CMD: - return nvme_dev_user_cmd(ctrl, argp); - case NVME_IOCTL_RESET: - dev_warn(ctrl->device, "resetting controller\n"); - return nvme_reset_ctrl_sync(ctrl); - case NVME_IOCTL_SUBSYS_RESET: - return nvme_reset_subsystem(ctrl); - case NVME_IOCTL_RESCAN: - nvme_queue_scan(ctrl); - return 0; - default: - return -ENOTTY; - } -} - static const struct file_operations nvme_dev_fops = { .owner = THIS_MODULE, .open = nvme_dev_open, @@ -3376,13 +3056,13 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, int model_len = sizeof(subsys->model); if (!uuid_is_null(&ids->uuid)) - return sprintf(buf, "uuid.%pU\n", &ids->uuid); + return sysfs_emit(buf, "uuid.%pU\n", &ids->uuid); if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) - return sprintf(buf, "eui.%16phN\n", ids->nguid); + return sysfs_emit(buf, "eui.%16phN\n", ids->nguid); if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) - return sprintf(buf, "eui.%8phN\n", ids->eui64); + return sysfs_emit(buf, "eui.%8phN\n", ids->eui64); while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' || subsys->serial[serial_len - 1] == '\0')) @@ -3391,7 +3071,7 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, subsys->model[model_len - 1] == '\0')) model_len--; - return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, + return sysfs_emit(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, serial_len, subsys->serial, model_len, subsys->model, head->ns_id); } @@ -3400,7 +3080,7 @@ static DEVICE_ATTR_RO(wwid); static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid); + return sysfs_emit(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid); } static DEVICE_ATTR_RO(nguid); @@ -3415,23 +3095,23 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, if (uuid_is_null(&ids->uuid)) { printk_ratelimited(KERN_WARNING "No UUID available providing old NGUID\n"); - return sprintf(buf, "%pU\n", ids->nguid); + return sysfs_emit(buf, "%pU\n", ids->nguid); } - return sprintf(buf, "%pU\n", &ids->uuid); + return sysfs_emit(buf, "%pU\n", &ids->uuid); } static DEVICE_ATTR_RO(uuid); static ssize_t eui_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64); + return sysfs_emit(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64); } static DEVICE_ATTR_RO(eui); static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id); + return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ns_id); } static DEVICE_ATTR_RO(nsid); @@ -3496,7 +3176,7 @@ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ - return sprintf(buf, "%.*s\n", \ + return sysfs_emit(buf, "%.*s\n", \ (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \ } \ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); @@ -3510,7 +3190,7 @@ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ - return sprintf(buf, "%d\n", ctrl->field); \ + return sysfs_emit(buf, "%d\n", ctrl->field); \ } \ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); @@ -3518,6 +3198,7 @@ nvme_show_int_function(cntlid); nvme_show_int_function(numa_node); nvme_show_int_function(queue_count); nvme_show_int_function(sqsize); +nvme_show_int_function(kato); static ssize_t nvme_sysfs_delete(struct device *dev, struct device_attribute *attr, const char *buf, @@ -3558,9 +3239,9 @@ static ssize_t nvme_sysfs_show_state(struct device *dev, if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) && state_name[ctrl->state]) - return sprintf(buf, "%s\n", state_name[ctrl->state]); + return sysfs_emit(buf, "%s\n", state_name[ctrl->state]); - return sprintf(buf, "unknown state\n"); + return sysfs_emit(buf, "unknown state\n"); } static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL); @@ -3612,9 +3293,9 @@ static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev, struct nvmf_ctrl_options *opts = ctrl->opts; if (ctrl->opts->max_reconnects == -1) - return sprintf(buf, "off\n"); - return sprintf(buf, "%d\n", - opts->max_reconnects * opts->reconnect_delay); + return sysfs_emit(buf, "off\n"); + return sysfs_emit(buf, "%d\n", + opts->max_reconnects * opts->reconnect_delay); } static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev, @@ -3628,7 +3309,7 @@ static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev, if (err) return -EINVAL; - else if (ctrl_loss_tmo < 0) + if (ctrl_loss_tmo < 0) opts->max_reconnects = -1; else opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, @@ -3644,8 +3325,8 @@ static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev, struct nvme_ctrl *ctrl = dev_get_drvdata(dev); if (ctrl->opts->reconnect_delay == -1) - return sprintf(buf, "off\n"); - return sprintf(buf, "%d\n", ctrl->opts->reconnect_delay); + return sysfs_emit(buf, "off\n"); + return sysfs_emit(buf, "%d\n", ctrl->opts->reconnect_delay); } static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev, @@ -3665,6 +3346,36 @@ static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev, static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store); +static ssize_t nvme_ctrl_fast_io_fail_tmo_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (ctrl->opts->fast_io_fail_tmo == -1) + return sysfs_emit(buf, "off\n"); + return sysfs_emit(buf, "%d\n", ctrl->opts->fast_io_fail_tmo); +} + +static ssize_t nvme_ctrl_fast_io_fail_tmo_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + int fast_io_fail_tmo, err; + + err = kstrtoint(buf, 10, &fast_io_fail_tmo); + if (err) + return -EINVAL; + + if (fast_io_fail_tmo < 0) + opts->fast_io_fail_tmo = -1; + else + opts->fast_io_fail_tmo = fast_io_fail_tmo; + return count; +} +static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR, + nvme_ctrl_fast_io_fail_tmo_show, nvme_ctrl_fast_io_fail_tmo_store); + static struct attribute *nvme_dev_attrs[] = { &dev_attr_reset_controller.attr, &dev_attr_rescan_controller.attr, @@ -3684,6 +3395,8 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_hostid.attr, &dev_attr_ctrl_loss_tmo.attr, &dev_attr_reconnect_delay.attr, + &dev_attr_fast_io_fail_tmo.attr, + &dev_attr_kato.attr, NULL }; @@ -3705,6 +3418,8 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, return 0; if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts) return 0; + if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts) + return 0; return a->mode; } @@ -3727,7 +3442,7 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys, lockdep_assert_held(&subsys->lock); list_for_each_entry(h, &subsys->nsheads, entry) { - if (h->ns_id == nsid && kref_get_unless_zero(&h->ref)) + if (h->ns_id == nsid && nvme_tryget_ns_head(h)) return h; } @@ -3750,6 +3465,68 @@ static int __nvme_check_ids(struct nvme_subsystem *subsys, return 0; } +void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device) +{ + cdev_device_del(cdev, cdev_device); + ida_simple_remove(&nvme_ns_chr_minor_ida, MINOR(cdev_device->devt)); +} + +int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, + const struct file_operations *fops, struct module *owner) +{ + int minor, ret; + + minor = ida_simple_get(&nvme_ns_chr_minor_ida, 0, 0, GFP_KERNEL); + if (minor < 0) + return minor; + cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor); + cdev_device->class = nvme_ns_chr_class; + device_initialize(cdev_device); + cdev_init(cdev, fops); + cdev->owner = owner; + ret = cdev_device_add(cdev, cdev_device); + if (ret) { + put_device(cdev_device); + ida_simple_remove(&nvme_ns_chr_minor_ida, minor); + } + return ret; +} + +static int nvme_ns_chr_open(struct inode *inode, struct file *file) +{ + return nvme_ns_open(container_of(inode->i_cdev, struct nvme_ns, cdev)); +} + +static int nvme_ns_chr_release(struct inode *inode, struct file *file) +{ + nvme_ns_release(container_of(inode->i_cdev, struct nvme_ns, cdev)); + return 0; +} + +static const struct file_operations nvme_ns_chr_fops = { + .owner = THIS_MODULE, + .open = nvme_ns_chr_open, + .release = nvme_ns_chr_release, + .unlocked_ioctl = nvme_ns_chr_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +static int nvme_add_ns_cdev(struct nvme_ns *ns) +{ + int ret; + + ns->cdev_device.parent = ns->ctrl->device; + ret = dev_set_name(&ns->cdev_device, "ng%dn%d", + ns->ctrl->instance, ns->head->instance); + if (ret) + return ret; + ret = nvme_cdev_add(&ns->cdev, &ns->cdev_device, &nvme_ns_chr_fops, + ns->ctrl->ops->module); + if (ret) + kfree_const(ns->cdev_device.kobj.name); + return ret; +} + static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, unsigned nsid, struct nvme_ns_ids *ids) { @@ -3855,7 +3632,8 @@ out_unlock: return ret; } -static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) +static int ns_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); @@ -3870,7 +3648,7 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) down_read(&ctrl->namespaces_rwsem); list_for_each_entry(ns, &ctrl->namespaces, list) { if (ns->head->ns_id == nsid) { - if (!kref_get_unless_zero(&ns->kref)) + if (!nvme_get_ns(ns)) continue; ret = ns; break; @@ -3889,8 +3667,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, struct nvme_ns *ns; struct gendisk *disk; struct nvme_id_ns *id; - char disk_name[DISK_NAME_LEN]; - int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT; + int node = ctrl->numa_node; if (nvme_identify_ns(ctrl, nsid, ids, &id)) return; @@ -3916,7 +3693,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, if (nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED)) goto out_free_queue; - nvme_set_disk_name(disk_name, ns, ctrl, &flags); disk = alloc_disk_node(0, node); if (!disk) @@ -3925,15 +3701,22 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, disk->fops = &nvme_bdev_ops; disk->private_data = ns; disk->queue = ns->queue; - disk->flags = flags; - memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); + disk->flags = GENHD_FL_EXT_DEVT; + /* + * Without the multipath code enabled, multiple controller per + * subsystems are visible as devices and thus we cannot use the + * subsystem instance. + */ + if (!nvme_mpath_set_disk_name(ns, disk->disk_name, &disk->flags)) + sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, + ns->head->instance); ns->disk = disk; if (nvme_update_ns_info(ns, id)) goto out_put_disk; if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { - if (nvme_nvm_register(ns, disk_name, node)) { + if (nvme_nvm_register(ns, disk->disk_name, node)) { dev_warn(ctrl->device, "LightNVM init failure\n"); goto out_put_disk; } @@ -3946,6 +3729,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, nvme_get_ctrl(ctrl); device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups); + if (!nvme_ns_head_multipath(ns->head)) + nvme_add_ns_cdev(ns); nvme_mpath_add_disk(ns, id); nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name); @@ -3990,6 +3775,8 @@ static void nvme_ns_remove(struct nvme_ns *ns) synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */ if (ns->disk->flags & GENHD_FL_UP) { + if (!nvme_ns_head_multipath(ns->head)) + nvme_cdev_del(&ns->cdev, &ns->cdev_device); del_gendisk(ns->disk); blk_cleanup_queue(ns->queue); if (blk_get_integrity(ns->disk)) @@ -4579,6 +4366,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, min(default_ps_max_latency_us, (unsigned long)S32_MAX)); nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device)); + nvme_mpath_init_ctrl(ctrl); return 0; out_free_name: @@ -4734,6 +4522,7 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); @@ -4779,8 +4568,24 @@ static int __init nvme_core_init(void) result = PTR_ERR(nvme_subsys_class); goto destroy_class; } + + result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS, + "nvme-generic"); + if (result < 0) + goto destroy_subsys_class; + + nvme_ns_chr_class = class_create(THIS_MODULE, "nvme-generic"); + if (IS_ERR(nvme_ns_chr_class)) { + result = PTR_ERR(nvme_ns_chr_class); + goto unregister_generic_ns; + } + return 0; +unregister_generic_ns: + unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); +destroy_subsys_class: + class_destroy(nvme_subsys_class); destroy_class: class_destroy(nvme_class); unregister_chrdev: @@ -4797,12 +4602,15 @@ out: static void __exit nvme_core_exit(void) { + class_destroy(nvme_ns_chr_class); class_destroy(nvme_subsys_class); class_destroy(nvme_class); + unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); destroy_workqueue(nvme_delete_wq); destroy_workqueue(nvme_reset_wq); destroy_workqueue(nvme_wq); + ida_destroy(&nvme_ns_chr_minor_ida); ida_destroy(&nvme_instance_ida); } diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 604ab0e5a2ad..34a84d2086c7 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -336,6 +336,11 @@ static void nvmf_log_connect_error(struct nvme_ctrl *ctrl, cmd->connect.recfmt); break; + case NVME_SC_HOST_PATH_ERROR: + dev_err(ctrl->device, + "Connect command failed: host path error\n"); + break; + default: dev_err(ctrl->device, "Connect command failed, error wo/DNR bit: %d\n", @@ -379,10 +384,8 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) /* * Set keep-alive timeout in seconds granularity (ms * 1000) - * and add a grace period for controller kato enforcement */ - cmd.connect.kato = ctrl->kato ? - cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000) : 0; + cmd.connect.kato = cpu_to_le32(ctrl->kato * 1000); if (ctrl->opts->disable_sqflow) cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW; @@ -535,63 +538,6 @@ static struct nvmf_transport_ops *nvmf_lookup_transport( return NULL; } -/* - * For something we're not in a state to send to the device the default action - * is to busy it and retry it after the controller state is recovered. However, - * if the controller is deleting or if anything is marked for failfast or - * nvme multipath it is immediately failed. - * - * Note: commands used to initialize the controller will be marked for failfast. - * Note: nvme cli/ioctl commands are marked for failfast. - */ -blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl, - struct request *rq) -{ - if (ctrl->state != NVME_CTRL_DELETING_NOIO && - ctrl->state != NVME_CTRL_DEAD && - !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) && - !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) - return BLK_STS_RESOURCE; - return nvme_host_path_error(rq); -} -EXPORT_SYMBOL_GPL(nvmf_fail_nonready_command); - -bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, - bool queue_live) -{ - struct nvme_request *req = nvme_req(rq); - - /* - * currently we have a problem sending passthru commands - * on the admin_q if the controller is not LIVE because we can't - * make sure that they are going out after the admin connect, - * controller enable and/or other commands in the initialization - * sequence. until the controller will be LIVE, fail with - * BLK_STS_RESOURCE so that they will be rescheduled. - */ - if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD)) - return false; - - /* - * Only allow commands on a live queue, except for the connect command, - * which is require to set the queue live in the appropinquate states. - */ - switch (ctrl->state) { - case NVME_CTRL_CONNECTING: - if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) && - req->cmd->fabrics.fctype == nvme_fabrics_type_connect) - return true; - break; - default: - break; - case NVME_CTRL_DEAD: - return false; - } - - return queue_live; -} -EXPORT_SYMBOL_GPL(__nvmf_check_ready); - static const match_table_t opt_tokens = { { NVMF_OPT_TRANSPORT, "transport=%s" }, { NVMF_OPT_TRADDR, "traddr=%s" }, diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 888b108d87a4..d7f7974dc208 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -184,20 +184,7 @@ void nvmf_unregister_transport(struct nvmf_transport_ops *ops); void nvmf_free_options(struct nvmf_ctrl_options *opts); int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size); bool nvmf_should_reconnect(struct nvme_ctrl *ctrl); -blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl, - struct request *rq); -bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, - bool queue_live); bool nvmf_ip_options_match(struct nvme_ctrl *ctrl, struct nvmf_ctrl_options *opts); -static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, - bool queue_live) -{ - if (likely(ctrl->state == NVME_CTRL_LIVE || - ctrl->state == NVME_CTRL_DELETING)) - return true; - return __nvmf_check_ready(ctrl, rq, queue_live); -} - #endif /* _NVME_FABRICS_H */ diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 6ffa8de2a0d7..f183f9fa03d0 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1708,7 +1708,7 @@ restart: * * If this routine returns error, the LLDD should abort the exchange. * - * @remoteport: pointer to the (registered) remote port that the LS + * @portptr: pointer to the (registered) remote port that the LS * was received from. The remoteport is associated with * a specific localport. * @lsrsp: pointer to a nvmefc_ls_rsp response structure to be @@ -2128,6 +2128,7 @@ nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq, op->op.fcp_req.first_sgl = op->sgl; op->op.fcp_req.private = &op->priv[0]; nvme_req(rq)->ctrl = &ctrl->ctrl; + nvme_req(rq)->cmd = &op->op.cmd_iu.sqe; return res; } @@ -2460,6 +2461,18 @@ nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) static void __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) { + int q; + + /* + * if aborting io, the queues are no longer good, mark them + * all as not live. + */ + if (ctrl->ctrl.queue_count > 1) { + for (q = 1; q < ctrl->ctrl.queue_count; q++) + clear_bit(NVME_FC_Q_LIVE, &ctrl->queues[q].flags); + } + clear_bit(NVME_FC_Q_LIVE, &ctrl->queues[0].flags); + /* * If io queues are present, stop them and terminate all outstanding * ios on them. As FC allocates FC exchange for each io, the @@ -2759,18 +2772,16 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_fc_ctrl *ctrl = queue->ctrl; struct request *rq = bd->rq; struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); - struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu; - struct nvme_command *sqe = &cmdiu->sqe; enum nvmefc_fcp_datadir io_dir; bool queue_ready = test_bit(NVME_FC_Q_LIVE, &queue->flags); u32 data_len; blk_status_t ret; if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE || - !nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) - return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq); + !nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) + return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq); - ret = nvme_setup_cmd(ns, rq, sqe); + ret = nvme_setup_cmd(ns, rq); if (ret) return ret; @@ -3086,7 +3097,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); - ret = nvme_init_identify(&ctrl->ctrl); + ret = nvme_init_ctrl_finish(&ctrl->ctrl); if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) goto out_disconnect_admin_queue; @@ -3096,10 +3107,17 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (ctrl->ctrl.icdoff) { dev_err(ctrl->ctrl.device, "icdoff %d is not supported!\n", ctrl->ctrl.icdoff); + ret = NVME_SC_INVALID_FIELD | NVME_SC_DNR; goto out_disconnect_admin_queue; } /* FC-NVME supports normal SGL Data Block Descriptors */ + if (!(ctrl->ctrl.sgls & ((1 << 0) | (1 << 1)))) { + dev_err(ctrl->ctrl.device, + "Mandatory sgls are not supported!\n"); + ret = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto out_disconnect_admin_queue; + } if (opts->queue_size > ctrl->ctrl.maxcmd) { /* warn if maxcmd is lower than queue_size */ @@ -3264,11 +3282,13 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) return; - if (portptr->port_state == FC_OBJSTATE_ONLINE) + if (portptr->port_state == FC_OBJSTATE_ONLINE) { dev_info(ctrl->ctrl.device, "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", ctrl->cnum, status); - else if (time_after_eq(jiffies, rport->dev_loss_end)) + if (status > 0 && (status & NVME_SC_DNR)) + recon = false; + } else if (time_after_eq(jiffies, rport->dev_loss_end)) recon = false; if (recon && nvmf_should_reconnect(&ctrl->ctrl)) { @@ -3282,12 +3302,17 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) queue_delayed_work(nvme_wq, &ctrl->connect_work, recon_delay); } else { - if (portptr->port_state == FC_OBJSTATE_ONLINE) - dev_warn(ctrl->ctrl.device, - "NVME-FC{%d}: Max reconnect attempts (%d) " - "reached.\n", - ctrl->cnum, ctrl->ctrl.nr_reconnects); - else + if (portptr->port_state == FC_OBJSTATE_ONLINE) { + if (status > 0 && (status & NVME_SC_DNR)) + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: reconnect failure\n", + ctrl->cnum); + else + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: Max reconnect attempts " + "(%d) reached.\n", + ctrl->cnum, ctrl->ctrl.nr_reconnects); + } else dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: dev_loss_tmo (%d) expired " "while waiting for remoteport connectivity.\n", diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c new file mode 100644 index 000000000000..9557ead02de1 --- /dev/null +++ b/drivers/nvme/host/ioctl.c @@ -0,0 +1,498 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2011-2014, Intel Corporation. + * Copyright (c) 2017-2021 Christoph Hellwig. + */ +#include <linux/ptrace.h> /* for force_successful_syscall_return */ +#include <linux/nvme_ioctl.h> +#include "nvme.h" + +/* + * Convert integer values from ioctl structures to user pointers, silently + * ignoring the upper bits in the compat case to match behaviour of 32-bit + * kernels. + */ +static void __user *nvme_to_user_ptr(uintptr_t ptrval) +{ + if (in_compat_syscall()) + ptrval = (compat_uptr_t)ptrval; + return (void __user *)ptrval; +} + +static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf, + unsigned len, u32 seed, bool write) +{ + struct bio_integrity_payload *bip; + int ret = -ENOMEM; + void *buf; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + goto out; + + ret = -EFAULT; + if (write && copy_from_user(buf, ubuf, len)) + goto out_free_meta; + + bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); + if (IS_ERR(bip)) { + ret = PTR_ERR(bip); + goto out_free_meta; + } + + bip->bip_iter.bi_size = len; + bip->bip_iter.bi_sector = seed; + ret = bio_integrity_add_page(bio, virt_to_page(buf), len, + offset_in_page(buf)); + if (ret == len) + return buf; + ret = -ENOMEM; +out_free_meta: + kfree(buf); +out: + return ERR_PTR(ret); +} + +static int nvme_submit_user_cmd(struct request_queue *q, + struct nvme_command *cmd, void __user *ubuffer, + unsigned bufflen, void __user *meta_buffer, unsigned meta_len, + u32 meta_seed, u64 *result, unsigned timeout) +{ + bool write = nvme_is_write(cmd); + struct nvme_ns *ns = q->queuedata; + struct block_device *bdev = ns ? ns->disk->part0 : NULL; + struct request *req; + struct bio *bio = NULL; + void *meta = NULL; + int ret; + + req = nvme_alloc_request(q, cmd, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + if (timeout) + req->timeout = timeout; + nvme_req(req)->flags |= NVME_REQ_USERCMD; + + if (ubuffer && bufflen) { + ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, + GFP_KERNEL); + if (ret) + goto out; + bio = req->bio; + if (bdev) + bio_set_dev(bio, bdev); + if (bdev && meta_buffer && meta_len) { + meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, + meta_seed, write); + if (IS_ERR(meta)) { + ret = PTR_ERR(meta); + goto out_unmap; + } + req->cmd_flags |= REQ_INTEGRITY; + } + } + + nvme_execute_passthru_rq(req); + if (nvme_req(req)->flags & NVME_REQ_CANCELLED) + ret = -EINTR; + else + ret = nvme_req(req)->status; + if (result) + *result = le64_to_cpu(nvme_req(req)->result.u64); + if (meta && !ret && !write) { + if (copy_to_user(meta_buffer, meta, meta_len)) + ret = -EFAULT; + } + kfree(meta); + out_unmap: + if (bio) + blk_rq_unmap_user(bio); + out: + blk_mq_free_request(req); + return ret; +} + + +static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) +{ + struct nvme_user_io io; + struct nvme_command c; + unsigned length, meta_len; + void __user *metadata; + + if (copy_from_user(&io, uio, sizeof(io))) + return -EFAULT; + if (io.flags) + return -EINVAL; + + switch (io.opcode) { + case nvme_cmd_write: + case nvme_cmd_read: + case nvme_cmd_compare: + break; + default: + return -EINVAL; + } + + length = (io.nblocks + 1) << ns->lba_shift; + + if ((io.control & NVME_RW_PRINFO_PRACT) && + ns->ms == sizeof(struct t10_pi_tuple)) { + /* + * Protection information is stripped/inserted by the + * controller. + */ + if (nvme_to_user_ptr(io.metadata)) + return -EINVAL; + meta_len = 0; + metadata = NULL; + } else { + meta_len = (io.nblocks + 1) * ns->ms; + metadata = nvme_to_user_ptr(io.metadata); + } + + if (ns->features & NVME_NS_EXT_LBAS) { + length += meta_len; + meta_len = 0; + } else if (meta_len) { + if ((io.metadata & 3) || !io.metadata) + return -EINVAL; + } + + memset(&c, 0, sizeof(c)); + c.rw.opcode = io.opcode; + c.rw.flags = io.flags; + c.rw.nsid = cpu_to_le32(ns->head->ns_id); + c.rw.slba = cpu_to_le64(io.slba); + c.rw.length = cpu_to_le16(io.nblocks); + c.rw.control = cpu_to_le16(io.control); + c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); + c.rw.reftag = cpu_to_le32(io.reftag); + c.rw.apptag = cpu_to_le16(io.apptag); + c.rw.appmask = cpu_to_le16(io.appmask); + + return nvme_submit_user_cmd(ns->queue, &c, + nvme_to_user_ptr(io.addr), length, + metadata, meta_len, lower_32_bits(io.slba), NULL, 0); +} + +static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct nvme_passthru_cmd __user *ucmd) +{ + struct nvme_passthru_cmd cmd; + struct nvme_command c; + unsigned timeout = 0; + u64 result; + int status; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) + return -EFAULT; + if (cmd.flags) + return -EINVAL; + if (ns && cmd.nsid != ns->head->ns_id) { + dev_err(ctrl->device, + "%s: nsid (%u) in cmd does not match nsid (%u) of namespace\n", + current->comm, cmd.nsid, ns->head->ns_id); + return -EINVAL; + } + + memset(&c, 0, sizeof(c)); + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); + c.common.cdw10 = cpu_to_le32(cmd.cdw10); + c.common.cdw11 = cpu_to_le32(cmd.cdw11); + c.common.cdw12 = cpu_to_le32(cmd.cdw12); + c.common.cdw13 = cpu_to_le32(cmd.cdw13); + c.common.cdw14 = cpu_to_le32(cmd.cdw14); + c.common.cdw15 = cpu_to_le32(cmd.cdw15); + + if (cmd.timeout_ms) + timeout = msecs_to_jiffies(cmd.timeout_ms); + + status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, + nvme_to_user_ptr(cmd.addr), cmd.data_len, + nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, + 0, &result, timeout); + + if (status >= 0) { + if (put_user(result, &ucmd->result)) + return -EFAULT; + } + + return status; +} + +static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct nvme_passthru_cmd64 __user *ucmd) +{ + struct nvme_passthru_cmd64 cmd; + struct nvme_command c; + unsigned timeout = 0; + int status; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) + return -EFAULT; + if (cmd.flags) + return -EINVAL; + if (ns && cmd.nsid != ns->head->ns_id) { + dev_err(ctrl->device, + "%s: nsid (%u) in cmd does not match nsid (%u) of namespace\n", + current->comm, cmd.nsid, ns->head->ns_id); + return -EINVAL; + } + + memset(&c, 0, sizeof(c)); + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); + c.common.cdw10 = cpu_to_le32(cmd.cdw10); + c.common.cdw11 = cpu_to_le32(cmd.cdw11); + c.common.cdw12 = cpu_to_le32(cmd.cdw12); + c.common.cdw13 = cpu_to_le32(cmd.cdw13); + c.common.cdw14 = cpu_to_le32(cmd.cdw14); + c.common.cdw15 = cpu_to_le32(cmd.cdw15); + + if (cmd.timeout_ms) + timeout = msecs_to_jiffies(cmd.timeout_ms); + + status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, + nvme_to_user_ptr(cmd.addr), cmd.data_len, + nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, + 0, &cmd.result, timeout); + + if (status >= 0) { + if (put_user(cmd.result, &ucmd->result)) + return -EFAULT; + } + + return status; +} + +static bool is_ctrl_ioctl(unsigned int cmd) +{ + if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) + return true; + if (is_sed_ioctl(cmd)) + return true; + return false; +} + +static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd, + void __user *argp) +{ + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(ctrl, NULL, argp); + case NVME_IOCTL_ADMIN64_CMD: + return nvme_user_cmd64(ctrl, NULL, argp); + default: + return sed_ioctl(ctrl->opal_dev, cmd, argp); + } +} + +#ifdef COMPAT_FOR_U64_ALIGNMENT +struct nvme_user_io32 { + __u8 opcode; + __u8 flags; + __u16 control; + __u16 nblocks; + __u16 rsvd; + __u64 metadata; + __u64 addr; + __u64 slba; + __u32 dsmgmt; + __u32 reftag; + __u16 apptag; + __u16 appmask; +} __attribute__((__packed__)); +#define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32) +#endif /* COMPAT_FOR_U64_ALIGNMENT */ + +static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, + void __user *argp) +{ + switch (cmd) { + case NVME_IOCTL_ID: + force_successful_syscall_return(); + return ns->head->ns_id; + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(ns->ctrl, ns, argp); + /* + * struct nvme_user_io can have different padding on some 32-bit ABIs. + * Just accept the compat version as all fields that are used are the + * same size and at the same offset. + */ +#ifdef COMPAT_FOR_U64_ALIGNMENT + case NVME_IOCTL_SUBMIT_IO32: +#endif + case NVME_IOCTL_SUBMIT_IO: + return nvme_submit_io(ns, argp); + case NVME_IOCTL_IO64_CMD: + return nvme_user_cmd64(ns->ctrl, ns, argp); + default: + if (!ns->ndev) + return -ENOTTY; + return nvme_nvm_ioctl(ns, cmd, argp); + } +} + +static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg) +{ + if (is_ctrl_ioctl(cmd)) + return nvme_ctrl_ioctl(ns->ctrl, cmd, arg); + return nvme_ns_ioctl(ns, cmd, arg); +} + +int nvme_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + + return __nvme_ioctl(ns, cmd, (void __user *)arg); +} + +long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct nvme_ns *ns = + container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev); + + return __nvme_ioctl(ns, cmd, (void __user *)arg); +} + +#ifdef CONFIG_NVME_MULTIPATH +static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, + void __user *argp, struct nvme_ns_head *head, int srcu_idx) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + int ret; + + nvme_get_ctrl(ns->ctrl); + nvme_put_ns_from_disk(head, srcu_idx); + ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp); + + nvme_put_ctrl(ctrl); + return ret; +} + +int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct nvme_ns_head *head = NULL; + void __user *argp = (void __user *)arg; + struct nvme_ns *ns; + int srcu_idx, ret; + + ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); + if (unlikely(!ns)) + return -EWOULDBLOCK; + + /* + * Handle ioctls that apply to the controller instead of the namespace + * seperately and drop the ns SRCU reference early. This avoids a + * deadlock when deleting namespaces using the passthrough interface. + */ + if (is_ctrl_ioctl(cmd)) + ret = nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); + else { + ret = nvme_ns_ioctl(ns, cmd, argp); + nvme_put_ns_from_disk(head, srcu_idx); + } + + return ret; +} + +long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct cdev *cdev = file_inode(file)->i_cdev; + struct nvme_ns_head *head = + container_of(cdev, struct nvme_ns_head, cdev); + void __user *argp = (void __user *)arg; + struct nvme_ns *ns; + int srcu_idx, ret; + + srcu_idx = srcu_read_lock(&head->srcu); + ns = nvme_find_path(head); + if (!ns) { + srcu_read_unlock(&head->srcu, srcu_idx); + return -EWOULDBLOCK; + } + + if (is_ctrl_ioctl(cmd)) + return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); + + ret = nvme_ns_ioctl(ns, cmd, argp); + nvme_put_ns_from_disk(head, srcu_idx); + + return ret; +} +#endif /* CONFIG_NVME_MULTIPATH */ + +static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) +{ + struct nvme_ns *ns; + int ret; + + down_read(&ctrl->namespaces_rwsem); + if (list_empty(&ctrl->namespaces)) { + ret = -ENOTTY; + goto out_unlock; + } + + ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); + if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { + dev_warn(ctrl->device, + "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); + ret = -EINVAL; + goto out_unlock; + } + + dev_warn(ctrl->device, + "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); + kref_get(&ns->kref); + up_read(&ctrl->namespaces_rwsem); + + ret = nvme_user_cmd(ctrl, ns, argp); + nvme_put_ns(ns); + return ret; + +out_unlock: + up_read(&ctrl->namespaces_rwsem); + return ret; +} + +long nvme_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct nvme_ctrl *ctrl = file->private_data; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(ctrl, NULL, argp); + case NVME_IOCTL_ADMIN64_CMD: + return nvme_user_cmd64(ctrl, NULL, argp); + case NVME_IOCTL_IO_CMD: + return nvme_dev_user_cmd(ctrl, argp); + case NVME_IOCTL_RESET: + dev_warn(ctrl->device, "resetting controller\n"); + return nvme_reset_ctrl_sync(ctrl); + case NVME_IOCTL_SUBSYS_RESET: + return nvme_reset_subsystem(ctrl); + case NVME_IOCTL_RESCAN: + nvme_queue_scan(ctrl); + return 0; + default: + return -ENOTTY; + } +} diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index b705988629f2..e9d9ad47f70f 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -660,7 +660,7 @@ static struct request *nvme_nvm_alloc_request(struct request_queue *q, rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; if (rqd->bio) - blk_rq_append_bio(rq, &rqd->bio); + blk_rq_append_bio(rq, rqd->bio); else rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); @@ -930,15 +930,15 @@ static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin, return ret; } -int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg) +int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp) { switch (cmd) { case NVME_NVM_IOCTL_ADMIN_VIO: - return nvme_nvm_user_vcmd(ns, 1, (void __user *)arg); + return nvme_nvm_user_vcmd(ns, 1, argp); case NVME_NVM_IOCTL_IO_VIO: - return nvme_nvm_user_vcmd(ns, 0, (void __user *)arg); + return nvme_nvm_user_vcmd(ns, 0, argp); case NVME_NVM_IOCTL_SUBMIT_VIO: - return nvme_nvm_submit_vio(ns, (void __user *)arg); + return nvme_nvm_submit_vio(ns, argp); default: return -ENOTTY; } @@ -1240,7 +1240,7 @@ static struct attribute *nvm_dev_attrs[] = { static umode_t nvm_dev_attrs_visible(struct kobject *kobj, struct attribute *attr, int index) { - struct device *dev = container_of(kobj, struct device, kobj); + struct device *dev = kobj_to_dev(kobj); struct gendisk *disk = dev_to_disk(dev); struct nvme_ns *ns = disk->private_data; struct nvm_dev *ndev = ns->ndev; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index a1d476e1ac02..f81871c7128a 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -50,19 +50,19 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) * and those that have a single controller and use the controller node * directly. */ -void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, - struct nvme_ctrl *ctrl, int *flags) -{ - if (!multipath) { - sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); - } else if (ns->head->disk) { - sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance, - ctrl->instance, ns->head->instance); - *flags = GENHD_FL_HIDDEN; - } else { - sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance, - ns->head->instance); +bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, int *flags) +{ + if (!multipath) + return false; + if (!ns->head->disk) { + sprintf(disk_name, "nvme%dn%d", ns->ctrl->subsys->instance, + ns->head->instance); + return true; } + sprintf(disk_name, "nvme%dc%dn%d", ns->ctrl->subsys->instance, + ns->ctrl->instance, ns->head->instance); + *flags = GENHD_FL_HIDDEN; + return true; } void nvme_failover_req(struct request *req) @@ -70,6 +70,7 @@ void nvme_failover_req(struct request *req) struct nvme_ns *ns = req->q->queuedata; u16 status = nvme_req(req)->status & 0x7ff; unsigned long flags; + struct bio *bio; nvme_mpath_clear_current_path(ns); @@ -84,6 +85,8 @@ void nvme_failover_req(struct request *req) } spin_lock_irqsave(&ns->head->requeue_lock, flags); + for (bio = req->bio; bio; bio = bio->bi_next) + bio_set_dev(bio, ns->head->disk->part0); blk_steal_bios(&ns->head->requeue_list, req); spin_unlock_irqrestore(&ns->head->requeue_lock, flags); @@ -294,7 +297,7 @@ static bool nvme_available_path(struct nvme_ns_head *head) return false; } -blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) +static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) { struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data; struct device *dev = disk_to_dev(head->disk); @@ -334,6 +337,71 @@ blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) return ret; } +static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) +{ + if (!nvme_tryget_ns_head(bdev->bd_disk->private_data)) + return -ENXIO; + return 0; +} + +static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) +{ + nvme_put_ns_head(disk->private_data); +} + +const struct block_device_operations nvme_ns_head_ops = { + .owner = THIS_MODULE, + .submit_bio = nvme_ns_head_submit_bio, + .open = nvme_ns_head_open, + .release = nvme_ns_head_release, + .ioctl = nvme_ns_head_ioctl, + .getgeo = nvme_getgeo, + .report_zones = nvme_report_zones, + .pr_ops = &nvme_pr_ops, +}; + +static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev) +{ + return container_of(cdev, struct nvme_ns_head, cdev); +} + +static int nvme_ns_head_chr_open(struct inode *inode, struct file *file) +{ + if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev))) + return -ENXIO; + return 0; +} + +static int nvme_ns_head_chr_release(struct inode *inode, struct file *file) +{ + nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev)); + return 0; +} + +static const struct file_operations nvme_ns_head_chr_fops = { + .owner = THIS_MODULE, + .open = nvme_ns_head_chr_open, + .release = nvme_ns_head_chr_release, + .unlocked_ioctl = nvme_ns_head_chr_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +static int nvme_add_ns_head_cdev(struct nvme_ns_head *head) +{ + int ret; + + head->cdev_device.parent = &head->subsys->dev; + ret = dev_set_name(&head->cdev_device, "ng%dn%d", + head->subsys->instance, head->instance); + if (ret) + return ret; + ret = nvme_cdev_add(&head->cdev, &head->cdev_device, + &nvme_ns_head_chr_fops, THIS_MODULE); + if (ret) + kfree_const(head->cdev_device.kobj.name); + return ret; +} + static void nvme_requeue_work(struct work_struct *work) { struct nvme_ns_head *head = @@ -412,9 +480,11 @@ static void nvme_mpath_set_live(struct nvme_ns *ns) if (!head->disk) return; - if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) + if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { device_add_disk(&head->subsys->dev, head->disk, nvme_ns_id_attr_groups); + nvme_add_ns_head_cdev(head); + } mutex_lock(&head->lock); if (nvme_path_is_optimized(ns)) { @@ -602,8 +672,8 @@ static ssize_t nvme_subsys_iopolicy_show(struct device *dev, struct nvme_subsystem *subsys = container_of(dev, struct nvme_subsystem, dev); - return sprintf(buf, "%s\n", - nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]); + return sysfs_emit(buf, "%s\n", + nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]); } static ssize_t nvme_subsys_iopolicy_store(struct device *dev, @@ -628,7 +698,7 @@ SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR, static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid); + return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid); } DEVICE_ATTR_RO(ana_grpid); @@ -637,7 +707,7 @@ static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr, { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]); + return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]); } DEVICE_ATTR_RO(ana_state); @@ -668,9 +738,13 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) if (desc.state) { /* found the group desc: update */ nvme_update_ns_ana_state(&desc, ns); + } else { + /* group desc not found: trigger a re-read */ + set_bit(NVME_NS_ANA_PENDING, &ns->flags); + queue_work(nvme_wq, &ns->ctrl->ana_work); } } else { - ns->ana_state = NVME_ANA_OPTIMIZED; + ns->ana_state = NVME_ANA_OPTIMIZED; nvme_mpath_set_live(ns); } @@ -687,8 +761,10 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) { if (!head->disk) return; - if (head->disk->flags & GENHD_FL_UP) + if (head->disk->flags & GENHD_FL_UP) { + nvme_cdev_del(&head->cdev, &head->cdev_device); del_gendisk(head->disk); + } blk_set_queue_dying(head->disk->queue); /* make sure all pending bios are cleaned up */ kblockd_schedule_work(&head->requeue_work); @@ -705,9 +781,18 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) put_disk(head->disk); } -int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl) { - int error; + mutex_init(&ctrl->ana_lock); + timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); + INIT_WORK(&ctrl->ana_work, nvme_ana_work); +} + +int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +{ + size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT; + size_t ana_log_size; + int error = 0; /* check if multipath is enabled and we have the capability */ if (!multipath || !ctrl->subsys || @@ -719,37 +804,31 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) ctrl->nanagrpid = le32_to_cpu(id->nanagrpid); ctrl->anagrpmax = le32_to_cpu(id->anagrpmax); - mutex_init(&ctrl->ana_lock); - timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0); - ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) + - ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc); - ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32); - - if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) { + ana_log_size = sizeof(struct nvme_ana_rsp_hdr) + + ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) + + ctrl->max_namespaces * sizeof(__le32); + if (ana_log_size > max_transfer_size) { dev_err(ctrl->device, - "ANA log page size (%zd) larger than MDTS (%d).\n", - ctrl->ana_log_size, - ctrl->max_hw_sectors << SECTOR_SHIFT); + "ANA log page size (%zd) larger than MDTS (%zd).\n", + ana_log_size, max_transfer_size); dev_err(ctrl->device, "disabling ANA support.\n"); - return 0; + goto out_uninit; } - - INIT_WORK(&ctrl->ana_work, nvme_ana_work); - kfree(ctrl->ana_log_buf); - ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL); - if (!ctrl->ana_log_buf) { - error = -ENOMEM; - goto out; + if (ana_log_size > ctrl->ana_log_size) { + nvme_mpath_stop(ctrl); + kfree(ctrl->ana_log_buf); + ctrl->ana_log_buf = kmalloc(ana_log_size, GFP_KERNEL); + if (!ctrl->ana_log_buf) + return -ENOMEM; } - + ctrl->ana_log_size = ana_log_size; error = nvme_read_ana_log(ctrl); if (error) - goto out_free_ana_log_buf; + goto out_uninit; return 0; -out_free_ana_log_buf: - kfree(ctrl->ana_log_buf); - ctrl->ana_log_buf = NULL; -out: + +out_uninit: + nvme_mpath_uninit(ctrl); return error; } @@ -758,4 +837,3 @@ void nvme_mpath_uninit(struct nvme_ctrl *ctrl) kfree(ctrl->ana_log_buf); ctrl->ana_log_buf = NULL; } - diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 07b34175c6ce..0015860ec12b 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -27,7 +27,6 @@ extern unsigned int admin_timeout; #define NVME_ADMIN_TIMEOUT (admin_timeout * HZ) #define NVME_DEFAULT_KATO 5 -#define NVME_KATO_GRACE 10 #ifdef CONFIG_ARCH_NO_SG_CHAIN #define NVME_INLINE_SG_CNT 0 @@ -276,6 +275,9 @@ struct nvme_ctrl { u32 max_hw_sectors; u32 max_segments; u32 max_integrity_segments; + u32 max_discard_sectors; + u32 max_discard_segments; + u32 max_zeroes_sectors; #ifdef CONFIG_BLK_DEV_ZONED u32 max_zone_append; #endif @@ -410,8 +412,12 @@ struct nvme_ns_head { bool shared; int instance; struct nvme_effects_log *effects; -#ifdef CONFIG_NVME_MULTIPATH + + struct cdev cdev; + struct device cdev_device; + struct gendisk *disk; +#ifdef CONFIG_NVME_MULTIPATH struct bio_list requeue_list; spinlock_t requeue_lock; struct work_struct requeue_work; @@ -422,6 +428,11 @@ struct nvme_ns_head { #endif }; +static inline bool nvme_ns_head_multipath(struct nvme_ns_head *head) +{ + return IS_ENABLED(CONFIG_NVME_MULTIPATH) && head->disk; +} + enum nvme_ns_features { NVME_NS_EXT_LBAS = 1 << 0, /* support extended LBA format */ NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */ @@ -457,6 +468,9 @@ struct nvme_ns { #define NVME_NS_ANA_PENDING 2 #define NVME_NS_FORCE_RO 3 + struct cdev cdev; + struct device cdev_device; + struct nvme_fault_inject fault_inject; }; @@ -599,7 +613,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); void nvme_start_ctrl(struct nvme_ctrl *ctrl); void nvme_stop_ctrl(struct nvme_ctrl *ctrl); -int nvme_init_identify(struct nvme_ctrl *ctrl); +int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); @@ -623,8 +637,22 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl); struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, blk_mq_req_flags_t flags); void nvme_cleanup_cmd(struct request *req); -blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, - struct nvme_command *cmd); +blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req); +blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl, + struct request *req); +bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq, + bool queue_live); + +static inline bool nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq, + bool queue_live) +{ + if (likely(ctrl->state == NVME_CTRL_LIVE)) + return true; + if (ctrl->ops->flags & NVME_F_FABRICS && + ctrl->state == NVME_CTRL_DELETING) + return true; + return __nvme_check_ready(ctrl, rq, queue_live); +} int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, @@ -640,16 +668,33 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid, int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); +int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_try_sched_reset(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); - +void nvme_queue_scan(struct nvme_ctrl *ctrl); int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, void *log, size_t size, u64 offset); struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, struct nvme_ns_head **head, int *srcu_idx); void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx); +bool nvme_tryget_ns_head(struct nvme_ns_head *head); +void nvme_put_ns_head(struct nvme_ns_head *head); +int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, + const struct file_operations *fops, struct module *owner); +void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device); +int nvme_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); +long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); +long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, + unsigned long arg); +long nvme_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg); +int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo); extern const struct attribute_group *nvme_ns_id_attr_groups[]; +extern const struct pr_ops nvme_pr_ops; extern const struct block_device_operations nvme_ns_head_ops; #ifdef CONFIG_NVME_MULTIPATH @@ -661,20 +706,19 @@ static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) void nvme_mpath_unfreeze(struct nvme_subsystem *subsys); void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys); void nvme_mpath_start_freeze(struct nvme_subsystem *subsys); -void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, - struct nvme_ctrl *ctrl, int *flags); +bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, int *flags); void nvme_failover_req(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id); void nvme_mpath_remove_disk(struct nvme_ns_head *head); -int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id); +int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id); +void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl); void nvme_mpath_uninit(struct nvme_ctrl *ctrl); void nvme_mpath_stop(struct nvme_ctrl *ctrl); bool nvme_mpath_clear_current_path(struct nvme_ns *ns); void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl); struct nvme_ns *nvme_find_path(struct nvme_ns_head *head); -blk_qc_t nvme_ns_head_submit_bio(struct bio *bio); static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) { @@ -701,16 +745,11 @@ static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) { return false; } -/* - * Without the multipath code enabled, multiple controller per subsystems are - * visible as devices and thus we cannot use the subsystem instance. - */ -static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, - struct nvme_ctrl *ctrl, int *flags) +static inline bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, + int *flags) { - sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); + return false; } - static inline void nvme_failover_req(struct request *req) { } @@ -742,10 +781,13 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) static inline void nvme_trace_bio_complete(struct request *req) { } -static inline int nvme_mpath_init(struct nvme_ctrl *ctrl, +static inline void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl) +{ +} +static inline int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) { - if (ctrl->subsys->cmic & (1 << 3)) + if (ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA) dev_warn(ctrl->device, "Please enable CONFIG_NVME_MULTIPATH for full support of multi-port devices.\n"); return 0; @@ -798,7 +840,7 @@ static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); void nvme_nvm_unregister(struct nvme_ns *ns); extern const struct attribute_group nvme_nvm_attr_group; -int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg); +int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp); #else static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) @@ -808,7 +850,7 @@ static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, static inline void nvme_nvm_unregister(struct nvme_ns *ns) {}; static inline int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, - unsigned long arg) + void __user *argp) { return -ENOTTY; } diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 7249ae74f71f..a29b170701fc 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -224,6 +224,7 @@ struct nvme_queue { */ struct nvme_iod { struct nvme_request req; + struct nvme_command cmd; struct nvme_queue *nvmeq; bool use_sgl; int aborted; @@ -429,6 +430,7 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, iod->nvmeq = nvmeq; nvme_req(req)->ctrl = &dev->ctrl; + nvme_req(req)->cmd = &iod->cmd; return 0; } @@ -852,7 +854,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, return nvme_setup_prp_simple(dev, req, &cmnd->rw, &bv); - if (iod->nvmeq->qid && + if (iod->nvmeq->qid && sgl_threshold && dev->ctrl.sgls & ((1 << 0) | (1 << 1))) return nvme_setup_sgl_simple(dev, req, &cmnd->rw, &bv); @@ -917,7 +919,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_dev *dev = nvmeq->dev; struct request *req = bd->rq; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_command cmnd; + struct nvme_command *cmnd = &iod->cmd; blk_status_t ret; iod->aborted = 0; @@ -931,24 +933,27 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags))) return BLK_STS_IOERR; - ret = nvme_setup_cmd(ns, req, &cmnd); + if (!nvme_check_ready(&dev->ctrl, req, true)) + return nvme_fail_nonready_command(&dev->ctrl, req); + + ret = nvme_setup_cmd(ns, req); if (ret) return ret; if (blk_rq_nr_phys_segments(req)) { - ret = nvme_map_data(dev, req, &cmnd); + ret = nvme_map_data(dev, req, cmnd); if (ret) goto out_free_cmd; } if (blk_integrity_rq(req)) { - ret = nvme_map_metadata(dev, req, &cmnd); + ret = nvme_map_metadata(dev, req, cmnd); if (ret) goto out_unmap_data; } blk_mq_start_request(req); - nvme_submit_cmd(nvmeq, &cmnd, bd->last); + nvme_submit_cmd(nvmeq, cmnd, bd->last); return BLK_STS_OK; out_unmap_data: nvme_unmap_data(dev, req); @@ -1060,18 +1065,10 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq) static irqreturn_t nvme_irq(int irq, void *data) { struct nvme_queue *nvmeq = data; - irqreturn_t ret = IRQ_NONE; - /* - * The rmb/wmb pair ensures we see all updates from a previous run of - * the irq handler, even if that was on another CPU. - */ - rmb(); if (nvme_process_cq(nvmeq)) - ret = IRQ_HANDLED; - wmb(); - - return ret; + return IRQ_HANDLED; + return IRQ_NONE; } static irqreturn_t nvme_irq_check(int irq, void *data) @@ -2178,7 +2175,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (nr_io_queues == 0) return 0; - + clear_bit(NVMEQ_ENABLED, &adminq->flags); if (dev->cmb_use_sqes) { @@ -2653,7 +2650,7 @@ static void nvme_reset_work(struct work_struct *work) */ dev->ctrl.max_integrity_segments = 1; - result = nvme_init_identify(&dev->ctrl); + result = nvme_init_ctrl_finish(&dev->ctrl); if (result) goto out; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index be905d4fdb47..4697a94c0945 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -314,6 +314,7 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set, NVME_RDMA_DATA_SGL_SIZE; req->queue = queue; + nvme_req(rq)->cmd = req->sqe.data; return 0; } @@ -920,7 +921,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); - error = nvme_init_identify(&ctrl->ctrl); + error = nvme_init_ctrl_finish(&ctrl->ctrl); if (error) goto out_quiesce_queue; @@ -1319,16 +1320,17 @@ static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue, int count) { struct nvme_sgl_desc *sg = &c->common.dptr.sgl; - struct scatterlist *sgl = req->data_sgl.sg_table.sgl; struct ib_sge *sge = &req->sge[1]; + struct scatterlist *sgl; u32 len = 0; int i; - for (i = 0; i < count; i++, sgl++, sge++) { + for_each_sg(req->data_sgl.sg_table.sgl, sgl, count, i) { sge->addr = sg_dma_address(sgl); sge->length = sg_dma_len(sgl); sge->lkey = queue->device->pd->local_dma_lkey; len += sge->length; + sge++; } sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff); @@ -2041,7 +2043,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq = bd->rq; struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_qe *sqe = &req->sqe; - struct nvme_command *c = sqe->data; + struct nvme_command *c = nvme_req(rq)->cmd; struct ib_device *dev; bool queue_ready = test_bit(NVME_RDMA_Q_LIVE, &queue->flags); blk_status_t ret; @@ -2049,8 +2051,8 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, WARN_ON_ONCE(rq->tag < 0); - if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) - return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq); + if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) + return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq); dev = queue->device->dev; @@ -2064,7 +2066,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(struct nvme_command), DMA_TO_DEVICE); - ret = nvme_setup_cmd(ns, rq, c); + ret = nvme_setup_cmd(ns, rq); if (ret) goto unmap_qe; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index a0f00cb8f9f3..34f4b3402f7c 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -417,6 +417,7 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set, { struct nvme_tcp_ctrl *ctrl = set->driver_data; struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_tcp_cmd_pdu *pdu; int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx]; u8 hdgst = nvme_tcp_hdgst_len(queue); @@ -427,8 +428,10 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set, if (!req->pdu) return -ENOMEM; + pdu = req->pdu; req->queue = queue; nvme_req(rq)->ctrl = &ctrl->ctrl; + nvme_req(rq)->cmd = &pdu->cmd; return 0; } @@ -874,7 +877,7 @@ static void nvme_tcp_state_change(struct sock *sk) { struct nvme_tcp_queue *queue; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); queue = sk->sk_user_data; if (!queue) goto done; @@ -895,7 +898,7 @@ static void nvme_tcp_state_change(struct sock *sk) queue->state_change(sk); done: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue) @@ -940,7 +943,6 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) if (ret <= 0) return ret; - nvme_tcp_advance_req(req, ret); if (queue->data_digest) nvme_tcp_ddgst_update(queue->snd_hash, page, offset, ret); @@ -957,6 +959,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) } return 1; } + nvme_tcp_advance_req(req, ret); } return -EAGAIN; } @@ -1137,7 +1140,8 @@ static void nvme_tcp_io_work(struct work_struct *w) pending = true; else if (unlikely(result < 0)) break; - } + } else + pending = !llist_empty(&queue->req_list); result = nvme_tcp_try_recv(queue); if (result > 0) @@ -1885,7 +1889,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) blk_mq_unquiesce_queue(ctrl->admin_q); - error = nvme_init_identify(ctrl); + error = nvme_init_ctrl_finish(ctrl); if (error) goto out_quiesce_queue; @@ -1973,6 +1977,11 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) goto destroy_admin; } + if (!(ctrl->sgls & ((1 << 0) | (1 << 1)))) { + dev_err(ctrl->device, "Mandatory sgls are not supported!\n"); + goto destroy_admin; + } + if (opts->queue_size > ctrl->sqsize + 1) dev_warn(ctrl->device, "queue_size %zu > ctrl sqsize %u, clamping down\n", @@ -2269,7 +2278,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0; blk_status_t ret; - ret = nvme_setup_cmd(ns, rq, &pdu->cmd); + ret = nvme_setup_cmd(ns, rq); if (ret) return ret; @@ -2330,8 +2339,8 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags); blk_status_t ret; - if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) - return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq); + if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) + return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq); ret = nvme_tcp_setup_cmd_pdu(ns, rq); if (unlikely(ret)) diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index bc2f344f0ae0..475dd45c3db4 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -96,7 +96,7 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) dev_warn(ns->ctrl->device, "zone operations:%x not supported for namespace:%u\n", le16_to_cpu(id->zoc), ns->head->ns_id); - status = -EINVAL; + status = -ENODEV; goto free_data; } @@ -105,7 +105,7 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) dev_warn(ns->ctrl->device, "invalid zone size:%llu for namespace:%u\n", ns->zsze, ns->head->ns_id); - status = -EINVAL; + status = -ENODEV; goto free_data; } |