author:    Maxime Ripard <maxime.ripard@bootlin.com>  2019-01-11 18:32:10 +0300
committer: Maxime Ripard <maxime.ripard@bootlin.com>  2019-01-11 18:32:10 +0300
commit:    23d19ba06b9c5614d6457f5fed349ec8f6d4dac9 (patch)
tree:      39f0b657e5b1b5b958780cae4ae6360f69548d50 /drivers/nvme/target
parent:    7d0250ed8e69fb6a66caecf60b8753a21224cc1a (diff)
parent:    e3d093070eb0b5e3df668d3eb04100ea79343c65 (diff)
Merge drm/drm-next into drm-misc-next
drm-next has been forwarded to 5.0-rc1, and we need it to apply the damage
helper for dirtyfb series from Noralf Trønnes.
Signed-off-by: Maxime Ripard <maxime.ripard@bootlin.com>
Diffstat (limited to 'drivers/nvme/target')
-rw-r--r--  drivers/nvme/target/Kconfig          10
-rw-r--r--  drivers/nvme/target/Makefile          2
-rw-r--r--  drivers/nvme/target/admin-cmd.c     146
-rw-r--r--  drivers/nvme/target/configfs.c       43
-rw-r--r--  drivers/nvme/target/core.c          220
-rw-r--r--  drivers/nvme/target/discovery.c     139
-rw-r--r--  drivers/nvme/target/fabrics-cmd.c    64
-rw-r--r--  drivers/nvme/target/fc.c             66
-rw-r--r--  drivers/nvme/target/io-cmd-bdev.c    89
-rw-r--r--  drivers/nvme/target/io-cmd-file.c   165
-rw-r--r--  drivers/nvme/target/loop.c            2
-rw-r--r--  drivers/nvme/target/nvmet.h          68
-rw-r--r--  drivers/nvme/target/rdma.c           15
-rw-r--r--  drivers/nvme/target/tcp.c          1737
14 files changed, 2452 insertions, 314 deletions
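
Beyond the new TCP transport itself (tcp.c, built as nvmet-tcp.ko), the series threads two mechanisms through the core: per-controller Error Information log pages (core.c, admin-cmd.c, io-cmd-bdev.c, io-cmd-file.c) and discovery log change async events (discovery.c, configfs.c). The error-log side keeps a ring of NVMET_ERROR_LOG_SLOTS (128) nvme_error_slot entries per controller, guarded by error_lock: a monotonic err_counter picks the slot to overwrite on each failed command, and the Get Log Page handler walks backwards from the newest slot. A minimal userspace model of that ring indexing follows; the struct and helper names are simplified stand-ins for the kernel's, not the actual code.

#include <stdio.h>
#include <string.h>

#define ERROR_LOG_SLOTS 128	/* mirrors NVMET_ERROR_LOG_SLOTS */

struct err_slot {
	unsigned long long error_count;	/* 0 means "never written" */
	unsigned short status;
};

struct ctrl_model {
	unsigned long long err_counter;	/* total errors seen, monotonic */
	struct err_slot slots[ERROR_LOG_SLOTS];
};

/* Write side: models nvmet_set_error() -- bump the counter and
 * overwrite the slot it maps to, so the ring always holds the
 * newest 128 errors. */
static void record_error(struct ctrl_model *c, unsigned short status)
{
	struct err_slot *s;

	c->err_counter++;
	s = &c->slots[c->err_counter % ERROR_LOG_SLOTS];
	s->error_count = c->err_counter;
	s->status = status;
}

/* Read side: models nvmet_execute_get_log_page_error() -- start at
 * the slot of the most recent error and walk backwards, wrapping
 * from slot 0 to the last slot. */
static void dump_log(const struct ctrl_model *c)
{
	unsigned long long slot = c->err_counter % ERROR_LOG_SLOTS;
	int i;

	for (i = 0; i < ERROR_LOG_SLOTS; i++) {
		const struct err_slot *s = &c->slots[slot];

		if (s->error_count)	/* skip never-used slots */
			printf("entry %d: error_count=%llu status=0x%x\n",
			       i, s->error_count, s->status);
		slot = slot ? slot - 1 : ERROR_LOG_SLOTS - 1;
	}
}

int main(void)
{
	struct ctrl_model c;

	memset(&c, 0, sizeof(c));
	record_error(&c, 0x0280);	/* e.g. LBA out of range */
	record_error(&c, 0x0002);	/* e.g. invalid field */
	dump_log(&c);			/* newest (0x0002) prints first */
	return 0;
}

The kernel copies all 128 slots to the host regardless of how many are populated; the skip above only keeps the toy's output short.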
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 3c7b61ddb0d1..d94f25cde019 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -60,3 +60,13 @@ config NVME_TARGET_FCLOOP to test NVMe-FC transport interfaces. If unsure, say N. + +config NVME_TARGET_TCP + tristate "NVMe over Fabrics TCP target support" + depends on INET + depends on NVME_TARGET + help + This enables the NVMe TCP target support, which allows exporting NVMe + devices over TCP. + + If unsure, say N. diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile index 8118c93391c6..8c3ad0fb6860 100644 --- a/drivers/nvme/target/Makefile +++ b/drivers/nvme/target/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_NVME_TARGET_LOOP) += nvme-loop.o obj-$(CONFIG_NVME_TARGET_RDMA) += nvmet-rdma.o obj-$(CONFIG_NVME_TARGET_FC) += nvmet-fc.o obj-$(CONFIG_NVME_TARGET_FCLOOP) += nvme-fcloop.o +obj-$(CONFIG_NVME_TARGET_TCP) += nvmet-tcp.o nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \ discovery.o io-cmd-file.o io-cmd-bdev.o @@ -12,3 +13,4 @@ nvme-loop-y += loop.o nvmet-rdma-y += rdma.o nvmet-fc-y += fc.o nvme-fcloop-y += fcloop.o +nvmet-tcp-y += tcp.o diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 1179f6314323..11baeb14c388 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -19,19 +19,6 @@ #include <asm/unaligned.h> #include "nvmet.h" -/* - * This helper allows us to clear the AEN based on the RAE bit, - * Please use this helper when processing the log pages which are - * associated with the AEN. - */ -static inline void nvmet_clear_aen(struct nvmet_req *req, u32 aen_bit) -{ - int rae = le32_to_cpu(req->cmd->common.cdw10[0]) & 1 << 15; - - if (!rae) - clear_bit(aen_bit, &req->sq->ctrl->aen_masked); -} - u32 nvmet_get_log_page_len(struct nvme_command *cmd) { u32 len = le16_to_cpu(cmd->get_log_page.numdu); @@ -50,6 +37,34 @@ static void nvmet_execute_get_log_page_noop(struct nvmet_req *req) nvmet_req_complete(req, nvmet_zero_sgl(req, 0, req->data_len)); } +static void nvmet_execute_get_log_page_error(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + u16 status = NVME_SC_SUCCESS; + unsigned long flags; + off_t offset = 0; + u64 slot; + u64 i; + + spin_lock_irqsave(&ctrl->error_lock, flags); + slot = ctrl->err_counter % NVMET_ERROR_LOG_SLOTS; + + for (i = 0; i < NVMET_ERROR_LOG_SLOTS; i++) { + status = nvmet_copy_to_sgl(req, offset, &ctrl->slots[slot], + sizeof(struct nvme_error_slot)); + if (status) + break; + + if (slot == 0) + slot = NVMET_ERROR_LOG_SLOTS - 1; + else + slot--; + offset += sizeof(struct nvme_error_slot); + } + spin_unlock_irqrestore(&ctrl->error_lock, flags); + nvmet_req_complete(req, status); +} + static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, struct nvme_smart_log *slog) { @@ -60,6 +75,7 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, if (!ns) { pr_err("Could not find namespace id : %d\n", le32_to_cpu(req->cmd->get_log_page.nsid)); + req->error_loc = offsetof(struct nvme_rw_command, nsid); return NVME_SC_INVALID_NS; } @@ -119,6 +135,7 @@ static void nvmet_execute_get_log_page_smart(struct nvmet_req *req) { struct nvme_smart_log *log; u16 status = NVME_SC_INTERNAL; + unsigned long flags; if (req->data_len != sizeof(*log)) goto out; @@ -134,6 +151,11 @@ static void nvmet_execute_get_log_page_smart(struct nvmet_req *req) if (status) goto out_free_log; + spin_lock_irqsave(&req->sq->ctrl->error_lock, flags); + 
put_unaligned_le64(req->sq->ctrl->err_counter, + &log->num_err_log_entries); + spin_unlock_irqrestore(&req->sq->ctrl->error_lock, flags); + status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log)); out_free_log: kfree(log); @@ -189,7 +211,7 @@ static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req) if (!status) status = nvmet_zero_sgl(req, len, req->data_len - len); ctrl->nr_changed_ns = 0; - nvmet_clear_aen(req, NVME_AEN_CFG_NS_ATTR); + nvmet_clear_aen_bit(req, NVME_AEN_BIT_NS_ATTR); mutex_unlock(&ctrl->lock); out: nvmet_req_complete(req, status); @@ -252,7 +274,7 @@ static void nvmet_execute_get_log_page_ana(struct nvmet_req *req) hdr.chgcnt = cpu_to_le64(nvmet_ana_chgcnt); hdr.ngrps = cpu_to_le16(ngrps); - nvmet_clear_aen(req, NVME_AEN_CFG_ANA_CHANGE); + nvmet_clear_aen_bit(req, NVME_AEN_BIT_ANA_CHANGE); up_read(&nvmet_ana_sem); kfree(desc); @@ -304,7 +326,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) /* XXX: figure out what to do about RTD3R/RTD3 */ id->oaes = cpu_to_le32(NVMET_AEN_CFG_OPTIONAL); - id->ctratt = cpu_to_le32(1 << 0); + id->ctratt = cpu_to_le32(NVME_CTRL_ATTR_HID_128_BIT | + NVME_CTRL_ATTR_TBKAS); id->oacs = 0; @@ -392,6 +415,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) u16 status = 0; if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) { + req->error_loc = offsetof(struct nvme_identify, nsid); status = NVME_SC_INVALID_NS | NVME_SC_DNR; goto out; } @@ -512,6 +536,7 @@ static void nvmet_execute_identify_desclist(struct nvmet_req *req) ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid); if (!ns) { + req->error_loc = offsetof(struct nvme_identify, nsid); status = NVME_SC_INVALID_NS | NVME_SC_DNR; goto out; } @@ -569,13 +594,15 @@ static u16 nvmet_write_protect_flush_sync(struct nvmet_req *req) static u16 nvmet_set_feat_write_protect(struct nvmet_req *req) { - u32 write_protect = le32_to_cpu(req->cmd->common.cdw10[1]); + u32 write_protect = le32_to_cpu(req->cmd->common.cdw11); struct nvmet_subsys *subsys = req->sq->ctrl->subsys; u16 status = NVME_SC_FEATURE_NOT_CHANGEABLE; req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->rw.nsid); - if (unlikely(!req->ns)) + if (unlikely(!req->ns)) { + req->error_loc = offsetof(struct nvme_common_command, nsid); return status; + } mutex_lock(&subsys->lock); switch (write_protect) { @@ -599,11 +626,36 @@ static u16 nvmet_set_feat_write_protect(struct nvmet_req *req) return status; } +u16 nvmet_set_feat_kato(struct nvmet_req *req) +{ + u32 val32 = le32_to_cpu(req->cmd->common.cdw11); + + req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000); + + nvmet_set_result(req, req->sq->ctrl->kato); + + return 0; +} + +u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask) +{ + u32 val32 = le32_to_cpu(req->cmd->common.cdw11); + + if (val32 & ~mask) { + req->error_loc = offsetof(struct nvme_common_command, cdw11); + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + + WRITE_ONCE(req->sq->ctrl->aen_enabled, val32); + nvmet_set_result(req, val32); + + return 0; +} + static void nvmet_execute_set_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = req->sq->ctrl->subsys; - u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); - u32 val32; + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); u16 status = 0; switch (cdw10 & 0xff) { @@ -612,19 +664,10 @@ static void nvmet_execute_set_features(struct nvmet_req *req) (subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16)); break; case NVME_FEAT_KATO: - val32 = le32_to_cpu(req->cmd->common.cdw10[1]); - req->sq->ctrl->kato = 
DIV_ROUND_UP(val32, 1000); - nvmet_set_result(req, req->sq->ctrl->kato); + status = nvmet_set_feat_kato(req); break; case NVME_FEAT_ASYNC_EVENT: - val32 = le32_to_cpu(req->cmd->common.cdw10[1]); - if (val32 & ~NVMET_AEN_CFG_ALL) { - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; - break; - } - - WRITE_ONCE(req->sq->ctrl->aen_enabled, val32); - nvmet_set_result(req, val32); + status = nvmet_set_feat_async_event(req, NVMET_AEN_CFG_ALL); break; case NVME_FEAT_HOST_ID: status = NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; @@ -633,6 +676,7 @@ static void nvmet_execute_set_features(struct nvmet_req *req) status = nvmet_set_feat_write_protect(req); break; default: + req->error_loc = offsetof(struct nvme_common_command, cdw10); status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; break; } @@ -646,9 +690,10 @@ static u16 nvmet_get_feat_write_protect(struct nvmet_req *req) u32 result; req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->common.nsid); - if (!req->ns) + if (!req->ns) { + req->error_loc = offsetof(struct nvme_common_command, nsid); return NVME_SC_INVALID_NS | NVME_SC_DNR; - + } mutex_lock(&subsys->lock); if (req->ns->readonly == true) result = NVME_NS_WRITE_PROTECT; @@ -660,10 +705,20 @@ static u16 nvmet_get_feat_write_protect(struct nvmet_req *req) return 0; } +void nvmet_get_feat_kato(struct nvmet_req *req) +{ + nvmet_set_result(req, req->sq->ctrl->kato * 1000); +} + +void nvmet_get_feat_async_event(struct nvmet_req *req) +{ + nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled)); +} + static void nvmet_execute_get_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = req->sq->ctrl->subsys; - u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); u16 status = 0; switch (cdw10 & 0xff) { @@ -689,7 +744,7 @@ static void nvmet_execute_get_features(struct nvmet_req *req) break; #endif case NVME_FEAT_ASYNC_EVENT: - nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled)); + nvmet_get_feat_async_event(req); break; case NVME_FEAT_VOLATILE_WC: nvmet_set_result(req, 1); @@ -699,11 +754,13 @@ static void nvmet_execute_get_features(struct nvmet_req *req) (subsys->max_qid-1) | ((subsys->max_qid-1) << 16)); break; case NVME_FEAT_KATO: - nvmet_set_result(req, req->sq->ctrl->kato * 1000); + nvmet_get_feat_kato(req); break; case NVME_FEAT_HOST_ID: /* need 128-bit host identifier flag */ - if (!(req->cmd->common.cdw10[1] & cpu_to_le32(1 << 0))) { + if (!(req->cmd->common.cdw11 & cpu_to_le32(1 << 0))) { + req->error_loc = + offsetof(struct nvme_common_command, cdw11); status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; break; } @@ -715,6 +772,8 @@ static void nvmet_execute_get_features(struct nvmet_req *req) status = nvmet_get_feat_write_protect(req); break; default: + req->error_loc = + offsetof(struct nvme_common_command, cdw10); status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; break; } @@ -722,7 +781,7 @@ static void nvmet_execute_get_features(struct nvmet_req *req) nvmet_req_complete(req, status); } -static void nvmet_execute_async_event(struct nvmet_req *req) +void nvmet_execute_async_event(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; @@ -738,7 +797,7 @@ static void nvmet_execute_async_event(struct nvmet_req *req) schedule_work(&ctrl->async_event_work); } -static void nvmet_execute_keep_alive(struct nvmet_req *req) +void nvmet_execute_keep_alive(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; @@ -764,13 +823,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) switch (cmd->get_log_page.lid) { case 
NVME_LOG_ERROR: - /* - * We currently never set the More bit in the status - * field, so all error log entries are invalid and can - * be zeroed out. This is called a minum viable - * implementation (TM) of this mandatory log page. - */ - req->execute = nvmet_execute_get_log_page_noop; + req->execute = nvmet_execute_get_log_page_error; return 0; case NVME_LOG_SMART: req->execute = nvmet_execute_get_log_page_smart; @@ -836,5 +889,6 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode, req->sq->qid); + req->error_loc = offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index d895579b6c5d..618bbd006544 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -25,12 +25,16 @@ static const struct config_item_type nvmet_host_type; static const struct config_item_type nvmet_subsys_type; +static LIST_HEAD(nvmet_ports_list); +struct list_head *nvmet_ports = &nvmet_ports_list; + static const struct nvmet_transport_name { u8 type; const char *name; } nvmet_transport_names[] = { { NVMF_TRTYPE_RDMA, "rdma" }, { NVMF_TRTYPE_FC, "fc" }, + { NVMF_TRTYPE_TCP, "tcp" }, { NVMF_TRTYPE_LOOP, "loop" }, }; @@ -150,7 +154,8 @@ CONFIGFS_ATTR(nvmet_, addr_traddr); static ssize_t nvmet_addr_treq_show(struct config_item *item, char *page) { - switch (to_nvmet_port(item)->disc_addr.treq) { + switch (to_nvmet_port(item)->disc_addr.treq & + NVME_TREQ_SECURE_CHANNEL_MASK) { case NVMF_TREQ_NOT_SPECIFIED: return sprintf(page, "not specified\n"); case NVMF_TREQ_REQUIRED: @@ -166,6 +171,7 @@ static ssize_t nvmet_addr_treq_store(struct config_item *item, const char *page, size_t count) { struct nvmet_port *port = to_nvmet_port(item); + u8 treq = port->disc_addr.treq & ~NVME_TREQ_SECURE_CHANNEL_MASK; if (port->enabled) { pr_err("Cannot modify address while enabled\n"); @@ -174,15 +180,16 @@ static ssize_t nvmet_addr_treq_store(struct config_item *item, } if (sysfs_streq(page, "not specified")) { - port->disc_addr.treq = NVMF_TREQ_NOT_SPECIFIED; + treq |= NVMF_TREQ_NOT_SPECIFIED; } else if (sysfs_streq(page, "required")) { - port->disc_addr.treq = NVMF_TREQ_REQUIRED; + treq |= NVMF_TREQ_REQUIRED; } else if (sysfs_streq(page, "not required")) { - port->disc_addr.treq = NVMF_TREQ_NOT_REQUIRED; + treq |= NVMF_TREQ_NOT_REQUIRED; } else { pr_err("Invalid value '%s' for treq\n", page); return -EINVAL; } + port->disc_addr.treq = treq; return count; } @@ -646,7 +653,8 @@ static int nvmet_port_subsys_allow_link(struct config_item *parent, } list_add_tail(&link->entry, &port->subsystems); - nvmet_genctr++; + nvmet_port_disc_changed(port, subsys); + up_write(&nvmet_config_sem); return 0; @@ -673,7 +681,8 @@ static void nvmet_port_subsys_drop_link(struct config_item *parent, found: list_del(&p->entry); - nvmet_genctr++; + nvmet_port_disc_changed(port, subsys); + if (list_empty(&port->subsystems)) nvmet_disable_port(port); up_write(&nvmet_config_sem); @@ -722,7 +731,8 @@ static int nvmet_allowed_hosts_allow_link(struct config_item *parent, goto out_free_link; } list_add_tail(&link->entry, &subsys->hosts); - nvmet_genctr++; + nvmet_subsys_disc_changed(subsys, host); + up_write(&nvmet_config_sem); return 0; out_free_link: @@ -748,7 +758,8 @@ static void nvmet_allowed_hosts_drop_link(struct config_item *parent, found: list_del(&p->entry); - nvmet_genctr++; + nvmet_subsys_disc_changed(subsys, host); + up_write(&nvmet_config_sem); kfree(p); } @@ 
-787,7 +798,11 @@ static ssize_t nvmet_subsys_attr_allow_any_host_store(struct config_item *item, goto out_unlock; } - subsys->allow_any_host = allow_any_host; + if (subsys->allow_any_host != allow_any_host) { + subsys->allow_any_host = allow_any_host; + nvmet_subsys_disc_changed(subsys, NULL); + } + out_unlock: up_write(&nvmet_config_sem); return ret ? ret : count; @@ -936,7 +951,7 @@ static ssize_t nvmet_referral_enable_store(struct config_item *item, if (enable) nvmet_referral_enable(parent, port); else - nvmet_referral_disable(port); + nvmet_referral_disable(parent, port); return count; inval: @@ -962,9 +977,10 @@ static struct configfs_attribute *nvmet_referral_attrs[] = { static void nvmet_referral_release(struct config_item *item) { + struct nvmet_port *parent = to_nvmet_port(item->ci_parent->ci_parent); struct nvmet_port *port = to_nvmet_port(item); - nvmet_referral_disable(port); + nvmet_referral_disable(parent, port); kfree(port); } @@ -1137,6 +1153,8 @@ static void nvmet_port_release(struct config_item *item) { struct nvmet_port *port = to_nvmet_port(item); + list_del(&port->global_entry); + kfree(port->ana_state); kfree(port); } @@ -1189,12 +1207,15 @@ static struct config_group *nvmet_ports_make(struct config_group *group, port->ana_state[i] = NVME_ANA_INACCESSIBLE; } + list_add(&port->global_entry, &nvmet_ports_list); + INIT_LIST_HEAD(&port->entry); INIT_LIST_HEAD(&port->subsystems); INIT_LIST_HEAD(&port->referrals); port->inline_data_size = -1; /* < 0 == let the transport choose */ port->disc_addr.portid = cpu_to_le16(portid); + port->disc_addr.treq = NVMF_TREQ_DISABLE_SQFLOW; config_group_init_type_name(&port->group, name, &nvmet_port_type); config_group_init_type_name(&port->subsys_group, diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index a5f9bbce863f..88d260f31835 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -45,28 +45,72 @@ u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1]; u64 nvmet_ana_chgcnt; DECLARE_RWSEM(nvmet_ana_sem); +inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno) +{ + u16 status; + + switch (errno) { + case -ENOSPC: + req->error_loc = offsetof(struct nvme_rw_command, length); + status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR; + break; + case -EREMOTEIO: + req->error_loc = offsetof(struct nvme_rw_command, slba); + status = NVME_SC_LBA_RANGE | NVME_SC_DNR; + break; + case -EOPNOTSUPP: + req->error_loc = offsetof(struct nvme_common_command, opcode); + switch (req->cmd->common.opcode) { + case nvme_cmd_dsm: + case nvme_cmd_write_zeroes: + status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR; + break; + default: + status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + break; + case -ENODATA: + req->error_loc = offsetof(struct nvme_rw_command, nsid); + status = NVME_SC_ACCESS_DENIED; + break; + case -EIO: + /* FALLTHRU */ + default: + req->error_loc = offsetof(struct nvme_common_command, opcode); + status = NVME_SC_INTERNAL | NVME_SC_DNR; + } + + return status; +} + static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, const char *subsysnqn); u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, size_t len) { - if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) + if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) { + req->error_loc = offsetof(struct nvme_common_command, dptr); return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + } return 0; } u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len) { - if 
(sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) + if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) { + req->error_loc = offsetof(struct nvme_common_command, dptr); return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + } return 0; } u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len) { - if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) + if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) { + req->error_loc = offsetof(struct nvme_common_command, dptr); return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + } return 0; } @@ -130,7 +174,7 @@ static void nvmet_async_event_work(struct work_struct *work) } } -static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, +void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, u8 event_info, u8 log_page) { struct nvmet_async_event *aen; @@ -150,13 +194,6 @@ static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, schedule_work(&ctrl->async_event_work); } -static bool nvmet_aen_disabled(struct nvmet_ctrl *ctrl, u32 aen) -{ - if (!(READ_ONCE(ctrl->aen_enabled) & aen)) - return true; - return test_and_set_bit(aen, &ctrl->aen_masked); -} - static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid) { u32 i; @@ -187,7 +224,7 @@ void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid) list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid)); - if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_NS_ATTR)) + if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_NS_ATTR)) continue; nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, NVME_AER_NOTICE_NS_CHANGED, @@ -204,7 +241,7 @@ void nvmet_send_ana_event(struct nvmet_subsys *subsys, list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { if (port && ctrl->port != port) continue; - if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_ANA_CHANGE)) + if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_ANA_CHANGE)) continue; nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, NVME_AER_NOTICE_ANA, NVME_LOG_ANA); @@ -299,6 +336,15 @@ static void nvmet_keep_alive_timer(struct work_struct *work) { struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work), struct nvmet_ctrl, ka_work); + bool cmd_seen = ctrl->cmd_seen; + + ctrl->cmd_seen = false; + if (cmd_seen) { + pr_debug("ctrl %d reschedule traffic based keep-alive timer\n", + ctrl->cntlid); + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); + return; + } pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n", ctrl->cntlid, ctrl->kato); @@ -595,26 +641,58 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) return ns; } -static void __nvmet_req_complete(struct nvmet_req *req, u16 status) +static void nvmet_update_sq_head(struct nvmet_req *req) { - u32 old_sqhd, new_sqhd; - u16 sqhd; - - if (status) - nvmet_set_status(req, status); - if (req->sq->size) { + u32 old_sqhd, new_sqhd; + do { old_sqhd = req->sq->sqhd; new_sqhd = (old_sqhd + 1) % req->sq->size; } while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) != old_sqhd); } - sqhd = req->sq->sqhd & 0x0000FFFF; - req->rsp->sq_head = cpu_to_le16(sqhd); + req->rsp->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF); +} + +static void nvmet_set_error(struct nvmet_req *req, u16 status) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_error_slot *new_error_slot; + unsigned long flags; + + req->rsp->status = cpu_to_le16(status << 1); + + if (!ctrl || req->error_loc == NVMET_NO_ERROR_LOC) + return; + + spin_lock_irqsave(&ctrl->error_lock, flags); 
+ ctrl->err_counter++; + new_error_slot = + &ctrl->slots[ctrl->err_counter % NVMET_ERROR_LOG_SLOTS]; + + new_error_slot->error_count = cpu_to_le64(ctrl->err_counter); + new_error_slot->sqid = cpu_to_le16(req->sq->qid); + new_error_slot->cmdid = cpu_to_le16(req->cmd->common.command_id); + new_error_slot->status_field = cpu_to_le16(status << 1); + new_error_slot->param_error_location = cpu_to_le16(req->error_loc); + new_error_slot->lba = cpu_to_le64(req->error_slba); + new_error_slot->nsid = req->cmd->common.nsid; + spin_unlock_irqrestore(&ctrl->error_lock, flags); + + /* set the more bit for this request */ + req->rsp->status |= cpu_to_le16(1 << 14); +} + +static void __nvmet_req_complete(struct nvmet_req *req, u16 status) +{ + if (!req->sq->sqhd_disabled) + nvmet_update_sq_head(req); req->rsp->sq_id = cpu_to_le16(req->sq->qid); req->rsp->command_id = req->cmd->common.command_id; + if (unlikely(status)) + nvmet_set_error(req, status); if (req->ns) nvmet_put_namespace(req->ns); req->ops->queue_response(req); @@ -735,14 +813,20 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req) return ret; req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid); - if (unlikely(!req->ns)) + if (unlikely(!req->ns)) { + req->error_loc = offsetof(struct nvme_common_command, nsid); return NVME_SC_INVALID_NS | NVME_SC_DNR; + } ret = nvmet_check_ana_state(req->port, req->ns); - if (unlikely(ret)) + if (unlikely(ret)) { + req->error_loc = offsetof(struct nvme_common_command, nsid); return ret; + } ret = nvmet_io_cmd_check_access(req); - if (unlikely(ret)) + if (unlikely(ret)) { + req->error_loc = offsetof(struct nvme_common_command, nsid); return ret; + } if (req->ns->file) return nvmet_file_parse_io_cmd(req); @@ -763,10 +847,14 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, req->sg_cnt = 0; req->transfer_len = 0; req->rsp->status = 0; + req->rsp->sq_head = 0; req->ns = NULL; + req->error_loc = NVMET_NO_ERROR_LOC; + req->error_slba = 0; /* no support for fused commands yet */ if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { + req->error_loc = offsetof(struct nvme_common_command, flags); status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; goto fail; } @@ -777,6 +865,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, * byte aligned. 
*/ if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) { + req->error_loc = offsetof(struct nvme_common_command, flags); status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; goto fail; } @@ -801,6 +890,9 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, goto fail; } + if (sq->ctrl) + sq->ctrl->cmd_seen = true; + return true; fail: @@ -819,9 +911,10 @@ EXPORT_SYMBOL_GPL(nvmet_req_uninit); void nvmet_req_execute(struct nvmet_req *req) { - if (unlikely(req->data_len != req->transfer_len)) + if (unlikely(req->data_len != req->transfer_len)) { + req->error_loc = offsetof(struct nvme_common_command, dptr); nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR); - else + } else req->execute(req); } EXPORT_SYMBOL_GPL(nvmet_req_execute); @@ -1027,14 +1120,18 @@ u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd) return 0; } -static bool __nvmet_host_allowed(struct nvmet_subsys *subsys, - const char *hostnqn) +bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn) { struct nvmet_host_link *p; + lockdep_assert_held(&nvmet_config_sem); + if (subsys->allow_any_host) return true; + if (subsys->type == NVME_NQN_DISC) /* allow all access to disc subsys */ + return true; + list_for_each_entry(p, &subsys->hosts, entry) { if (!strcmp(nvmet_host_name(p->host), hostnqn)) return true; @@ -1043,30 +1140,6 @@ static bool __nvmet_host_allowed(struct nvmet_subsys *subsys, return false; } -static bool nvmet_host_discovery_allowed(struct nvmet_req *req, - const char *hostnqn) -{ - struct nvmet_subsys_link *s; - - list_for_each_entry(s, &req->port->subsystems, entry) { - if (__nvmet_host_allowed(s->subsys, hostnqn)) - return true; - } - - return false; -} - -bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, - const char *hostnqn) -{ - lockdep_assert_held(&nvmet_config_sem); - - if (subsys->type == NVME_NQN_DISC) - return nvmet_host_discovery_allowed(req, hostnqn); - else - return __nvmet_host_allowed(subsys, hostnqn); -} - /* * Note: ctrl->subsys->lock should be held when calling this function */ @@ -1117,7 +1190,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; down_read(&nvmet_config_sem); - if (!nvmet_host_allowed(req, subsys, hostnqn)) { + if (!nvmet_host_allowed(subsys, hostnqn)) { pr_info("connect by host %s for subsystem %s not allowed\n", hostnqn, subsysnqn); req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn); @@ -1175,31 +1248,20 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, ctrl->cntlid = ret; ctrl->ops = req->ops; - if (ctrl->subsys->type == NVME_NQN_DISC) { - /* Don't accept keep-alive timeout for discovery controllers */ - if (kato) { - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; - goto out_remove_ida; - } - /* - * Discovery controllers use some arbitrary high value in order - * to cleanup stale discovery sessions - * - * From the latest base diff RC: - * "The Keep Alive command is not supported by - * Discovery controllers. A transport may specify a - * fixed Discovery controller activity timeout value - * (e.g., 2 minutes). If no commands are received - * by a Discovery controller within that time - * period, the controller may perform the - * actions for Keep Alive Timer expiration". 
- */ - ctrl->kato = NVMET_DISC_KATO; - } else { - /* keep-alive timeout in seconds */ - ctrl->kato = DIV_ROUND_UP(kato, 1000); - } + /* + * Discovery controllers may use some arbitrary high value + * in order to cleanup stale discovery sessions + */ + if ((ctrl->subsys->type == NVME_NQN_DISC) && !kato) + kato = NVMET_DISC_KATO_MS; + + /* keep-alive timeout in seconds */ + ctrl->kato = DIV_ROUND_UP(kato, 1000); + + ctrl->err_counter = 0; + spin_lock_init(&ctrl->error_lock); + nvmet_start_keep_alive_timer(ctrl); mutex_lock(&subsys->lock); @@ -1210,8 +1272,6 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, *ctrlp = ctrl; return 0; -out_remove_ida: - ida_simple_remove(&cntlid_ida, ctrl->cntlid); out_free_sqs: kfree(ctrl->sqs); out_free_cqs: diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index bc0aa0bf1543..d2cb71a0b419 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -18,7 +18,65 @@ struct nvmet_subsys *nvmet_disc_subsys; -u64 nvmet_genctr; +static u64 nvmet_genctr; + +static void __nvmet_disc_changed(struct nvmet_port *port, + struct nvmet_ctrl *ctrl) +{ + if (ctrl->port != port) + return; + + if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_DISC_CHANGE)) + return; + + nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, + NVME_AER_NOTICE_DISC_CHANGED, NVME_LOG_DISC); +} + +void nvmet_port_disc_changed(struct nvmet_port *port, + struct nvmet_subsys *subsys) +{ + struct nvmet_ctrl *ctrl; + + nvmet_genctr++; + + list_for_each_entry(ctrl, &nvmet_disc_subsys->ctrls, subsys_entry) { + if (subsys && !nvmet_host_allowed(subsys, ctrl->hostnqn)) + continue; + + __nvmet_disc_changed(port, ctrl); + } +} + +static void __nvmet_subsys_disc_changed(struct nvmet_port *port, + struct nvmet_subsys *subsys, + struct nvmet_host *host) +{ + struct nvmet_ctrl *ctrl; + + list_for_each_entry(ctrl, &nvmet_disc_subsys->ctrls, subsys_entry) { + if (host && strcmp(nvmet_host_name(host), ctrl->hostnqn)) + continue; + + __nvmet_disc_changed(port, ctrl); + } +} + +void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys, + struct nvmet_host *host) +{ + struct nvmet_port *port; + struct nvmet_subsys_link *s; + + nvmet_genctr++; + + list_for_each_entry(port, nvmet_ports, global_entry) + list_for_each_entry(s, &port->subsystems, entry) { + if (s->subsys != subsys) + continue; + __nvmet_subsys_disc_changed(port, subsys, host); + } +} void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port) { @@ -26,18 +84,18 @@ void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port) if (list_empty(&port->entry)) { list_add_tail(&port->entry, &parent->referrals); port->enabled = true; - nvmet_genctr++; + nvmet_port_disc_changed(parent, NULL); } up_write(&nvmet_config_sem); } -void nvmet_referral_disable(struct nvmet_port *port) +void nvmet_referral_disable(struct nvmet_port *parent, struct nvmet_port *port) { down_write(&nvmet_config_sem); if (!list_empty(&port->entry)) { port->enabled = false; list_del_init(&port->entry); - nvmet_genctr++; + nvmet_port_disc_changed(parent, NULL); } up_write(&nvmet_config_sem); } @@ -107,7 +165,7 @@ static void nvmet_execute_get_disc_log_page(struct nvmet_req *req) down_read(&nvmet_config_sem); list_for_each_entry(p, &req->port->subsystems, entry) { - if (!nvmet_host_allowed(req, p->subsys, ctrl->hostnqn)) + if (!nvmet_host_allowed(p->subsys, ctrl->hostnqn)) continue; if (residual_len >= entry_size) { char traddr[NVMF_TRADDR_SIZE]; @@ -136,6 +194,8 @@ static void 
nvmet_execute_get_disc_log_page(struct nvmet_req *req) hdr->numrec = cpu_to_le64(numrec); hdr->recfmt = cpu_to_le16(0); + nvmet_clear_aen_bit(req, NVME_AEN_BIT_DISC_CHANGE); + up_read(&nvmet_config_sem); status = nvmet_copy_to_sgl(req, 0, hdr, data_len); @@ -174,6 +234,8 @@ static void nvmet_execute_identify_disc_ctrl(struct nvmet_req *req) if (req->port->inline_data_size) id->sgls |= cpu_to_le32(1 << 20); + id->oaes = cpu_to_le32(NVMET_DISC_AEN_CFG_OPTIONAL); + strlcpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); @@ -183,6 +245,51 @@ out: nvmet_req_complete(req, status); } +static void nvmet_execute_disc_set_features(struct nvmet_req *req) +{ + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); + u16 stat; + + switch (cdw10 & 0xff) { + case NVME_FEAT_KATO: + stat = nvmet_set_feat_kato(req); + break; + case NVME_FEAT_ASYNC_EVENT: + stat = nvmet_set_feat_async_event(req, + NVMET_DISC_AEN_CFG_OPTIONAL); + break; + default: + req->error_loc = + offsetof(struct nvme_common_command, cdw10); + stat = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + nvmet_req_complete(req, stat); +} + +static void nvmet_execute_disc_get_features(struct nvmet_req *req) +{ + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); + u16 stat = 0; + + switch (cdw10 & 0xff) { + case NVME_FEAT_KATO: + nvmet_get_feat_kato(req); + break; + case NVME_FEAT_ASYNC_EVENT: + nvmet_get_feat_async_event(req); + break; + default: + req->error_loc = + offsetof(struct nvme_common_command, cdw10); + stat = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + nvmet_req_complete(req, stat); +} + u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; @@ -190,10 +297,28 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { pr_err("got cmd %d while not ready\n", cmd->common.opcode); + req->error_loc = + offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } switch (cmd->common.opcode) { + case nvme_admin_set_features: + req->execute = nvmet_execute_disc_set_features; + req->data_len = 0; + return 0; + case nvme_admin_get_features: + req->execute = nvmet_execute_disc_get_features; + req->data_len = 0; + return 0; + case nvme_admin_async_event: + req->execute = nvmet_execute_async_event; + req->data_len = 0; + return 0; + case nvme_admin_keep_alive: + req->execute = nvmet_execute_keep_alive; + req->data_len = 0; + return 0; case nvme_admin_get_log_page: req->data_len = nvmet_get_log_page_len(cmd); @@ -204,6 +329,8 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) default: pr_err("unsupported get_log_page lid %d\n", cmd->get_log_page.lid); + req->error_loc = + offsetof(struct nvme_get_log_page_command, lid); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } case nvme_admin_identify: @@ -216,10 +343,12 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) default: pr_err("unsupported identify cns %d\n", cmd->identify.cns); + req->error_loc = offsetof(struct nvme_identify, cns); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } default: pr_err("unhandled cmd %d\n", cmd->common.opcode); + req->error_loc = offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index d84ae004cb85..6cf1fd9eb32e 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -17,23 +17,26 @@ static void 
nvmet_execute_prop_set(struct nvmet_req *req) { + u64 val = le64_to_cpu(req->cmd->prop_set.value); u16 status = 0; - if (!(req->cmd->prop_set.attrib & 1)) { - u64 val = le64_to_cpu(req->cmd->prop_set.value); - - switch (le32_to_cpu(req->cmd->prop_set.offset)) { - case NVME_REG_CC: - nvmet_update_cc(req->sq->ctrl, val); - break; - default: - status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; - break; - } - } else { + if (req->cmd->prop_set.attrib & 1) { + req->error_loc = + offsetof(struct nvmf_property_set_command, attrib); status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto out; } + switch (le32_to_cpu(req->cmd->prop_set.offset)) { + case NVME_REG_CC: + nvmet_update_cc(req->sq->ctrl, val); + break; + default: + req->error_loc = + offsetof(struct nvmf_property_set_command, offset); + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } +out: nvmet_req_complete(req, status); } @@ -69,6 +72,14 @@ static void nvmet_execute_prop_get(struct nvmet_req *req) } } + if (status && req->cmd->prop_get.attrib & 1) { + req->error_loc = + offsetof(struct nvmf_property_get_command, offset); + } else { + req->error_loc = + offsetof(struct nvmf_property_get_command, attrib); + } + req->rsp->result.u64 = cpu_to_le64(val); nvmet_req_complete(req, status); } @@ -89,6 +100,7 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req) default: pr_err("received unknown capsule type 0x%x\n", cmd->fabrics.fctype); + req->error_loc = offsetof(struct nvmf_common_command, fctype); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } @@ -105,16 +117,34 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) old = cmpxchg(&req->sq->ctrl, NULL, ctrl); if (old) { pr_warn("queue already connected!\n"); + req->error_loc = offsetof(struct nvmf_connect_command, opcode); return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; } if (!sqsize) { pr_warn("queue size zero!\n"); + req->error_loc = offsetof(struct nvmf_connect_command, sqsize); return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; } /* note: convert queue size from 0's-based value to 1's-based value */ nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1); nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1); + + if (c->cattr & NVME_CONNECT_DISABLE_SQFLOW) { + req->sq->sqhd_disabled = true; + req->rsp->sq_head = cpu_to_le16(0xffff); + } + + if (ctrl->ops->install_queue) { + u16 ret = ctrl->ops->install_queue(req->sq); + + if (ret) { + pr_err("failed to install queue %d cntlid %d ret %x\n", + qid, ret, ctrl->cntlid); + return ret; + } + } + return 0; } @@ -141,6 +171,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) if (c->recfmt != 0) { pr_warn("invalid connect version (%d).\n", le16_to_cpu(c->recfmt)); + req->error_loc = offsetof(struct nvmf_connect_command, recfmt); status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR; goto out; } @@ -155,8 +186,13 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req, le32_to_cpu(c->kato), &ctrl); - if (status) + if (status) { + if (status == (NVME_SC_INVALID_FIELD | NVME_SC_DNR)) + req->error_loc = + offsetof(struct nvme_common_command, opcode); goto out; + } + uuid_copy(&ctrl->hostid, &d->hostid); status = nvmet_install_queue(ctrl, req); @@ -243,11 +279,13 @@ u16 nvmet_parse_connect_cmd(struct nvmet_req *req) if (cmd->common.opcode != nvme_fabrics_command) { pr_err("invalid command 0x%x on unconnected queue.\n", cmd->fabrics.opcode); + req->error_loc = offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } if (cmd->fabrics.fctype 
!= nvme_fabrics_type_connect) { pr_err("invalid capsule type 0x%x on unconnected queue.\n", cmd->fabrics.fctype); + req->error_loc = offsetof(struct nvmf_common_command, fctype); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 409081a03b24..f98f5c5bea26 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -86,8 +86,6 @@ struct nvmet_fc_fcp_iod { spinlock_t flock; struct nvmet_req req; - struct work_struct work; - struct work_struct done_work; struct work_struct defer_work; struct nvmet_fc_tgtport *tgtport; @@ -134,7 +132,6 @@ struct nvmet_fc_tgt_queue { u16 sqsize; u16 ersp_ratio; __le16 sqhd; - int cpu; atomic_t connected; atomic_t sqtail; atomic_t zrspcnt; @@ -232,8 +229,6 @@ static LIST_HEAD(nvmet_fc_portentry_list); static void nvmet_fc_handle_ls_rqst_work(struct work_struct *work); -static void nvmet_fc_handle_fcp_rqst_work(struct work_struct *work); -static void nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work); static void nvmet_fc_fcp_rqst_op_defer_work(struct work_struct *work); static void nvmet_fc_tgt_a_put(struct nvmet_fc_tgt_assoc *assoc); static int nvmet_fc_tgt_a_get(struct nvmet_fc_tgt_assoc *assoc); @@ -438,8 +433,6 @@ nvmet_fc_prep_fcp_iodlist(struct nvmet_fc_tgtport *tgtport, int i; for (i = 0; i < queue->sqsize; fod++, i++) { - INIT_WORK(&fod->work, nvmet_fc_handle_fcp_rqst_work); - INIT_WORK(&fod->done_work, nvmet_fc_fcp_rqst_op_done_work); INIT_WORK(&fod->defer_work, nvmet_fc_fcp_rqst_op_defer_work); fod->tgtport = tgtport; fod->queue = queue; @@ -517,10 +510,7 @@ nvmet_fc_queue_fcp_req(struct nvmet_fc_tgtport *tgtport, fcpreq->hwqid = queue->qid ? ((queue->qid - 1) % tgtport->ops->max_hw_queues) : 0; - if (tgtport->ops->target_features & NVMET_FCTGTFEAT_CMD_IN_ISR) - queue_work_on(queue->cpu, queue->work_q, &fod->work); - else - nvmet_fc_handle_fcp_rqst(tgtport, fod); + nvmet_fc_handle_fcp_rqst(tgtport, fod); } static void @@ -599,30 +589,6 @@ nvmet_fc_free_fcp_iod(struct nvmet_fc_tgt_queue *queue, queue_work(queue->work_q, &fod->defer_work); } -static int -nvmet_fc_queue_to_cpu(struct nvmet_fc_tgtport *tgtport, int qid) -{ - int cpu, idx, cnt; - - if (tgtport->ops->max_hw_queues == 1) - return WORK_CPU_UNBOUND; - - /* Simple cpu selection based on qid modulo active cpu count */ - idx = !qid ? 
0 : (qid - 1) % num_active_cpus(); - - /* find the n'th active cpu */ - for (cpu = 0, cnt = 0; ; ) { - if (cpu_active(cpu)) { - if (cnt == idx) - break; - cnt++; - } - cpu = (cpu + 1) % num_possible_cpus(); - } - - return cpu; -} - static struct nvmet_fc_tgt_queue * nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, u16 qid, u16 sqsize) @@ -653,7 +619,6 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, queue->qid = qid; queue->sqsize = sqsize; queue->assoc = assoc; - queue->cpu = nvmet_fc_queue_to_cpu(assoc->tgtport, qid); INIT_LIST_HEAD(&queue->fod_list); INIT_LIST_HEAD(&queue->avail_defer_list); INIT_LIST_HEAD(&queue->pending_cmd_list); @@ -2146,25 +2111,11 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) } static void -nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work) -{ - struct nvmet_fc_fcp_iod *fod = - container_of(work, struct nvmet_fc_fcp_iod, done_work); - - nvmet_fc_fod_op_done(fod); -} - -static void nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq) { struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private; - struct nvmet_fc_tgt_queue *queue = fod->queue; - if (fod->tgtport->ops->target_features & NVMET_FCTGTFEAT_OPDONE_IN_ISR) - /* context switch so completion is not in ISR context */ - queue_work_on(queue->cpu, queue->work_q, &fod->done_work); - else - nvmet_fc_fod_op_done(fod); + nvmet_fc_fod_op_done(fod); } /* @@ -2332,19 +2283,6 @@ transport_error: nvmet_fc_abort_op(tgtport, fod); } -/* - * Actual processing routine for received FC-NVME LS Requests from the LLD - */ -static void -nvmet_fc_handle_fcp_rqst_work(struct work_struct *work) -{ - struct nvmet_fc_fcp_iod *fod = - container_of(work, struct nvmet_fc_fcp_iod, work); - struct nvmet_fc_tgtport *tgtport = fod->tgtport; - - nvmet_fc_handle_fcp_rqst(tgtport, fod); -} - /** * nvmet_fc_rcv_fcp_req - transport entry point called by an LLDD * upon the reception of a NVME FCP CMD IU. diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index c1ec3475a140..b6d030d3259f 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -44,13 +44,69 @@ void nvmet_bdev_ns_disable(struct nvmet_ns *ns) } } +static u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts) +{ + u16 status = NVME_SC_SUCCESS; + + if (likely(blk_sts == BLK_STS_OK)) + return status; + /* + * Right now there exists M : 1 mapping between block layer error + * to the NVMe status code (see nvme_error_status()). For consistency, + * when we reverse map we use most appropriate NVMe Status code from + * the group of the NVMe staus codes used in the nvme_error_status(). 
+ */ + switch (blk_sts) { + case BLK_STS_NOSPC: + status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR; + req->error_loc = offsetof(struct nvme_rw_command, length); + break; + case BLK_STS_TARGET: + status = NVME_SC_LBA_RANGE | NVME_SC_DNR; + req->error_loc = offsetof(struct nvme_rw_command, slba); + break; + case BLK_STS_NOTSUPP: + req->error_loc = offsetof(struct nvme_common_command, opcode); + switch (req->cmd->common.opcode) { + case nvme_cmd_dsm: + case nvme_cmd_write_zeroes: + status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR; + break; + default: + status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + break; + case BLK_STS_MEDIUM: + status = NVME_SC_ACCESS_DENIED; + req->error_loc = offsetof(struct nvme_rw_command, nsid); + break; + case BLK_STS_IOERR: + /* fallthru */ + default: + status = NVME_SC_INTERNAL | NVME_SC_DNR; + req->error_loc = offsetof(struct nvme_common_command, opcode); + } + + switch (req->cmd->common.opcode) { + case nvme_cmd_read: + case nvme_cmd_write: + req->error_slba = le64_to_cpu(req->cmd->rw.slba); + break; + case nvme_cmd_write_zeroes: + req->error_slba = + le64_to_cpu(req->cmd->write_zeroes.slba); + break; + default: + req->error_slba = 0; + } + return status; +} + static void nvmet_bio_done(struct bio *bio) { struct nvmet_req *req = bio->bi_private; - nvmet_req_complete(req, - bio->bi_status ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); - + nvmet_req_complete(req, blk_to_nvme_status(req, bio->bi_status)); if (bio != &req->b.inline_bio) bio_put(bio); } @@ -61,7 +117,6 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) struct bio *bio; struct scatterlist *sg; sector_t sector; - blk_qc_t cookie; int op, op_flags = 0, i; if (!req->sg_cnt) { @@ -114,9 +169,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) sg_cnt--; } - cookie = submit_bio(bio); - - blk_poll(bdev_get_queue(req->ns->bdev), cookie); + submit_bio(bio); } static void nvmet_bdev_execute_flush(struct nvmet_req *req) @@ -139,18 +192,21 @@ u16 nvmet_bdev_flush(struct nvmet_req *req) return 0; } -static u16 nvmet_bdev_discard_range(struct nvmet_ns *ns, +static u16 nvmet_bdev_discard_range(struct nvmet_req *req, struct nvme_dsm_range *range, struct bio **bio) { + struct nvmet_ns *ns = req->ns; int ret; ret = __blkdev_issue_discard(ns->bdev, le64_to_cpu(range->slba) << (ns->blksize_shift - 9), le32_to_cpu(range->nlb) << (ns->blksize_shift - 9), GFP_KERNEL, 0, bio); - if (ret && ret != -EOPNOTSUPP) - return NVME_SC_INTERNAL | NVME_SC_DNR; - return 0; + + if (ret) + req->error_slba = le64_to_cpu(range->slba); + + return blk_to_nvme_status(req, errno_to_blk_status(ret)); } static void nvmet_bdev_execute_discard(struct nvmet_req *req) @@ -166,7 +222,7 @@ static void nvmet_bdev_execute_discard(struct nvmet_req *req) if (status) break; - status = nvmet_bdev_discard_range(req->ns, &range, &bio); + status = nvmet_bdev_discard_range(req, &range, &bio); if (status) break; } @@ -207,16 +263,16 @@ static void nvmet_bdev_execute_write_zeroes(struct nvmet_req *req) u16 status = NVME_SC_SUCCESS; sector_t sector; sector_t nr_sector; + int ret; sector = le64_to_cpu(write_zeroes->slba) << (req->ns->blksize_shift - 9); nr_sector = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) << (req->ns->blksize_shift - 9)); - if (__blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector, - GFP_KERNEL, &bio, 0)) - status = NVME_SC_INTERNAL | NVME_SC_DNR; - + ret = __blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector, + GFP_KERNEL, &bio, 0); + status = blk_to_nvme_status(req, errno_to_blk_status(ret)); if (bio) { bio->bi_private = 
req; bio->bi_end_io = nvmet_bio_done; @@ -251,6 +307,7 @@ u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req) default: pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode, req->sq->qid); + req->error_loc = offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } } diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 01feebec29ea..517522305e5c 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -83,17 +83,16 @@ static void nvmet_file_init_bvec(struct bio_vec *bv, struct sg_page_iter *iter) } static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, - unsigned long nr_segs, size_t count) + unsigned long nr_segs, size_t count, int ki_flags) { struct kiocb *iocb = &req->f.iocb; ssize_t (*call_iter)(struct kiocb *iocb, struct iov_iter *iter); struct iov_iter iter; - int ki_flags = 0, rw; - ssize_t ret; + int rw; if (req->cmd->rw.opcode == nvme_cmd_write) { if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) - ki_flags = IOCB_DSYNC; + ki_flags |= IOCB_DSYNC; call_iter = req->ns->file->f_op->write_iter; rw = WRITE; } else { @@ -107,17 +106,13 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, iocb->ki_filp = req->ns->file; iocb->ki_flags = ki_flags | iocb_flags(req->ns->file); - ret = call_iter(iocb, &iter); - - if (ret != -EIOCBQUEUED && iocb->ki_complete) - iocb->ki_complete(iocb, ret, 0); - - return ret; + return call_iter(iocb, &iter); } static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2) { struct nvmet_req *req = container_of(iocb, struct nvmet_req, f.iocb); + u16 status = NVME_SC_SUCCESS; if (req->f.bvec != req->inline_bvec) { if (likely(req->f.mpool_alloc == false)) @@ -126,11 +121,12 @@ static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2) mempool_free(req->f.bvec, req->ns->bvec_pool); } - nvmet_req_complete(req, ret != req->data_len ? 
- NVME_SC_INTERNAL | NVME_SC_DNR : 0); + if (unlikely(ret != req->data_len)) + status = errno_to_nvme_status(req, ret); + nvmet_req_complete(req, status); } -static void nvmet_file_execute_rw(struct nvmet_req *req) +static bool nvmet_file_execute_io(struct nvmet_req *req, int ki_flags) { ssize_t nr_bvec = DIV_ROUND_UP(req->data_len, PAGE_SIZE); struct sg_page_iter sg_pg_iter; @@ -140,30 +136,14 @@ static void nvmet_file_execute_rw(struct nvmet_req *req) ssize_t ret = 0; loff_t pos; - if (!req->sg_cnt || !nr_bvec) { - nvmet_req_complete(req, 0); - return; - } + + if (req->f.mpool_alloc && nr_bvec > NVMET_MAX_MPOOL_BVEC) + is_sync = true; pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift; if (unlikely(pos + req->data_len > req->ns->size)) { - nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR); - return; - } - - if (nr_bvec > NVMET_MAX_INLINE_BIOVEC) - req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), - GFP_KERNEL); - else - req->f.bvec = req->inline_bvec; - - req->f.mpool_alloc = false; - if (unlikely(!req->f.bvec)) { - /* fallback under memory pressure */ - req->f.bvec = mempool_alloc(req->ns->bvec_pool, GFP_KERNEL); - req->f.mpool_alloc = true; - if (nr_bvec > NVMET_MAX_MPOOL_BVEC) - is_sync = true; + nvmet_req_complete(req, errno_to_nvme_status(req, -ENOSPC)); + return true; } memset(&req->f.iocb, 0, sizeof(struct kiocb)); @@ -177,9 +157,10 @@ static void nvmet_file_execute_rw(struct nvmet_req *req) if (unlikely(is_sync) && (nr_bvec - 1 == 0 || bv_cnt == NVMET_MAX_MPOOL_BVEC)) { - ret = nvmet_file_submit_bvec(req, pos, bv_cnt, len); + ret = nvmet_file_submit_bvec(req, pos, bv_cnt, len, 0); if (ret < 0) - goto out; + goto complete; + pos += len; bv_cnt = 0; len = 0; @@ -187,35 +168,95 @@ static void nvmet_file_execute_rw(struct nvmet_req *req) nr_bvec--; } - if (WARN_ON_ONCE(total_len != req->data_len)) + if (WARN_ON_ONCE(total_len != req->data_len)) { ret = -EIO; -out: - if (unlikely(is_sync || ret)) { - nvmet_file_io_done(&req->f.iocb, ret < 0 ? ret : total_len, 0); - return; + goto complete; + } + + if (unlikely(is_sync)) { + ret = total_len; + goto complete; } - req->f.iocb.ki_complete = nvmet_file_io_done; - nvmet_file_submit_bvec(req, pos, bv_cnt, total_len); + + /* + * A NULL ki_complete ask for synchronous execution, which we want + * for the IOCB_NOWAIT case. + */ + if (!(ki_flags & IOCB_NOWAIT)) + req->f.iocb.ki_complete = nvmet_file_io_done; + + ret = nvmet_file_submit_bvec(req, pos, bv_cnt, total_len, ki_flags); + + switch (ret) { + case -EIOCBQUEUED: + return true; + case -EAGAIN: + if (WARN_ON_ONCE(!(ki_flags & IOCB_NOWAIT))) + goto complete; + return false; + case -EOPNOTSUPP: + /* + * For file systems returning error -EOPNOTSUPP, handle + * IOCB_NOWAIT error case separately and retry without + * IOCB_NOWAIT. 
+ */ + if ((ki_flags & IOCB_NOWAIT)) + return false; + break; + } + +complete: + nvmet_file_io_done(&req->f.iocb, ret, 0); + return true; } static void nvmet_file_buffered_io_work(struct work_struct *w) { struct nvmet_req *req = container_of(w, struct nvmet_req, f.work); - nvmet_file_execute_rw(req); + nvmet_file_execute_io(req, 0); } -static void nvmet_file_execute_rw_buffered_io(struct nvmet_req *req) +static void nvmet_file_submit_buffered_io(struct nvmet_req *req) { INIT_WORK(&req->f.work, nvmet_file_buffered_io_work); queue_work(buffered_io_wq, &req->f.work); } +static void nvmet_file_execute_rw(struct nvmet_req *req) +{ + ssize_t nr_bvec = DIV_ROUND_UP(req->data_len, PAGE_SIZE); + + if (!req->sg_cnt || !nr_bvec) { + nvmet_req_complete(req, 0); + return; + } + + if (nr_bvec > NVMET_MAX_INLINE_BIOVEC) + req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), + GFP_KERNEL); + else + req->f.bvec = req->inline_bvec; + + if (unlikely(!req->f.bvec)) { + /* fallback under memory pressure */ + req->f.bvec = mempool_alloc(req->ns->bvec_pool, GFP_KERNEL); + req->f.mpool_alloc = true; + } else + req->f.mpool_alloc = false; + + if (req->ns->buffered_io) { + if (likely(!req->f.mpool_alloc) && + nvmet_file_execute_io(req, IOCB_NOWAIT)) + return; + nvmet_file_submit_buffered_io(req); + } else + nvmet_file_execute_io(req, 0); +} + u16 nvmet_file_flush(struct nvmet_req *req) { - if (vfs_fsync(req->ns->file, 1) < 0) - return NVME_SC_INTERNAL | NVME_SC_DNR; - return 0; + return errno_to_nvme_status(req, vfs_fsync(req->ns->file, 1)); } static void nvmet_file_flush_work(struct work_struct *w) @@ -236,30 +277,34 @@ static void nvmet_file_execute_discard(struct nvmet_req *req) int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; struct nvme_dsm_range range; loff_t offset, len; - u16 ret; + u16 status = 0; + int ret; int i; for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) { - ret = nvmet_copy_from_sgl(req, i * sizeof(range), &range, + status = nvmet_copy_from_sgl(req, i * sizeof(range), &range, sizeof(range)); - if (ret) + if (status) break; offset = le64_to_cpu(range.slba) << req->ns->blksize_shift; len = le32_to_cpu(range.nlb); len <<= req->ns->blksize_shift; if (offset + len > req->ns->size) { - ret = NVME_SC_LBA_RANGE | NVME_SC_DNR; + req->error_slba = le64_to_cpu(range.slba); + status = errno_to_nvme_status(req, -ENOSPC); break; } - if (vfs_fallocate(req->ns->file, mode, offset, len)) { - ret = NVME_SC_INTERNAL | NVME_SC_DNR; + ret = vfs_fallocate(req->ns->file, mode, offset, len); + if (ret) { + req->error_slba = le64_to_cpu(range.slba); + status = errno_to_nvme_status(req, ret); break; } } - nvmet_req_complete(req, ret); + nvmet_req_complete(req, status); } static void nvmet_file_dsm_work(struct work_struct *w) @@ -299,12 +344,12 @@ static void nvmet_file_write_zeroes_work(struct work_struct *w) req->ns->blksize_shift); if (unlikely(offset + len > req->ns->size)) { - nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR); + nvmet_req_complete(req, errno_to_nvme_status(req, -ENOSPC)); return; } ret = vfs_fallocate(req->ns->file, mode, offset, len); - nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); + nvmet_req_complete(req, ret < 0 ? 
errno_to_nvme_status(req, ret) : 0); } static void nvmet_file_execute_write_zeroes(struct nvmet_req *req) @@ -320,10 +365,7 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req) switch (cmd->common.opcode) { case nvme_cmd_read: case nvme_cmd_write: - if (req->ns->buffered_io) - req->execute = nvmet_file_execute_rw_buffered_io; - else - req->execute = nvmet_file_execute_rw; + req->execute = nvmet_file_execute_rw; req->data_len = nvmet_rw_len(req); return 0; case nvme_cmd_flush: @@ -342,6 +384,7 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req) default: pr_err("unhandled cmd for file ns %d on qid %d\n", cmd->common.opcode, req->sq->qid); + req->error_loc = offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; } } diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 9908082b32c4..4aac1b4a8112 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -345,7 +345,7 @@ static int nvme_loop_connect_io_queues(struct nvme_loop_ctrl *ctrl) int i, ret; for (i = 1; i < ctrl->ctrl.queue_count; i++) { - ret = nvmf_connect_io_queue(&ctrl->ctrl, i); + ret = nvmf_connect_io_queue(&ctrl->ctrl, i, false); if (ret) return ret; set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index c2b4d9ee6391..3e4719fdba85 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -30,12 +30,15 @@ #define NVMET_ASYNC_EVENTS 4 #define NVMET_ERROR_LOG_SLOTS 128 +#define NVMET_NO_ERROR_LOC ((u16)-1) /* * Supported optional AENs: */ #define NVMET_AEN_CFG_OPTIONAL \ (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_ANA_CHANGE) +#define NVMET_DISC_AEN_CFG_OPTIONAL \ + (NVME_AEN_CFG_DISC_CHANGE) /* * Plus mandatory SMART AENs (we'll never send them, but allow enabling them): @@ -104,6 +107,7 @@ struct nvmet_sq { u16 qid; u16 size; u32 sqhd; + bool sqhd_disabled; struct completion free_done; struct completion confirm_done; }; @@ -137,6 +141,7 @@ struct nvmet_port { struct list_head subsystems; struct config_group referrals_group; struct list_head referrals; + struct list_head global_entry; struct config_group ana_groups_group; struct nvmet_ana_group ana_default_group; enum nvme_ana_state *ana_state; @@ -163,6 +168,8 @@ struct nvmet_ctrl { struct nvmet_cq **cqs; struct nvmet_sq **sqs; + bool cmd_seen; + struct mutex lock; u64 cap; u32 cc; @@ -194,8 +201,12 @@ struct nvmet_ctrl { char subsysnqn[NVMF_NQN_FIELD_LEN]; char hostnqn[NVMF_NQN_FIELD_LEN]; - struct device *p2p_client; - struct radix_tree_root p2p_ns_map; + struct device *p2p_client; + struct radix_tree_root p2p_ns_map; + + spinlock_t error_lock; + u64 err_counter; + struct nvme_error_slot slots[NVMET_ERROR_LOG_SLOTS]; }; struct nvmet_subsys { @@ -273,6 +284,7 @@ struct nvmet_fabrics_ops { void (*delete_ctrl)(struct nvmet_ctrl *ctrl); void (*disc_traddr)(struct nvmet_req *req, struct nvmet_port *port, char *traddr); + u16 (*install_queue)(struct nvmet_sq *nvme_sq); }; #define NVMET_MAX_INLINE_BIOVEC 8 @@ -308,17 +320,14 @@ struct nvmet_req { void (*execute)(struct nvmet_req *req); const struct nvmet_fabrics_ops *ops; - struct pci_dev *p2p_dev; - struct device *p2p_client; + struct pci_dev *p2p_dev; + struct device *p2p_client; + u16 error_loc; + u64 error_slba; }; extern struct workqueue_struct *buffered_io_wq; -static inline void nvmet_set_status(struct nvmet_req *req, u16 status) -{ - req->rsp->status = cpu_to_le16(status << 1); -} - static inline void nvmet_set_result(struct nvmet_req *req, u32 result) { 
req->rsp->result.u32 = cpu_to_le32(result); @@ -340,6 +349,27 @@ struct nvmet_async_event { u8 log_page; }; +static inline void nvmet_clear_aen_bit(struct nvmet_req *req, u32 bn) +{ + int rae = le32_to_cpu(req->cmd->common.cdw10) & 1 << 15; + + if (!rae) + clear_bit(bn, &req->sq->ctrl->aen_masked); +} + +static inline bool nvmet_aen_bit_disabled(struct nvmet_ctrl *ctrl, u32 bn) +{ + if (!(READ_ONCE(ctrl->aen_enabled) & (1 << bn))) + return true; + return test_and_set_bit(bn, &ctrl->aen_masked); +} + +void nvmet_get_feat_kato(struct nvmet_req *req); +void nvmet_get_feat_async_event(struct nvmet_req *req); +u16 nvmet_set_feat_kato(struct nvmet_req *req); +u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask); +void nvmet_execute_async_event(struct nvmet_req *req); + u16 nvmet_parse_connect_cmd(struct nvmet_req *req); u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req); u16 nvmet_file_parse_io_cmd(struct nvmet_req *req); @@ -355,6 +385,8 @@ void nvmet_req_complete(struct nvmet_req *req, u16 status); int nvmet_req_alloc_sgl(struct nvmet_req *req); void nvmet_req_free_sgl(struct nvmet_req *req); +void nvmet_execute_keep_alive(struct nvmet_req *req); + void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, u16 size); void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid, @@ -395,7 +427,7 @@ int nvmet_enable_port(struct nvmet_port *port); void nvmet_disable_port(struct nvmet_port *port); void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port); -void nvmet_referral_disable(struct nvmet_port *port); +void nvmet_referral_disable(struct nvmet_port *parent, struct nvmet_port *port); u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, size_t len); @@ -405,6 +437,14 @@ u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len); u32 nvmet_get_log_page_len(struct nvme_command *cmd); +extern struct list_head *nvmet_ports; +void nvmet_port_disc_changed(struct nvmet_port *port, + struct nvmet_subsys *subsys); +void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys, + struct nvmet_host *host); +void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, + u8 event_info, u8 log_page); + #define NVMET_QUEUE_SIZE 1024 #define NVMET_NR_QUEUES 128 #define NVMET_MAX_CMD NVMET_QUEUE_SIZE @@ -425,7 +465,7 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd); #define NVMET_DEFAULT_ANA_GRPID 1 #define NVMET_KAS 10 -#define NVMET_DISC_KATO 120 +#define NVMET_DISC_KATO_MS 120000 int __init nvmet_init_configfs(void); void __exit nvmet_exit_configfs(void); @@ -434,15 +474,13 @@ int __init nvmet_init_discovery(void); void nvmet_exit_discovery(void); extern struct nvmet_subsys *nvmet_disc_subsys; -extern u64 nvmet_genctr; extern struct rw_semaphore nvmet_config_sem; extern u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1]; extern u64 nvmet_ana_chgcnt; extern struct rw_semaphore nvmet_ana_sem; -bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, - const char *hostnqn); +bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn); int nvmet_bdev_ns_enable(struct nvmet_ns *ns); int nvmet_file_ns_enable(struct nvmet_ns *ns); @@ -457,4 +495,6 @@ static inline u32 nvmet_rw_len(struct nvmet_req *req) return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) << req->ns->blksize_shift; } + +u16 errno_to_nvme_status(struct nvmet_req *req, int errno); #endif /* _NVMET_H */ diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 3f7971d3706d..a8d23eb80192 100644 --- 
a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -196,7 +196,7 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp) { unsigned long flags; - if (rsp->allocated) { + if (unlikely(rsp->allocated)) { kfree(rsp); return; } @@ -529,6 +529,7 @@ static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvmet_rdma_rsp *rsp = container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe); + struct nvmet_rdma_queue *queue = cq->cq_context; nvmet_rdma_release_rsp(rsp); @@ -536,7 +537,7 @@ static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) wc->status != IB_WC_WR_FLUSH_ERR)) { pr_err("SEND for CQE 0x%p failed with status %s (%d).\n", wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status); - nvmet_rdma_error_comp(rsp->queue); + nvmet_rdma_error_comp(queue); } } @@ -629,8 +630,11 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) u64 off = le64_to_cpu(sgl->addr); u32 len = le32_to_cpu(sgl->length); - if (!nvme_is_write(rsp->req.cmd)) + if (!nvme_is_write(rsp->req.cmd)) { + rsp->req.error_loc = + offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } if (off + len > rsp->queue->dev->inline_data_size) { pr_err("invalid inline data offset!\n"); @@ -695,6 +699,8 @@ static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) return nvmet_rdma_map_sgl_inline(rsp); default: pr_err("invalid SGL subtype: %#x\n", sgl->type); + rsp->req.error_loc = + offsetof(struct nvme_common_command, dptr); return NVME_SC_INVALID_FIELD | NVME_SC_DNR; } case NVME_KEY_SGL_FMT_DATA_DESC: @@ -705,10 +711,13 @@ static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) return nvmet_rdma_map_sgl_keyed(rsp, sgl, false); default: pr_err("invalid SGL subtype: %#x\n", sgl->type); + rsp->req.error_loc = + offsetof(struct nvme_common_command, dptr); return NVME_SC_INVALID_FIELD | NVME_SC_DNR; } default: pr_err("invalid SGL type: %#x\n", sgl->type); + rsp->req.error_loc = offsetof(struct nvme_common_command, dptr); return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR; } } diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c new file mode 100644 index 000000000000..44b37b202e39 --- /dev/null +++ b/drivers/nvme/target/tcp.c @@ -0,0 +1,1737 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe over Fabrics TCP target. + * Copyright (c) 2018 Lightbits Labs. All rights reserved. 
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/nvme-tcp.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <linux/inet.h> +#include <linux/llist.h> +#include <crypto/hash.h> + +#include "nvmet.h" + +#define NVMET_TCP_DEF_INLINE_DATA_SIZE (4 * PAGE_SIZE) + +#define NVMET_TCP_RECV_BUDGET 8 +#define NVMET_TCP_SEND_BUDGET 8 +#define NVMET_TCP_IO_WORK_BUDGET 64 + +enum nvmet_tcp_send_state { + NVMET_TCP_SEND_DATA_PDU, + NVMET_TCP_SEND_DATA, + NVMET_TCP_SEND_R2T, + NVMET_TCP_SEND_DDGST, + NVMET_TCP_SEND_RESPONSE +}; + +enum nvmet_tcp_recv_state { + NVMET_TCP_RECV_PDU, + NVMET_TCP_RECV_DATA, + NVMET_TCP_RECV_DDGST, + NVMET_TCP_RECV_ERR, +}; + +enum { + NVMET_TCP_F_INIT_FAILED = (1 << 0), +}; + +struct nvmet_tcp_cmd { + struct nvmet_tcp_queue *queue; + struct nvmet_req req; + + struct nvme_tcp_cmd_pdu *cmd_pdu; + struct nvme_tcp_rsp_pdu *rsp_pdu; + struct nvme_tcp_data_pdu *data_pdu; + struct nvme_tcp_r2t_pdu *r2t_pdu; + + u32 rbytes_done; + u32 wbytes_done; + + u32 pdu_len; + u32 pdu_recv; + int sg_idx; + int nr_mapped; + struct msghdr recv_msg; + struct kvec *iov; + u32 flags; + + struct list_head entry; + struct llist_node lentry; + + /* send state */ + u32 offset; + struct scatterlist *cur_sg; + enum nvmet_tcp_send_state state; + + __le32 exp_ddgst; + __le32 recv_ddgst; +}; + +enum nvmet_tcp_queue_state { + NVMET_TCP_Q_CONNECTING, + NVMET_TCP_Q_LIVE, + NVMET_TCP_Q_DISCONNECTING, +}; + +struct nvmet_tcp_queue { + struct socket *sock; + struct nvmet_tcp_port *port; + struct work_struct io_work; + int cpu; + struct nvmet_cq nvme_cq; + struct nvmet_sq nvme_sq; + + /* send state */ + struct nvmet_tcp_cmd *cmds; + unsigned int nr_cmds; + struct list_head free_list; + struct llist_head resp_list; + struct list_head resp_send_list; + int send_list_len; + struct nvmet_tcp_cmd *snd_cmd; + + /* recv state */ + int offset; + int left; + enum nvmet_tcp_recv_state rcv_state; + struct nvmet_tcp_cmd *cmd; + union nvme_tcp_pdu pdu; + + /* digest state */ + bool hdr_digest; + bool data_digest; + struct ahash_request *snd_hash; + struct ahash_request *rcv_hash; + + spinlock_t state_lock; + enum nvmet_tcp_queue_state state; + + struct sockaddr_storage sockaddr; + struct sockaddr_storage sockaddr_peer; + struct work_struct release_work; + + int idx; + struct list_head queue_list; + + struct nvmet_tcp_cmd connect; + + struct page_frag_cache pf_cache; + + void (*data_ready)(struct sock *); + void (*state_change)(struct sock *); + void (*write_space)(struct sock *); +}; + +struct nvmet_tcp_port { + struct socket *sock; + struct work_struct accept_work; + struct nvmet_port *nport; + struct sockaddr_storage addr; + int last_cpu; + void (*data_ready)(struct sock *); +}; + +static DEFINE_IDA(nvmet_tcp_queue_ida); +static LIST_HEAD(nvmet_tcp_queue_list); +static DEFINE_MUTEX(nvmet_tcp_queue_mutex); + +static struct workqueue_struct *nvmet_tcp_wq; +static struct nvmet_fabrics_ops nvmet_tcp_ops; +static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c); +static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd); + +static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue, + struct nvmet_tcp_cmd *cmd) +{ + return cmd - queue->cmds; +} + +static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd) +{ + return nvme_is_write(cmd->req.cmd) && + cmd->rbytes_done < cmd->req.transfer_len; +} + +static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd) +{ + return 
nvmet_tcp_has_data_in(cmd) && !cmd->req.rsp->status; +} + +static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd) +{ + return !nvme_is_write(cmd->req.cmd) && + cmd->req.transfer_len > 0 && + !cmd->req.rsp->status; +} + +static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd) +{ + return nvme_is_write(cmd->req.cmd) && cmd->pdu_len && + !cmd->rbytes_done; +} + +static inline struct nvmet_tcp_cmd * +nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmd; + + cmd = list_first_entry_or_null(&queue->free_list, + struct nvmet_tcp_cmd, entry); + if (!cmd) + return NULL; + list_del_init(&cmd->entry); + + cmd->rbytes_done = cmd->wbytes_done = 0; + cmd->pdu_len = 0; + cmd->pdu_recv = 0; + cmd->iov = NULL; + cmd->flags = 0; + return cmd; +} + +static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd) +{ + if (unlikely(cmd == &cmd->queue->connect)) + return; + + list_add_tail(&cmd->entry, &cmd->queue->free_list); +} + +static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue) +{ + return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0; +} + +static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue) +{ + return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0; +} + +static inline void nvmet_tcp_hdgst(struct ahash_request *hash, + void *pdu, size_t len) +{ + struct scatterlist sg; + + sg_init_one(&sg, pdu, len); + ahash_request_set_crypt(hash, &sg, pdu + len, len); + crypto_ahash_digest(hash); +} + +static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue, + void *pdu, size_t len) +{ + struct nvme_tcp_hdr *hdr = pdu; + __le32 recv_digest; + __le32 exp_digest; + + if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) { + pr_err("queue %d: header digest enabled but no header digest\n", + queue->idx); + return -EPROTO; + } + + recv_digest = *(__le32 *)(pdu + hdr->hlen); + nvmet_tcp_hdgst(queue->rcv_hash, pdu, len); + exp_digest = *(__le32 *)(pdu + hdr->hlen); + if (recv_digest != exp_digest) { + pr_err("queue %d: header digest error: recv %#x expected %#x\n", + queue->idx, le32_to_cpu(recv_digest), + le32_to_cpu(exp_digest)); + return -EPROTO; + } + + return 0; +} + +static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu) +{ + struct nvme_tcp_hdr *hdr = pdu; + u8 digest_len = nvmet_tcp_hdgst_len(queue); + u32 len; + + len = le32_to_cpu(hdr->plen) - hdr->hlen - + (hdr->flags & NVME_TCP_F_HDGST ? 
digest_len : 0); + + if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) { + pr_err("queue %d: data digest flag is cleared\n", queue->idx); + return -EPROTO; + } + + return 0; +} + +static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd) +{ + struct scatterlist *sg; + int i; + + sg = &cmd->req.sg[cmd->sg_idx]; + + for (i = 0; i < cmd->nr_mapped; i++) + kunmap(sg_page(&sg[i])); +} + +static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd) +{ + struct kvec *iov = cmd->iov; + struct scatterlist *sg; + u32 length, offset, sg_offset; + + length = cmd->pdu_len; + cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE); + offset = cmd->rbytes_done; + cmd->sg_idx = DIV_ROUND_UP(offset, PAGE_SIZE); + sg_offset = offset % PAGE_SIZE; + sg = &cmd->req.sg[cmd->sg_idx]; + + while (length) { + u32 iov_len = min_t(u32, length, sg->length - sg_offset); + + iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset; + iov->iov_len = iov_len; + + length -= iov_len; + sg = sg_next(sg); + iov++; + } + + iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov, + cmd->nr_mapped, cmd->pdu_len); +} + +static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue) +{ + queue->rcv_state = NVMET_TCP_RECV_ERR; + if (queue->nvme_sq.ctrl) + nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl); + else + kernel_sock_shutdown(queue->sock, SHUT_RDWR); +} + +static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd) +{ + struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl; + u32 len = le32_to_cpu(sgl->length); + + if (!cmd->req.data_len) + return 0; + + if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) | + NVME_SGL_FMT_OFFSET)) { + if (!nvme_is_write(cmd->req.cmd)) + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + + if (len > cmd->req.port->inline_data_size) + return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR; + cmd->pdu_len = len; + } + cmd->req.transfer_len += len; + + cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt); + if (!cmd->req.sg) + return NVME_SC_INTERNAL; + cmd->cur_sg = cmd->req.sg; + + if (nvmet_tcp_has_data_in(cmd)) { + cmd->iov = kmalloc_array(cmd->req.sg_cnt, + sizeof(*cmd->iov), GFP_KERNEL); + if (!cmd->iov) + goto err; + } + + return 0; +err: + sgl_free(cmd->req.sg); + return NVME_SC_INTERNAL; +} + +static void nvmet_tcp_ddgst(struct ahash_request *hash, + struct nvmet_tcp_cmd *cmd) +{ + ahash_request_set_crypt(hash, cmd->req.sg, + (void *)&cmd->exp_ddgst, cmd->req.transfer_len); + crypto_ahash_digest(hash); +} + +static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd) +{ + struct nvme_tcp_data_pdu *pdu = cmd->data_pdu; + struct nvmet_tcp_queue *queue = cmd->queue; + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue); + + cmd->offset = 0; + cmd->state = NVMET_TCP_SEND_DATA_PDU; + + pdu->hdr.type = nvme_tcp_c2h_data; + pdu->hdr.flags = NVME_TCP_F_DATA_LAST; + pdu->hdr.hlen = sizeof(*pdu); + pdu->hdr.pdo = pdu->hdr.hlen + hdgst; + pdu->hdr.plen = + cpu_to_le32(pdu->hdr.hlen + hdgst + + cmd->req.transfer_len + ddgst); + pdu->command_id = cmd->req.rsp->command_id; + pdu->data_length = cpu_to_le32(cmd->req.transfer_len); + pdu->data_offset = cpu_to_le32(cmd->wbytes_done); + + if (queue->data_digest) { + pdu->hdr.flags |= NVME_TCP_F_DDGST; + nvmet_tcp_ddgst(queue->snd_hash, cmd); + } + + if (cmd->queue->hdr_digest) { + pdu->hdr.flags |= NVME_TCP_F_HDGST; + nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + } +} + +static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd) +{ + struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu; + struct nvmet_tcp_queue *queue 
= cmd->queue; + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + + cmd->offset = 0; + cmd->state = NVMET_TCP_SEND_R2T; + + pdu->hdr.type = nvme_tcp_r2t; + pdu->hdr.flags = 0; + pdu->hdr.hlen = sizeof(*pdu); + pdu->hdr.pdo = 0; + pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); + + pdu->command_id = cmd->req.cmd->common.command_id; + pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd); + pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done); + pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done); + if (cmd->queue->hdr_digest) { + pdu->hdr.flags |= NVME_TCP_F_HDGST; + nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + } +} + +static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd) +{ + struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu; + struct nvmet_tcp_queue *queue = cmd->queue; + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + + cmd->offset = 0; + cmd->state = NVMET_TCP_SEND_RESPONSE; + + pdu->hdr.type = nvme_tcp_rsp; + pdu->hdr.flags = 0; + pdu->hdr.hlen = sizeof(*pdu); + pdu->hdr.pdo = 0; + pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); + if (cmd->queue->hdr_digest) { + pdu->hdr.flags |= NVME_TCP_F_HDGST; + nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + } +} + +static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue) +{ + struct llist_node *node; + + node = llist_del_all(&queue->resp_list); + if (!node) + return; + + while (node) { + struct nvmet_tcp_cmd *cmd = llist_entry(node, + struct nvmet_tcp_cmd, lentry); + + list_add(&cmd->entry, &queue->resp_send_list); + node = node->next; + queue->send_list_len++; + } +} + +static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue) +{ + queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list, + struct nvmet_tcp_cmd, entry); + if (!queue->snd_cmd) { + nvmet_tcp_process_resp_list(queue); + queue->snd_cmd = + list_first_entry_or_null(&queue->resp_send_list, + struct nvmet_tcp_cmd, entry); + if (unlikely(!queue->snd_cmd)) + return NULL; + } + + list_del_init(&queue->snd_cmd->entry); + queue->send_list_len--; + + if (nvmet_tcp_need_data_out(queue->snd_cmd)) + nvmet_setup_c2h_data_pdu(queue->snd_cmd); + else if (nvmet_tcp_need_data_in(queue->snd_cmd)) + nvmet_setup_r2t_pdu(queue->snd_cmd); + else + nvmet_setup_response_pdu(queue->snd_cmd); + + return queue->snd_cmd; +} + +static void nvmet_tcp_queue_response(struct nvmet_req *req) +{ + struct nvmet_tcp_cmd *cmd = + container_of(req, struct nvmet_tcp_cmd, req); + struct nvmet_tcp_queue *queue = cmd->queue; + + llist_add(&cmd->lentry, &queue->resp_list); + queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work); +} + +static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd) +{ + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst; + int ret; + + ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu), + offset_in_page(cmd->data_pdu) + cmd->offset, + left, MSG_DONTWAIT | MSG_MORE); + if (ret <= 0) + return ret; + + cmd->offset += ret; + left -= ret; + + if (left) + return -EAGAIN; + + cmd->state = NVMET_TCP_SEND_DATA; + cmd->offset = 0; + return 1; +} + +static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd) +{ + struct nvmet_tcp_queue *queue = cmd->queue; + int ret; + + while (cmd->cur_sg) { + struct page *page = sg_page(cmd->cur_sg); + u32 left = cmd->cur_sg->length - cmd->offset; + + ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset, + left, MSG_DONTWAIT | MSG_MORE); + if (ret <= 0) + return ret; + + cmd->offset += ret; + cmd->wbytes_done += 
ret; + + /* Done with sg? */ + if (cmd->offset == cmd->cur_sg->length) { + cmd->cur_sg = sg_next(cmd->cur_sg); + cmd->offset = 0; + } + } + + if (queue->data_digest) { + cmd->state = NVMET_TCP_SEND_DDGST; + cmd->offset = 0; + } else { + nvmet_setup_response_pdu(cmd); + } + return 1; +} + +static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd, + bool last_in_batch) +{ + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst; + int flags = MSG_DONTWAIT; + int ret; + + if (!last_in_batch && cmd->queue->send_list_len) + flags |= MSG_MORE; + else + flags |= MSG_EOR; + + ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu), + offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags); + if (ret <= 0) + return ret; + cmd->offset += ret; + left -= ret; + + if (left) + return -EAGAIN; + + kfree(cmd->iov); + sgl_free(cmd->req.sg); + cmd->queue->snd_cmd = NULL; + nvmet_tcp_put_cmd(cmd); + return 1; +} + +static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch) +{ + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst; + int flags = MSG_DONTWAIT; + int ret; + + if (!last_in_batch && cmd->queue->send_list_len) + flags |= MSG_MORE; + else + flags |= MSG_EOR; + + ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu), + offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags); + if (ret <= 0) + return ret; + cmd->offset += ret; + left -= ret; + + if (left) + return -EAGAIN; + + cmd->queue->snd_cmd = NULL; + return 1; +} + +static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd) +{ + struct nvmet_tcp_queue *queue = cmd->queue; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; + struct kvec iov = { + .iov_base = &cmd->exp_ddgst + cmd->offset, + .iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset + }; + int ret; + + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); + if (unlikely(ret <= 0)) + return ret; + + cmd->offset += ret; + nvmet_setup_response_pdu(cmd); + return 1; +} + +static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue, + bool last_in_batch) +{ + struct nvmet_tcp_cmd *cmd = queue->snd_cmd; + int ret = 0; + + if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) { + cmd = nvmet_tcp_fetch_cmd(queue); + if (unlikely(!cmd)) + return 0; + } + + if (cmd->state == NVMET_TCP_SEND_DATA_PDU) { + ret = nvmet_try_send_data_pdu(cmd); + if (ret <= 0) + goto done_send; + } + + if (cmd->state == NVMET_TCP_SEND_DATA) { + ret = nvmet_try_send_data(cmd); + if (ret <= 0) + goto done_send; + } + + if (cmd->state == NVMET_TCP_SEND_DDGST) { + ret = nvmet_try_send_ddgst(cmd); + if (ret <= 0) + goto done_send; + } + + if (cmd->state == NVMET_TCP_SEND_R2T) { + ret = nvmet_try_send_r2t(cmd, last_in_batch); + if (ret <= 0) + goto done_send; + } + + if (cmd->state == NVMET_TCP_SEND_RESPONSE) + ret = nvmet_try_send_response(cmd, last_in_batch); + +done_send: + if (ret < 0) { + if (ret == -EAGAIN) + return 0; + return ret; + } + + return 1; +} + +static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue, + int budget, int *sends) +{ + int i, ret = 0; + + for (i = 0; i < budget; i++) { + ret = nvmet_tcp_try_send_one(queue, i == budget - 1); + if (ret <= 0) + break; + (*sends)++; + } + + return ret; +} + +static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue) +{ + queue->offset = 0; + queue->left = sizeof(struct nvme_tcp_hdr); + queue->cmd = NULL; + queue->rcv_state = NVMET_TCP_RECV_PDU; +} + +static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); + + ahash_request_free(queue->rcv_hash); + ahash_request_free(queue->snd_hash); + crypto_free_ahash(tfm); +} + +static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue) +{ + struct crypto_ahash *tfm; + + tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); + if (!queue->snd_hash) + goto free_tfm; + ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); + + queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); + if (!queue->rcv_hash) + goto free_snd_hash; + ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); + + return 0; +free_snd_hash: + ahash_request_free(queue->snd_hash); +free_tfm: + crypto_free_ahash(tfm); + return -ENOMEM; +} + + +static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) +{ + struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq; + struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp; + struct msghdr msg = {}; + struct kvec iov; + int ret; + + if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) { + pr_err("bad nvme-tcp pdu length (%d)\n", + le32_to_cpu(icreq->hdr.plen)); + nvmet_tcp_fatal_error(queue); + return -EPROTO; + } + + if (icreq->pfv != NVME_TCP_PFV_1_0) { + pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv); + return -EPROTO; + } + + if (icreq->hpda != 0) { + pr_err("queue %d: unsupported hpda %d\n", queue->idx, + icreq->hpda); + return -EPROTO; + } + + if (icreq->maxr2t != 0) { + pr_err("queue %d: unsupported maxr2t %d\n", queue->idx, + le32_to_cpu(icreq->maxr2t) + 1); + return -EPROTO; + } + + queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE); + queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE); + if (queue->hdr_digest || queue->data_digest) { + ret = nvmet_tcp_alloc_crypto(queue); + if (ret) + return ret; + } + + memset(icresp, 0, sizeof(*icresp)); + icresp->hdr.type = nvme_tcp_icresp; + icresp->hdr.hlen = sizeof(*icresp); + icresp->hdr.pdo = 0; + icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen); + icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0); + icresp->maxdata = cpu_to_le32(0xffff); /* FIXME: support r2t */ + icresp->cpda = 0; + if (queue->hdr_digest) + icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE; + if (queue->data_digest) + icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE; + + iov.iov_base = icresp; + iov.iov_len = sizeof(*icresp); + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); + if (ret < 0) + goto free_crypto; + + queue->state = NVMET_TCP_Q_LIVE; + nvmet_prepare_receive_pdu(queue); + return 0; +free_crypto: + if (queue->hdr_digest || queue->data_digest) + nvmet_tcp_free_crypto(queue); + return ret; +} + +static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue, + struct nvmet_tcp_cmd *cmd, struct nvmet_req *req) +{ + int ret; + + /* recover the expected data transfer length */ + req->data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length); + + if (!nvme_is_write(cmd->req.cmd) || + req->data_len > cmd->req.port->inline_data_size) { + nvmet_prepare_receive_pdu(queue); + return; + } + + ret = nvmet_tcp_map_data(cmd); + if (unlikely(ret)) { + pr_err("queue %d: failed to map data\n", queue->idx); + nvmet_tcp_fatal_error(queue); + return; + } + + queue->rcv_state = NVMET_TCP_RECV_DATA; + nvmet_tcp_map_pdu_iovec(cmd); + cmd->flags |= NVMET_TCP_F_INIT_FAILED; +} + +static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) +{ + struct
nvme_tcp_data_pdu *data = &queue->pdu.data; + struct nvmet_tcp_cmd *cmd; + + cmd = &queue->cmds[data->ttag]; + + if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) { + pr_err("ttag %u unexpected data offset %u (expected %u)\n", + data->ttag, le32_to_cpu(data->data_offset), + cmd->rbytes_done); + /* FIXME: use path and transport errors */ + nvmet_req_complete(&cmd->req, + NVME_SC_INVALID_FIELD | NVME_SC_DNR); + return -EPROTO; + } + + cmd->pdu_len = le32_to_cpu(data->data_length); + cmd->pdu_recv = 0; + nvmet_tcp_map_pdu_iovec(cmd); + queue->cmd = cmd; + queue->rcv_state = NVMET_TCP_RECV_DATA; + + return 0; +} + +static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) +{ + struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr; + struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd; + struct nvmet_req *req; + int ret; + + if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) { + if (hdr->type != nvme_tcp_icreq) { + pr_err("unexpected pdu type (%d) before icreq\n", + hdr->type); + nvmet_tcp_fatal_error(queue); + return -EPROTO; + } + return nvmet_tcp_handle_icreq(queue); + } + + if (hdr->type == nvme_tcp_h2c_data) { + ret = nvmet_tcp_handle_h2c_data_pdu(queue); + if (unlikely(ret)) + return ret; + return 0; + } + + queue->cmd = nvmet_tcp_get_cmd(queue); + if (unlikely(!queue->cmd)) { + /* This should never happen */ + pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d", + queue->idx, queue->nr_cmds, queue->send_list_len, + nvme_cmd->common.opcode); + nvmet_tcp_fatal_error(queue); + return -ENOMEM; + } + + req = &queue->cmd->req; + memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd)); + + if (unlikely(!nvmet_req_init(req, &queue->nvme_cq, + &queue->nvme_sq, &nvmet_tcp_ops))) { + pr_err("failed cmd %p id %d opcode %d, data_len: %d\n", + req->cmd, req->cmd->common.command_id, + req->cmd->common.opcode, + le32_to_cpu(req->cmd->common.dptr.sgl.length)); + + nvmet_tcp_handle_req_failure(queue, queue->cmd, req); + return -EAGAIN; + } + + ret = nvmet_tcp_map_data(queue->cmd); + if (unlikely(ret)) { + pr_err("queue %d: failed to map data\n", queue->idx); + if (nvmet_tcp_has_inline_data(queue->cmd)) + nvmet_tcp_fatal_error(queue); + else + nvmet_req_complete(req, ret); + ret = -EAGAIN; + goto out; + } + + if (nvmet_tcp_need_data_in(queue->cmd)) { + if (nvmet_tcp_has_inline_data(queue->cmd)) { + queue->rcv_state = NVMET_TCP_RECV_DATA; + nvmet_tcp_map_pdu_iovec(queue->cmd); + return 0; + } + /* send back R2T */ + nvmet_tcp_queue_response(&queue->cmd->req); + goto out; + } + + nvmet_req_execute(&queue->cmd->req); +out: + nvmet_prepare_receive_pdu(queue); + return ret; +} + +static const u8 nvme_tcp_pdu_sizes[] = { + [nvme_tcp_icreq] = sizeof(struct nvme_tcp_icreq_pdu), + [nvme_tcp_cmd] = sizeof(struct nvme_tcp_cmd_pdu), + [nvme_tcp_h2c_data] = sizeof(struct nvme_tcp_data_pdu), +}; + +static inline u8 nvmet_tcp_pdu_size(u8 type) +{ + size_t idx = type; + + return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) && + nvme_tcp_pdu_sizes[idx]) ? 
+ nvme_tcp_pdu_sizes[idx] : 0; +} + +static inline bool nvmet_tcp_pdu_valid(u8 type) +{ + switch (type) { + case nvme_tcp_icreq: + case nvme_tcp_cmd: + case nvme_tcp_h2c_data: + /* fallthru */ + return true; + } + + return false; +} + +static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue) +{ + struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr; + int len; + struct kvec iov; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; + +recv: + iov.iov_base = (void *)&queue->pdu + queue->offset; + iov.iov_len = queue->left; + len = kernel_recvmsg(queue->sock, &msg, &iov, 1, + iov.iov_len, msg.msg_flags); + if (unlikely(len < 0)) + return len; + + queue->offset += len; + queue->left -= len; + if (queue->left) + return -EAGAIN; + + if (queue->offset == sizeof(struct nvme_tcp_hdr)) { + u8 hdgst = nvmet_tcp_hdgst_len(queue); + + if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) { + pr_err("unexpected pdu type %d\n", hdr->type); + nvmet_tcp_fatal_error(queue); + return -EIO; + } + + if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) { + pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen); + return -EIO; + } + + queue->left = hdr->hlen - queue->offset + hdgst; + goto recv; + } + + if (queue->hdr_digest && + nvmet_tcp_verify_hdgst(queue, &queue->pdu, queue->offset)) { + nvmet_tcp_fatal_error(queue); /* fatal */ + return -EPROTO; + } + + if (queue->data_digest && + nvmet_tcp_check_ddgst(queue, &queue->pdu)) { + nvmet_tcp_fatal_error(queue); /* fatal */ + return -EPROTO; + } + + return nvmet_tcp_done_recv_pdu(queue); +} + +static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd) +{ + struct nvmet_tcp_queue *queue = cmd->queue; + + nvmet_tcp_ddgst(queue->rcv_hash, cmd); + queue->offset = 0; + queue->left = NVME_TCP_DIGEST_LENGTH; + queue->rcv_state = NVMET_TCP_RECV_DDGST; +} + +static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmd = queue->cmd; + int ret; + + while (msg_data_left(&cmd->recv_msg)) { + ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg, + cmd->recv_msg.msg_flags); + if (ret <= 0) + return ret; + + cmd->pdu_recv += ret; + cmd->rbytes_done += ret; + } + + nvmet_tcp_unmap_pdu_iovec(cmd); + + if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) && + cmd->rbytes_done == cmd->req.transfer_len) { + if (queue->data_digest) { + nvmet_tcp_prep_recv_ddgst(cmd); + return 0; + } + nvmet_req_execute(&cmd->req); + } + + nvmet_prepare_receive_pdu(queue); + return 0; +} + +static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmd = queue->cmd; + int ret; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; + struct kvec iov = { + .iov_base = (void *)&cmd->recv_ddgst + queue->offset, + .iov_len = queue->left + }; + + ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, + iov.iov_len, msg.msg_flags); + if (unlikely(ret < 0)) + return ret; + + queue->offset += ret; + queue->left -= ret; + if (queue->left) + return -EAGAIN; + + if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) { + pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n", + queue->idx, cmd->req.cmd->common.command_id, + queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst), + le32_to_cpu(cmd->exp_ddgst)); + nvmet_tcp_finish_cmd(cmd); + nvmet_tcp_fatal_error(queue); + ret = -EPROTO; + goto out; + } + + if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) && + cmd->rbytes_done == cmd->req.transfer_len) + nvmet_req_execute(&cmd->req); + ret = 0; +out: + nvmet_prepare_receive_pdu(queue); + return ret; +} + +static int 
nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue) +{ + int result; + + if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR)) + return 0; + + if (queue->rcv_state == NVMET_TCP_RECV_PDU) { + result = nvmet_tcp_try_recv_pdu(queue); + if (result != 0) + goto done_recv; + } + + if (queue->rcv_state == NVMET_TCP_RECV_DATA) { + result = nvmet_tcp_try_recv_data(queue); + if (result != 0) + goto done_recv; + } + + if (queue->rcv_state == NVMET_TCP_RECV_DDGST) { + result = nvmet_tcp_try_recv_ddgst(queue); + if (result != 0) + goto done_recv; + } + +done_recv: + if (result < 0) { + if (result == -EAGAIN) + return 0; + return result; + } + return 1; +} + +static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue, + int budget, int *recvs) +{ + int i, ret = 0; + + for (i = 0; i < budget; i++) { + ret = nvmet_tcp_try_recv_one(queue); + if (ret <= 0) + break; + (*recvs)++; + } + + return ret; +} + +static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue) +{ + spin_lock(&queue->state_lock); + if (queue->state != NVMET_TCP_Q_DISCONNECTING) { + queue->state = NVMET_TCP_Q_DISCONNECTING; + schedule_work(&queue->release_work); + } + spin_unlock(&queue->state_lock); +} + +static void nvmet_tcp_io_work(struct work_struct *w) +{ + struct nvmet_tcp_queue *queue = + container_of(w, struct nvmet_tcp_queue, io_work); + bool pending; + int ret, ops = 0; + + do { + pending = false; + + ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops); + if (ret > 0) { + pending = true; + } else if (ret < 0) { + if (ret == -EPIPE || ret == -ECONNRESET) + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + else + nvmet_tcp_fatal_error(queue); + return; + } + + ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops); + if (ret > 0) { + /* transmitted message/data */ + pending = true; + } else if (ret < 0) { + if (ret == -EPIPE || ret == -ECONNRESET) + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + else + nvmet_tcp_fatal_error(queue); + return; + } + + } while (pending && ops < NVMET_TCP_IO_WORK_BUDGET); + + /* + * We exhausted our budget, requeue ourselves + */ + if (pending) + queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work); +} + +static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue, + struct nvmet_tcp_cmd *c) +{ + u8 hdgst = nvmet_tcp_hdgst_len(queue); + + c->queue = queue; + c->req.port = queue->port->nport; + + c->cmd_pdu = page_frag_alloc(&queue->pf_cache, + sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); + if (!c->cmd_pdu) + return -ENOMEM; + c->req.cmd = &c->cmd_pdu->cmd; + + c->rsp_pdu = page_frag_alloc(&queue->pf_cache, + sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); + if (!c->rsp_pdu) + goto out_free_cmd; + c->req.rsp = &c->rsp_pdu->cqe; + + c->data_pdu = page_frag_alloc(&queue->pf_cache, + sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); + if (!c->data_pdu) + goto out_free_rsp; + + c->r2t_pdu = page_frag_alloc(&queue->pf_cache, + sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO); + if (!c->r2t_pdu) + goto out_free_data; + + c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + + list_add_tail(&c->entry, &queue->free_list); + + return 0; +out_free_data: + page_frag_free(c->data_pdu); +out_free_rsp: + page_frag_free(c->rsp_pdu); +out_free_cmd: + page_frag_free(c->cmd_pdu); + return -ENOMEM; +} + +static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c) +{ + page_frag_free(c->r2t_pdu); + page_frag_free(c->data_pdu); + page_frag_free(c->rsp_pdu); + page_frag_free(c->cmd_pdu); +} + +static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue) +{ +
struct nvmet_tcp_cmd *cmds; + int i, ret = -EINVAL, nr_cmds = queue->nr_cmds; + + cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL); + if (!cmds) + goto out; + + for (i = 0; i < nr_cmds; i++) { + ret = nvmet_tcp_alloc_cmd(queue, cmds + i); + if (ret) + goto out_free; + } + + queue->cmds = cmds; + + return 0; +out_free: + while (--i >= 0) + nvmet_tcp_free_cmd(cmds + i); + kfree(cmds); +out: + return ret; +} + +static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmds = queue->cmds; + int i; + + for (i = 0; i < queue->nr_cmds; i++) + nvmet_tcp_free_cmd(cmds + i); + + nvmet_tcp_free_cmd(&queue->connect); + kfree(cmds); +} + +static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue) +{ + struct socket *sock = queue->sock; + + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_data_ready = queue->data_ready; + sock->sk->sk_state_change = queue->state_change; + sock->sk->sk_write_space = queue->write_space; + sock->sk->sk_user_data = NULL; + write_unlock_bh(&sock->sk->sk_callback_lock); +} + +static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd) +{ + nvmet_req_uninit(&cmd->req); + nvmet_tcp_unmap_pdu_iovec(cmd); + sgl_free(cmd->req.sg); +} + +static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_cmd *cmd = queue->cmds; + int i; + + for (i = 0; i < queue->nr_cmds; i++, cmd++) { + if (nvmet_tcp_need_data_in(cmd)) + nvmet_tcp_finish_cmd(cmd); + } + + if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) { + /* failed in connect */ + nvmet_tcp_finish_cmd(&queue->connect); + } +} + +static void nvmet_tcp_release_queue_work(struct work_struct *w) +{ + struct nvmet_tcp_queue *queue = + container_of(w, struct nvmet_tcp_queue, release_work); + + mutex_lock(&nvmet_tcp_queue_mutex); + list_del_init(&queue->queue_list); + mutex_unlock(&nvmet_tcp_queue_mutex); + + nvmet_tcp_restore_socket_callbacks(queue); + flush_work(&queue->io_work); + + nvmet_tcp_uninit_data_in_cmds(queue); + nvmet_sq_destroy(&queue->nvme_sq); + cancel_work_sync(&queue->io_work); + sock_release(queue->sock); + nvmet_tcp_free_cmds(queue); + if (queue->hdr_digest || queue->data_digest) + nvmet_tcp_free_crypto(queue); + ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx); + + kfree(queue); +} + +static void nvmet_tcp_data_ready(struct sock *sk) +{ + struct nvmet_tcp_queue *queue; + + read_lock_bh(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (likely(queue)) + queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work); + read_unlock_bh(&sk->sk_callback_lock); +} + +static void nvmet_tcp_write_space(struct sock *sk) +{ + struct nvmet_tcp_queue *queue; + + read_lock_bh(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (unlikely(!queue)) + goto out; + + if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) { + queue->write_space(sk); + goto out; + } + + if (sk_stream_is_writeable(sk)) { + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work); + } +out: + read_unlock_bh(&sk->sk_callback_lock); +} + +static void nvmet_tcp_state_change(struct sock *sk) +{ + struct nvmet_tcp_queue *queue; + + write_lock_bh(&sk->sk_callback_lock); + queue = sk->sk_user_data; + if (!queue) + goto done; + + switch (sk->sk_state) { + case TCP_FIN_WAIT1: + case TCP_CLOSE_WAIT: + case TCP_CLOSE: + /* FALLTHRU */ + sk->sk_user_data = NULL; + nvmet_tcp_schedule_release_queue(queue); + break; + default: + pr_warn("queue %d unhandled state %d\n", + queue->idx, 
sk->sk_state); + } +done: + write_unlock_bh(&sk->sk_callback_lock); +} + +static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) +{ + struct socket *sock = queue->sock; + struct linger sol = { .l_onoff = 1, .l_linger = 0 }; + int ret; + + ret = kernel_getsockname(sock, + (struct sockaddr *)&queue->sockaddr); + if (ret < 0) + return ret; + + ret = kernel_getpeername(sock, + (struct sockaddr *)&queue->sockaddr_peer); + if (ret < 0) + return ret; + + /* + * Cleanup whatever is sitting in the TCP transmit queue on socket + * close. This is done to prevent stale data from being sent should + * the network connection be restored before TCP times out. + */ + ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, + (char *)&sol, sizeof(sol)); + if (ret) + return ret; + + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_user_data = queue; + queue->data_ready = sock->sk->sk_data_ready; + sock->sk->sk_data_ready = nvmet_tcp_data_ready; + queue->state_change = sock->sk->sk_state_change; + sock->sk->sk_state_change = nvmet_tcp_state_change; + queue->write_space = sock->sk->sk_write_space; + sock->sk->sk_write_space = nvmet_tcp_write_space; + write_unlock_bh(&sock->sk->sk_callback_lock); + + return 0; +} + +static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, + struct socket *newsock) +{ + struct nvmet_tcp_queue *queue; + int ret; + + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + if (!queue) + return -ENOMEM; + + INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work); + INIT_WORK(&queue->io_work, nvmet_tcp_io_work); + queue->sock = newsock; + queue->port = port; + queue->nr_cmds = 0; + spin_lock_init(&queue->state_lock); + queue->state = NVMET_TCP_Q_CONNECTING; + INIT_LIST_HEAD(&queue->free_list); + init_llist_head(&queue->resp_list); + INIT_LIST_HEAD(&queue->resp_send_list); + + queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL); + if (queue->idx < 0) { + ret = queue->idx; + goto out_free_queue; + } + + ret = nvmet_tcp_alloc_cmd(queue, &queue->connect); + if (ret) + goto out_ida_remove; + + ret = nvmet_sq_init(&queue->nvme_sq); + if (ret) + goto out_free_connect; + + port->last_cpu = cpumask_next_wrap(port->last_cpu, + cpu_online_mask, -1, false); + queue->cpu = port->last_cpu; + nvmet_prepare_receive_pdu(queue); + + mutex_lock(&nvmet_tcp_queue_mutex); + list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list); + mutex_unlock(&nvmet_tcp_queue_mutex); + + ret = nvmet_tcp_set_queue_sock(queue); + if (ret) + goto out_destroy_sq; + + queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work); + + return 0; +out_destroy_sq: + mutex_lock(&nvmet_tcp_queue_mutex); + list_del_init(&queue->queue_list); + mutex_unlock(&nvmet_tcp_queue_mutex); + nvmet_sq_destroy(&queue->nvme_sq); +out_free_connect: + nvmet_tcp_free_cmd(&queue->connect); +out_ida_remove: + ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx); +out_free_queue: + kfree(queue); + return ret; +} + +static void nvmet_tcp_accept_work(struct work_struct *w) +{ + struct nvmet_tcp_port *port = + container_of(w, struct nvmet_tcp_port, accept_work); + struct socket *newsock; + int ret; + + while (true) { + ret = kernel_accept(port->sock, &newsock, O_NONBLOCK); + if (ret < 0) { + if (ret != -EAGAIN) + pr_warn("failed to accept err=%d\n", ret); + return; + } + ret = nvmet_tcp_alloc_queue(port, newsock); + if (ret) { + pr_err("failed to allocate queue\n"); + sock_release(newsock); + } + } +} + +static void nvmet_tcp_listen_data_ready(struct sock *sk) +{ + struct nvmet_tcp_port *port; + + 
read_lock_bh(&sk->sk_callback_lock); + port = sk->sk_user_data; + if (!port) + goto out; + + if (sk->sk_state == TCP_LISTEN) + schedule_work(&port->accept_work); +out: + read_unlock_bh(&sk->sk_callback_lock); +} + +static int nvmet_tcp_add_port(struct nvmet_port *nport) +{ + struct nvmet_tcp_port *port; + __kernel_sa_family_t af; + int opt, ret; + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return -ENOMEM; + + switch (nport->disc_addr.adrfam) { + case NVMF_ADDR_FAMILY_IP4: + af = AF_INET; + break; + case NVMF_ADDR_FAMILY_IP6: + af = AF_INET6; + break; + default: + pr_err("address family %d not supported\n", + nport->disc_addr.adrfam); + ret = -EINVAL; + goto err_port; + } + + ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr, + nport->disc_addr.trsvcid, &port->addr); + if (ret) { + pr_err("malformed ip/port passed: %s:%s\n", + nport->disc_addr.traddr, nport->disc_addr.trsvcid); + goto err_port; + } + + port->nport = nport; + port->last_cpu = -1; + INIT_WORK(&port->accept_work, nvmet_tcp_accept_work); + if (port->nport->inline_data_size < 0) + port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE; + + ret = sock_create(port->addr.ss_family, SOCK_STREAM, + IPPROTO_TCP, &port->sock); + if (ret) { + pr_err("failed to create a socket\n"); + goto err_port; + } + + port->sock->sk->sk_user_data = port; + port->data_ready = port->sock->sk->sk_data_ready; + port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready; + + opt = 1; + ret = kernel_setsockopt(port->sock, IPPROTO_TCP, + TCP_NODELAY, (char *)&opt, sizeof(opt)); + if (ret) { + pr_err("failed to set TCP_NODELAY sock opt %d\n", ret); + goto err_sock; + } + + ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_REUSEADDR, + (char *)&opt, sizeof(opt)); + if (ret) { + pr_err("failed to set SO_REUSEADDR sock opt %d\n", ret); + goto err_sock; + } + + ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr, + sizeof(port->addr)); + if (ret) { + pr_err("failed to bind port socket %d\n", ret); + goto err_sock; + } + + ret = kernel_listen(port->sock, 128); + if (ret) { + pr_err("failed to listen %d on port sock\n", ret); + goto err_sock; + } + + nport->priv = port; + pr_info("enabling port %d (%pISpc)\n", + le16_to_cpu(nport->disc_addr.portid), &port->addr); + + return 0; + +err_sock: + sock_release(port->sock); +err_port: + kfree(port); + return ret; +} + +static void nvmet_tcp_remove_port(struct nvmet_port *nport) +{ + struct nvmet_tcp_port *port = nport->priv; + + write_lock_bh(&port->sock->sk->sk_callback_lock); + port->sock->sk->sk_data_ready = port->data_ready; + port->sock->sk->sk_user_data = NULL; + write_unlock_bh(&port->sock->sk->sk_callback_lock); + cancel_work_sync(&port->accept_work); + + sock_release(port->sock); + kfree(port); +} + +static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl) +{ + struct nvmet_tcp_queue *queue; + + mutex_lock(&nvmet_tcp_queue_mutex); + list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list) + if (queue->nvme_sq.ctrl == ctrl) + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + mutex_unlock(&nvmet_tcp_queue_mutex); +} + +static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq) +{ + struct nvmet_tcp_queue *queue = + container_of(sq, struct nvmet_tcp_queue, nvme_sq); + + if (sq->qid == 0) { + /* Let inflight controller teardown complete */ + flush_scheduled_work(); + } + + queue->nr_cmds = sq->size * 2; + if (nvmet_tcp_alloc_cmds(queue)) + return NVME_SC_INTERNAL; + return 0; +} + +static void nvmet_tcp_disc_port_addr(struct nvmet_req *req, + struct nvmet_port 
*nport, char *traddr) +{ + struct nvmet_tcp_port *port = nport->priv; + + if (inet_addr_is_any((struct sockaddr *)&port->addr)) { + struct nvmet_tcp_cmd *cmd = + container_of(req, struct nvmet_tcp_cmd, req); + struct nvmet_tcp_queue *queue = cmd->queue; + + sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr); + } else { + memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE); + } +} + +static struct nvmet_fabrics_ops nvmet_tcp_ops = { + .owner = THIS_MODULE, + .type = NVMF_TRTYPE_TCP, + .msdbd = 1, + .has_keyed_sgls = 0, + .add_port = nvmet_tcp_add_port, + .remove_port = nvmet_tcp_remove_port, + .queue_response = nvmet_tcp_queue_response, + .delete_ctrl = nvmet_tcp_delete_ctrl, + .install_queue = nvmet_tcp_install_queue, + .disc_traddr = nvmet_tcp_disc_port_addr, +}; + +static int __init nvmet_tcp_init(void) +{ + int ret; + + nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0); + if (!nvmet_tcp_wq) + return -ENOMEM; + + ret = nvmet_register_transport(&nvmet_tcp_ops); + if (ret) + goto err; + + return 0; +err: + destroy_workqueue(nvmet_tcp_wq); + return ret; +} + +static void __exit nvmet_tcp_exit(void) +{ + struct nvmet_tcp_queue *queue; + + nvmet_unregister_transport(&nvmet_tcp_ops); + + flush_scheduled_work(); + mutex_lock(&nvmet_tcp_queue_mutex); + list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list) + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + mutex_unlock(&nvmet_tcp_queue_mutex); + flush_scheduled_work(); + + destroy_workqueue(nvmet_tcp_wq); +} + +module_init(nvmet_tcp_init); +module_exit(nvmet_tcp_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */
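
As a companion to nvmet_tcp_handle_icreq() above, the sketch below shows what a minimal user-space host side of the same initialize-connection exchange could look like: open a TCP connection to an nvmet-tcp port, send a 128-byte ICReq, and wait for the ICResp. This is an illustrative sketch, not part of the patch: the wire structs are written out by hand under the assumption of the NVMe/TCP 1.0 PDU layout (fixed 128-byte ICReq/ICResp, little-endian plen), and the helper name nvme_tcp_ic_handshake() is made up for this example.

/*
 * Hypothetical user-space sketch -- not part of this commit.
 * Assumes the NVMe/TCP 1.0 wire format; all values are chosen to satisfy
 * the checks in nvmet_tcp_handle_icreq() (pfv == 0, hpda == 0, maxr2t == 0).
 */
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <endian.h>
#include <arpa/inet.h>
#include <sys/socket.h>

struct tcp_hdr_wire {			/* common PDU header */
	uint8_t  type;			/* 0x00 = ICReq, 0x01 = ICResp */
	uint8_t  flags;
	uint8_t  hlen;
	uint8_t  pdo;
	uint32_t plen;			/* little-endian total PDU length */
} __attribute__((packed));

struct icreq_wire {			/* 128 bytes total */
	struct tcp_hdr_wire hdr;
	uint16_t pfv;			/* protocol format version, 0 == 1.0 */
	uint8_t  hpda;			/* host PDU data alignment */
	uint8_t  digest;		/* HDGST/DDGST enable bits */
	uint32_t maxr2t;		/* outstanding R2Ts - 1 */
	uint8_t  rsvd[112];
} __attribute__((packed));

static int nvme_tcp_ic_handshake(const char *ip, uint16_t port)
{
	struct icreq_wire icreq = { 0 };
	uint8_t icresp[128];
	struct sockaddr_in sa = {
		.sin_family = AF_INET,
		.sin_port = htons(port),
	};
	int fd = -1;

	if (inet_pton(AF_INET, ip, &sa.sin_addr) != 1)
		return -1;
	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0 || connect(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
		goto err;

	icreq.hdr.type = 0x00;				/* ICReq */
	icreq.hdr.hlen = sizeof(icreq);
	icreq.hdr.plen = htole32(sizeof(icreq));	/* target insists on 128 */
	/* pfv/hpda/maxr2t/digest stay 0: PFV 1.0, no padding, no digests */

	if (write(fd, &icreq, sizeof(icreq)) != sizeof(icreq))
		goto err;
	/* block until the full 128-byte ICResp arrives */
	if (recv(fd, icresp, sizeof(icresp), MSG_WAITALL) != sizeof(icresp))
		goto err;
	if (icresp[0] != 0x01) {			/* expect ICResp */
		fprintf(stderr, "unexpected PDU type %u\n", icresp[0]);
		goto err;
	}
	return fd;	/* queue is now in what the target calls NVMET_TCP_Q_LIVE */
err:
	if (fd >= 0)
		close(fd);
	return -1;
}

On success the socket is ready to carry a fabrics Connect command; opting into header or data digests would mean setting the corresponding bits in icreq.digest and then framing every subsequent PDU with the CRC32C digests that the target-side crc32c ahash code above computes and verifies.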