summaryrefslogtreecommitdiff
path: root/drivers/nvme/host
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/nvme/host')
-rw-r--r--drivers/nvme/host/Kconfig6
-rw-r--r--drivers/nvme/host/apple.c4
-rw-r--r--drivers/nvme/host/auth.c34
-rw-r--r--drivers/nvme/host/constants.c4
-rw-r--r--drivers/nvme/host/core.c363
-rw-r--r--drivers/nvme/host/fabrics.c2
-rw-r--r--drivers/nvme/host/fabrics.h6
-rw-r--r--drivers/nvme/host/fc.c29
-rw-r--r--drivers/nvme/host/ioctl.c39
-rw-r--r--drivers/nvme/host/multipath.c212
-rw-r--r--drivers/nvme/host/nvme.h38
-rw-r--r--drivers/nvme/host/pci.c886
-rw-r--r--drivers/nvme/host/pr.c2
-rw-r--r--drivers/nvme/host/rdma.c6
-rw-r--r--drivers/nvme/host/sysfs.c7
-rw-r--r--drivers/nvme/host/tcp.c175
16 files changed, 1189 insertions, 624 deletions
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 4d64b6935bb9..31974c7dd20c 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -84,9 +84,9 @@ config NVME_TCP
tristate "NVM Express over Fabrics TCP host driver"
depends on INET
depends on BLOCK
+ select CRC32
+ select NET_CRC32C
select NVME_FABRICS
- select CRYPTO
- select CRYPTO_CRC32C
help
This provides support for the NVMe over Fabrics protocol using
the TCP transport. This allows you to use remote block devices
@@ -106,7 +106,7 @@ config NVME_TCP_TLS
help
Enables TLS encryption for NVMe TCP using the netlink handshake API.
- The TLS handshake daemon is availble at
+ The TLS handshake daemon is available at
https://github.com/oracle/ktls-utils.
If unsure, say N.
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
index b1fddfa33ab9..1286c31320e6 100644
--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c
@@ -301,8 +301,8 @@ static void apple_nvme_submit_cmd(struct apple_nvme_queue *q,
memcpy(&q->sqes[tag], cmd, sizeof(*cmd));
/*
- * This lock here doesn't make much sense at a first glace but
- * removing it will result in occasional missed completetion
+ * This lock here doesn't make much sense at a first glance but
+ * removing it will result in occasional missed completion
* interrupts even though the commands still appear on the CQ.
* It's unclear why this happens but our best guess is that
* there is a bug in the firmware triggered when a new command
diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c
index 6115fef74c1e..201fc8809a62 100644
--- a/drivers/nvme/host/auth.c
+++ b/drivers/nvme/host/auth.c
@@ -31,6 +31,7 @@ struct nvme_dhchap_queue_context {
u32 s1;
u32 s2;
bool bi_directional;
+ bool authenticated;
u16 transaction;
u8 status;
u8 dhgroup_id;
@@ -682,6 +683,7 @@ static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap)
static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap)
{
nvme_auth_reset_dhchap(chap);
+ chap->authenticated = false;
if (chap->shash_tfm)
crypto_free_shash(chap->shash_tfm);
if (chap->dh_tfm)
@@ -740,7 +742,7 @@ static int nvme_auth_secure_concat(struct nvme_ctrl *ctrl,
"%s: qid %d failed to generate digest, error %d\n",
__func__, chap->qid, ret);
goto out_free_psk;
- };
+ }
dev_dbg(ctrl->device, "%s: generated digest %s\n",
__func__, digest);
ret = nvme_auth_derive_tls_psk(chap->hash_id, psk, psk_len,
@@ -750,7 +752,7 @@ static int nvme_auth_secure_concat(struct nvme_ctrl *ctrl,
"%s: qid %d failed to derive TLS psk, error %d\n",
__func__, chap->qid, ret);
goto out_free_digest;
- };
+ }
tls_key = nvme_tls_psk_refresh(ctrl->opts->keyring,
ctrl->opts->host->nqn,
@@ -930,12 +932,14 @@ static void nvme_queue_auth_work(struct work_struct *work)
}
if (!ret) {
chap->error = 0;
+ chap->authenticated = true;
if (ctrl->opts->concat &&
(ret = nvme_auth_secure_concat(ctrl, chap))) {
dev_warn(ctrl->device,
"%s: qid %d failed to enable secure concatenation\n",
__func__, chap->qid);
chap->error = ret;
+ chap->authenticated = false;
}
return;
}
@@ -1023,13 +1027,16 @@ static void nvme_ctrl_auth_work(struct work_struct *work)
return;
for (q = 1; q < ctrl->queue_count; q++) {
- ret = nvme_auth_negotiate(ctrl, q);
- if (ret) {
- dev_warn(ctrl->device,
- "qid %d: error %d setting up authentication\n",
- q, ret);
- break;
- }
+ struct nvme_dhchap_queue_context *chap =
+ &ctrl->dhchap_ctxs[q];
+ /*
+ * Skip re-authentication if the queue had
+ * not been authenticated initially.
+ */
+ if (!chap->authenticated)
+ continue;
+ cancel_work_sync(&chap->auth_work);
+ queue_work(nvme_auth_wq, &chap->auth_work);
}
/*
@@ -1037,7 +1044,13 @@ static void nvme_ctrl_auth_work(struct work_struct *work)
* the controller terminates the connection.
*/
for (q = 1; q < ctrl->queue_count; q++) {
- ret = nvme_auth_wait(ctrl, q);
+ struct nvme_dhchap_queue_context *chap =
+ &ctrl->dhchap_ctxs[q];
+ if (!chap->authenticated)
+ continue;
+ flush_work(&chap->auth_work);
+ ret = chap->error;
+ nvme_auth_reset_dhchap(chap);
if (ret)
dev_warn(ctrl->device,
"qid %d: authentication failed\n", q);
@@ -1076,6 +1089,7 @@ int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
chap = &ctrl->dhchap_ctxs[i];
chap->qid = i;
chap->ctrl = ctrl;
+ chap->authenticated = false;
INIT_WORK(&chap->auth_work, nvme_queue_auth_work);
}
diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c
index 2b9e6cfaf2a8..dc90df9e13a2 100644
--- a/drivers/nvme/host/constants.c
+++ b/drivers/nvme/host/constants.c
@@ -133,7 +133,7 @@ static const char * const nvme_statuses[] = {
[NVME_SC_NS_NOT_ATTACHED] = "Namespace Not Attached",
[NVME_SC_THIN_PROV_NOT_SUPP] = "Thin Provisioning Not Supported",
[NVME_SC_CTRL_LIST_INVALID] = "Controller List Invalid",
- [NVME_SC_SELT_TEST_IN_PROGRESS] = "Device Self-test In Progress",
+ [NVME_SC_SELF_TEST_IN_PROGRESS] = "Device Self-test In Progress",
[NVME_SC_BP_WRITE_PROHIBITED] = "Boot Partition Write Prohibited",
[NVME_SC_CTRL_ID_INVALID] = "Invalid Controller Identifier",
[NVME_SC_SEC_CTRL_STATE_INVALID] = "Invalid Secondary Controller State",
@@ -145,7 +145,7 @@ static const char * const nvme_statuses[] = {
[NVME_SC_BAD_ATTRIBUTES] = "Conflicting Attributes",
[NVME_SC_INVALID_PI] = "Invalid Protection Information",
[NVME_SC_READ_ONLY] = "Attempted Write to Read Only Range",
- [NVME_SC_ONCS_NOT_SUPPORTED] = "ONCS Not Supported",
+ [NVME_SC_CMD_SIZE_LIM_EXCEEDED] = "Command Size Limits Exceeded",
[NVME_SC_ZONE_BOUNDARY_ERROR] = "Zoned Boundary Error",
[NVME_SC_ZONE_FULL] = "Zone Is Full",
[NVME_SC_ZONE_READ_ONLY] = "Zone Is Read Only",
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 6b04473c0ab7..812c1565114f 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -38,6 +38,8 @@ struct nvme_ns_info {
u32 nsid;
__le32 anagrpid;
u8 pi_offset;
+ u16 endgid;
+ u64 runs;
bool is_shared;
bool is_readonly;
bool is_ready;
@@ -150,6 +152,8 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
unsigned nsid);
static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
struct nvme_command *cmd);
+static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
+ u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi);
void nvme_queue_scan(struct nvme_ctrl *ctrl)
{
@@ -286,7 +290,6 @@ static blk_status_t nvme_error_status(u16 status)
case NVME_SC_NS_NOT_READY:
return BLK_STS_TARGET;
case NVME_SC_BAD_ATTRIBUTES:
- case NVME_SC_ONCS_NOT_SUPPORTED:
case NVME_SC_INVALID_OPCODE:
case NVME_SC_INVALID_FIELD:
case NVME_SC_INVALID_NS:
@@ -378,12 +381,12 @@ static void nvme_log_err_passthru(struct request *req)
nr->status & NVME_SC_MASK, /* Status Code */
nr->status & NVME_STATUS_MORE ? "MORE " : "",
nr->status & NVME_STATUS_DNR ? "DNR " : "",
- nr->cmd->common.cdw10,
- nr->cmd->common.cdw11,
- nr->cmd->common.cdw12,
- nr->cmd->common.cdw13,
- nr->cmd->common.cdw14,
- nr->cmd->common.cdw14);
+ le32_to_cpu(nr->cmd->common.cdw10),
+ le32_to_cpu(nr->cmd->common.cdw11),
+ le32_to_cpu(nr->cmd->common.cdw12),
+ le32_to_cpu(nr->cmd->common.cdw13),
+ le32_to_cpu(nr->cmd->common.cdw14),
+ le32_to_cpu(nr->cmd->common.cdw15));
}
enum nvme_disposition {
@@ -664,10 +667,11 @@ static void nvme_free_ns_head(struct kref *ref)
struct nvme_ns_head *head =
container_of(ref, struct nvme_ns_head, ref);
- nvme_mpath_remove_disk(head);
+ nvme_mpath_put_disk(head);
ida_free(&head->subsys->ns_ida, head->instance);
cleanup_srcu_struct(&head->srcu);
nvme_put_subsystem(head->subsys);
+ kfree(head->plids);
kfree(head);
}
@@ -760,6 +764,10 @@ blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
!test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
return BLK_STS_RESOURCE;
+
+ if (!(rq->rq_flags & RQF_DONTPREP))
+ nvme_clear_nvme_request(rq);
+
return nvme_host_path_error(rq);
}
EXPORT_SYMBOL_GPL(nvme_fail_nonready_command);
@@ -991,6 +999,18 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
if (req->cmd_flags & REQ_RAHEAD)
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
+ if (op == nvme_cmd_write && ns->head->nr_plids) {
+ u16 write_stream = req->bio->bi_write_stream;
+
+ if (WARN_ON_ONCE(write_stream > ns->head->nr_plids))
+ return BLK_STS_INVAL;
+
+ if (write_stream) {
+ dsmgmt |= ns->head->plids[write_stream - 1] << 16;
+ control |= NVME_RW_DTYPE_DPLCMT;
+ }
+ }
+
if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req))
return BLK_STS_INVAL;
@@ -1010,7 +1030,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
if (ns->head->ms) {
/*
- * If formated with metadata, the block layer always provides a
+ * If formatted with metadata, the block layer always provides a
* metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else
* we enable the PRACT bit for protection information or set the
* namespace capacity to zero to prevent any I/O.
@@ -1157,7 +1177,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
req->cmd_flags &= ~REQ_FAILFAST_DRIVER;
if (buffer && bufflen) {
- ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
+ ret = blk_rq_map_kern(req, buffer, bufflen, GFP_KERNEL);
if (ret)
goto out;
}
@@ -1609,6 +1629,7 @@ static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
info->is_ready = true;
+ info->endgid = le16_to_cpu(id->endgid);
if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
dev_info(ctrl->device,
"Ignoring bogus Namespace Identifiers\n");
@@ -1649,6 +1670,7 @@ static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
info->is_ready = id->nstat & NVME_NSTAT_NRDY;
info->is_rotational = id->nsfeat & NVME_NS_ROTATIONAL;
info->no_vwc = id->nsfeat & NVME_NS_VWC_NOT_PRESENT;
+ info->endgid = le16_to_cpu(id->endgid);
}
kfree(id);
return ret;
@@ -1674,7 +1696,7 @@ static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
- u32 *result)
+ void *result)
{
return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
buflen, result);
@@ -1683,7 +1705,7 @@ EXPORT_SYMBOL_GPL(nvme_set_features);
int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
- u32 *result)
+ void *result)
{
return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
buflen, result);
@@ -1848,8 +1870,11 @@ static bool nvme_init_integrity(struct nvme_ns_head *head,
break;
}
- bi->tuple_size = head->ms;
- bi->pi_offset = info->pi_offset;
+ bi->metadata_size = head->ms;
+ if (bi->csum_type) {
+ bi->pi_tuple_size = head->pi_size;
+ bi->pi_offset = info->pi_offset;
+ }
return true;
}
@@ -1997,21 +2022,41 @@ static void nvme_configure_metadata(struct nvme_ctrl *ctrl,
}
-static void nvme_update_atomic_write_disk_info(struct nvme_ns *ns,
- struct nvme_id_ns *id, struct queue_limits *lim,
- u32 bs, u32 atomic_bs)
+static u32 nvme_configure_atomic_write(struct nvme_ns *ns,
+ struct nvme_id_ns *id, struct queue_limits *lim, u32 bs)
{
- unsigned int boundary = 0;
+ u32 atomic_bs, boundary = 0;
+
+ /*
+ * We do not support an offset for the atomic boundaries.
+ */
+ if (id->nabo)
+ return bs;
- if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) {
- if (le16_to_cpu(id->nabspf))
+ if ((id->nsfeat & NVME_NS_FEAT_ATOMICS) && id->nawupf) {
+ /*
+ * Use the per-namespace atomic write unit when available.
+ */
+ atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
+ if (id->nabspf)
boundary = (le16_to_cpu(id->nabspf) + 1) * bs;
+ } else {
+ /*
+ * Use the controller wide atomic write unit. This sucks
+ * because the limit is defined in terms of logical blocks while
+ * namespaces can have different formats, and because there is
+ * no clear language in the specification prohibiting different
+ * values for different controllers in the subsystem.
+ */
+ atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
}
+
lim->atomic_write_hw_max = atomic_bs;
lim->atomic_write_hw_boundary = boundary;
lim->atomic_write_hw_unit_min = bs;
lim->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs);
lim->features |= BLK_FEAT_ATOMIC_WRITES;
+ return atomic_bs;
}
static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl)
@@ -2049,34 +2094,8 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
valid = false;
}
- atomic_bs = phys_bs = bs;
- if (id->nabo == 0) {
- /*
- * Bit 1 indicates whether NAWUPF is defined for this namespace
- * and whether it should be used instead of AWUPF. If NAWUPF ==
- * 0 then AWUPF must be used instead.
- */
- if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
- atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
- else
- atomic_bs = (1 + ns->ctrl->awupf) * bs;
-
- /*
- * Set subsystem atomic bs.
- */
- if (ns->ctrl->subsys->atomic_bs) {
- if (atomic_bs != ns->ctrl->subsys->atomic_bs) {
- dev_err_ratelimited(ns->ctrl->device,
- "%s: Inconsistent Atomic Write Size, Namespace will not be added: Subsystem=%d bytes, Controller/Namespace=%d bytes\n",
- ns->disk ? ns->disk->disk_name : "?",
- ns->ctrl->subsys->atomic_bs,
- atomic_bs);
- }
- } else
- ns->ctrl->subsys->atomic_bs = atomic_bs;
-
- nvme_update_atomic_write_disk_info(ns, id, lim, bs, atomic_bs);
- }
+ phys_bs = bs;
+ atomic_bs = nvme_configure_atomic_write(ns, id, lim, bs);
if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
/* NPWG = Namespace Preferred Write Granularity */
@@ -2167,6 +2186,148 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns,
return ret;
}
+static int nvme_query_fdp_granularity(struct nvme_ctrl *ctrl,
+ struct nvme_ns_info *info, u8 fdp_idx)
+{
+ struct nvme_fdp_config_log hdr, *h;
+ struct nvme_fdp_config_desc *desc;
+ size_t size = sizeof(hdr);
+ void *log, *end;
+ int i, n, ret;
+
+ ret = nvme_get_log_lsi(ctrl, 0, NVME_LOG_FDP_CONFIGS, 0,
+ NVME_CSI_NVM, &hdr, size, 0, info->endgid);
+ if (ret) {
+ dev_warn(ctrl->device,
+ "FDP configs log header status:0x%x endgid:%d\n", ret,
+ info->endgid);
+ return ret;
+ }
+
+ size = le32_to_cpu(hdr.sze);
+ if (size > PAGE_SIZE * MAX_ORDER_NR_PAGES) {
+ dev_warn(ctrl->device, "FDP config size too large:%zu\n",
+ size);
+ return 0;
+ }
+
+ h = kvmalloc(size, GFP_KERNEL);
+ if (!h)
+ return -ENOMEM;
+
+ ret = nvme_get_log_lsi(ctrl, 0, NVME_LOG_FDP_CONFIGS, 0,
+ NVME_CSI_NVM, h, size, 0, info->endgid);
+ if (ret) {
+ dev_warn(ctrl->device,
+ "FDP configs log status:0x%x endgid:%d\n", ret,
+ info->endgid);
+ goto out;
+ }
+
+ n = le16_to_cpu(h->numfdpc) + 1;
+ if (fdp_idx > n) {
+ dev_warn(ctrl->device, "FDP index:%d out of range:%d\n",
+ fdp_idx, n);
+ /* Proceed without registering FDP streams */
+ ret = 0;
+ goto out;
+ }
+
+ log = h + 1;
+ desc = log;
+ end = log + size - sizeof(*h);
+ for (i = 0; i < fdp_idx; i++) {
+ log += le16_to_cpu(desc->dsze);
+ desc = log;
+ if (log >= end) {
+ dev_warn(ctrl->device,
+ "FDP invalid config descriptor list\n");
+ ret = 0;
+ goto out;
+ }
+ }
+
+ if (le32_to_cpu(desc->nrg) > 1) {
+ dev_warn(ctrl->device, "FDP NRG > 1 not supported\n");
+ ret = 0;
+ goto out;
+ }
+
+ info->runs = le64_to_cpu(desc->runs);
+out:
+ kvfree(h);
+ return ret;
+}
+
+static int nvme_query_fdp_info(struct nvme_ns *ns, struct nvme_ns_info *info)
+{
+ struct nvme_ns_head *head = ns->head;
+ struct nvme_ctrl *ctrl = ns->ctrl;
+ struct nvme_fdp_ruh_status *ruhs;
+ struct nvme_fdp_config fdp;
+ struct nvme_command c = {};
+ size_t size;
+ int i, ret;
+
+ /*
+ * The FDP configuration is static for the lifetime of the namespace,
+ * so return immediately if we've already registered this namespace's
+ * streams.
+ */
+ if (head->nr_plids)
+ return 0;
+
+ ret = nvme_get_features(ctrl, NVME_FEAT_FDP, info->endgid, NULL, 0,
+ &fdp);
+ if (ret) {
+ dev_warn(ctrl->device, "FDP get feature status:0x%x\n", ret);
+ return ret;
+ }
+
+ if (!(fdp.flags & FDPCFG_FDPE))
+ return 0;
+
+ ret = nvme_query_fdp_granularity(ctrl, info, fdp.fdpcidx);
+ if (!info->runs)
+ return ret;
+
+ size = struct_size(ruhs, ruhsd, S8_MAX - 1);
+ ruhs = kzalloc(size, GFP_KERNEL);
+ if (!ruhs)
+ return -ENOMEM;
+
+ c.imr.opcode = nvme_cmd_io_mgmt_recv;
+ c.imr.nsid = cpu_to_le32(head->ns_id);
+ c.imr.mo = NVME_IO_MGMT_RECV_MO_RUHS;
+ c.imr.numd = cpu_to_le32(nvme_bytes_to_numd(size));
+ ret = nvme_submit_sync_cmd(ns->queue, &c, ruhs, size);
+ if (ret) {
+ dev_warn(ctrl->device, "FDP io-mgmt status:0x%x\n", ret);
+ goto free;
+ }
+
+ head->nr_plids = le16_to_cpu(ruhs->nruhsd);
+ if (!head->nr_plids)
+ goto free;
+
+ head->plids = kcalloc(head->nr_plids, sizeof(*head->plids),
+ GFP_KERNEL);
+ if (!head->plids) {
+ dev_warn(ctrl->device,
+ "failed to allocate %u FDP placement IDs\n",
+ head->nr_plids);
+ head->nr_plids = 0;
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ for (i = 0; i < head->nr_plids; i++)
+ head->plids[i] = le16_to_cpu(ruhs->ruhsd[i].pid);
+free:
+ kfree(ruhs);
+ return ret;
+}
+
static int nvme_update_ns_info_block(struct nvme_ns *ns,
struct nvme_ns_info *info)
{
@@ -2204,6 +2365,12 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
goto out;
}
+ if (ns->ctrl->ctratt & NVME_CTRL_ATTR_FDPS) {
+ ret = nvme_query_fdp_info(ns, info);
+ if (ret < 0)
+ goto out;
+ }
+
lim = queue_limits_start_update(ns->disk->queue);
memflags = blk_mq_freeze_queue(ns->disk->queue);
@@ -2216,16 +2383,6 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
if (!nvme_update_disk_info(ns, id, &lim))
capacity = 0;
- /*
- * Validate the max atomic write size fits within the subsystem's
- * atomic write capabilities.
- */
- if (lim.atomic_write_hw_max > ns->ctrl->subsys->atomic_bs) {
- blk_mq_unfreeze_queue(ns->disk->queue, memflags);
- ret = -ENXIO;
- goto out;
- }
-
nvme_config_discard(ns, &lim);
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
ns->head->ids.csi == NVME_CSI_ZNS)
@@ -2248,13 +2405,11 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
if (!nvme_init_integrity(ns->head, &lim, info))
capacity = 0;
- ret = queue_limits_commit_update(ns->disk->queue, &lim);
- if (ret) {
- blk_mq_unfreeze_queue(ns->disk->queue, memflags);
- goto out;
- }
-
- set_capacity_and_notify(ns->disk, capacity);
+ lim.max_write_streams = ns->head->nr_plids;
+ if (lim.max_write_streams)
+ lim.write_stream_granularity = min(info->runs, U32_MAX);
+ else
+ lim.write_stream_granularity = 0;
/*
* Only set the DEAC bit if the device guarantees that reads from
@@ -2262,8 +2417,18 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
* require that, it must be a no-op if reads from deallocated data
* do not return zeroes.
*/
- if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
+ if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3))) {
ns->head->features |= NVME_NS_DEAC;
+ lim.max_hw_wzeroes_unmap_sectors = lim.max_write_zeroes_sectors;
+ }
+
+ ret = queue_limits_commit_update(ns->disk->queue, &lim);
+ if (ret) {
+ blk_mq_unfreeze_queue(ns->disk->queue, memflags);
+ goto out;
+ }
+
+ set_capacity_and_notify(ns->disk, capacity);
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
set_bit(NVME_NS_READY, &ns->flags);
blk_mq_unfreeze_queue(ns->disk->queue, memflags);
@@ -2351,6 +2516,8 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
ns->head->disk->flags |= GENHD_FL_HIDDEN;
else
nvme_init_integrity(ns->head, &lim, info);
+ lim.max_write_streams = ns_lim->max_write_streams;
+ lim.write_stream_granularity = ns_lim->write_stream_granularity;
ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk));
@@ -2991,6 +3158,11 @@ static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl)
return ctrl->opts && ctrl->opts->discovery_nqn;
}
+static inline bool nvme_admin_ctrl(struct nvme_ctrl *ctrl)
+{
+ return ctrl->cntrltype == NVME_CTRL_ADMIN;
+}
+
static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
@@ -3041,6 +3213,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
memcpy(subsys->model, id->mn, sizeof(subsys->model));
subsys->vendor_id = le16_to_cpu(id->vid);
subsys->cmic = id->cmic;
+ subsys->awupf = le16_to_cpu(id->awupf);
/* Versions prior to 1.4 don't necessarily report a valid type */
if (id->cntrltype == NVME_CTRL_DISC ||
@@ -3108,8 +3281,8 @@ out_unlock:
return ret;
}
-int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
- void *log, size_t size, u64 offset)
+static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
+ u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi)
{
struct nvme_command c = { };
u32 dwlen = nvme_bytes_to_numd(size);
@@ -3123,10 +3296,18 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
c.get_log_page.csi = csi;
+ c.get_log_page.lsi = cpu_to_le16(lsi);
return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
}
+int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
+ void *log, size_t size, u64 offset)
+{
+ return nvme_get_log_lsi(ctrl, nsid, log_page, lsp, csi, log, size,
+ offset, 0);
+}
+
static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
struct nvme_effects_log **log)
{
@@ -3465,7 +3646,6 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
dev_pm_qos_expose_latency_tolerance(ctrl->device);
else if (!ctrl->apst_enabled && prev_apst_enabled)
dev_pm_qos_hide_latency_tolerance(ctrl->device);
- ctrl->awupf = le16_to_cpu(id->awupf);
out_free:
kfree(id);
return ret;
@@ -3495,6 +3675,17 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended)
if (ret)
return ret;
+ if (nvme_admin_ctrl(ctrl)) {
+ /*
+ * An admin controller has one admin queue, but no I/O queues.
+ * Override queue_count so it only creates an admin queue.
+ */
+ dev_dbg(ctrl->device,
+ "Subsystem %s is an administrative controller",
+ ctrl->subsys->subnqn);
+ ctrl->queue_count = 1;
+ }
+
ret = nvme_configure_apst(ctrl);
if (ret < 0)
return ret;
@@ -3584,7 +3775,7 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
*/
if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
continue;
- if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
+ if (nvme_tryget_ns_head(h))
return h;
}
@@ -3828,7 +4019,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
}
} else {
ret = -EINVAL;
- if (!info->is_shared || !head->shared) {
+ if ((!info->is_shared || !head->shared) &&
+ !list_empty(&head->list)) {
dev_err(ctrl->device,
"Duplicate unshared namespace %d\n",
info->nsid);
@@ -3853,6 +4045,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
list_add_tail_rcu(&ns->siblings, &head->list);
ns->head = head;
mutex_unlock(&ctrl->subsys->lock);
+
+#ifdef CONFIG_NVME_MULTIPATH
+ cancel_delayed_work(&head->remove_work);
+#endif
return 0;
out_put_ns_head:
@@ -3897,7 +4093,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
return;
}
}
- list_add(&ns->list, &ns->ctrl->namespaces);
+ list_add_rcu(&ns->list, &ns->ctrl->namespaces);
}
static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
@@ -3906,6 +4102,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
struct nvme_ns *ns;
struct gendisk *disk;
int node = ctrl->numa_node;
+ bool last_path = false;
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
if (!ns)
@@ -3998,9 +4195,22 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
out_unlink_ns:
mutex_lock(&ctrl->subsys->lock);
list_del_rcu(&ns->siblings);
- if (list_empty(&ns->head->list))
+ if (list_empty(&ns->head->list)) {
list_del_init(&ns->head->entry);
+ /*
+ * If multipath is not configured, we still create a namespace
+ * head (nshead), but head->disk is not initialized in that
+ * case. As a result, only a single reference to nshead is held
+ * (via kref_init()) when it is created. Therefore, ensure that
+ * we do not release the reference to nshead twice if head->disk
+ * is not present.
+ */
+ if (ns->head->disk)
+ last_path = true;
+ }
mutex_unlock(&ctrl->subsys->lock);
+ if (last_path)
+ nvme_put_ns_head(ns->head);
nvme_put_ns_head(ns->head);
out_cleanup_disk:
put_disk(disk);
@@ -4032,7 +4242,8 @@ static void nvme_ns_remove(struct nvme_ns *ns)
mutex_lock(&ns->ctrl->subsys->lock);
list_del_rcu(&ns->siblings);
if (list_empty(&ns->head->list)) {
- list_del_init(&ns->head->entry);
+ if (!nvme_mpath_queue_if_no_path(ns->head))
+ list_del_init(&ns->head->entry);
last_path = true;
}
mutex_unlock(&ns->ctrl->subsys->lock);
@@ -4053,7 +4264,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
synchronize_srcu(&ns->ctrl->srcu);
if (last_path)
- nvme_mpath_shutdown_disk(ns->head);
+ nvme_mpath_remove_disk(ns->head);
nvme_put_ns(ns);
}
@@ -4105,7 +4316,7 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
}
/*
- * If available try to use the Command Set Idependent Identify Namespace
+ * If available try to use the Command Set Independent Identify Namespace
* data structure to find all the generic information that is needed to
* set up a namespace. If not fall back to the legacy version.
*/
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 93e9041b9657..2e58a7ce1090 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -582,7 +582,7 @@ EXPORT_SYMBOL_GPL(nvmf_connect_io_queue);
* Do not retry when:
*
* - the DNR bit is set and the specification states no further connect
- * attempts with the same set of paramenters should be attempted.
+ * attempts with the same set of parameters should be attempted.
*
* - when the authentication attempt fails, because the key was invalid.
* This error code is set on the host side.
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 9cf5b020adba..1b58ee7d0dce 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -80,7 +80,7 @@ enum {
* @transport: Holds the fabric transport "technology name" (for a lack of
* better description) that will be used by an NVMe controller
* being added.
- * @subsysnqn: Hold the fully qualified NQN subystem name (format defined
+ * @subsysnqn: Hold the fully qualified NQN subsystem name (format defined
* in the NVMe specification, "NVMe Qualified Names").
* @traddr: The transport-specific TRADDR field for a port on the
* subsystem which is adding a controller.
@@ -156,7 +156,7 @@ struct nvmf_ctrl_options {
* @create_ctrl(): function pointer that points to a non-NVMe
* implementation-specific fabric technology
* that would go into starting up that fabric
- * for the purpose of conneciton to an NVMe controller
+ * for the purpose of connection to an NVMe controller
* using that fabric technology.
*
* Notes:
@@ -165,7 +165,7 @@ struct nvmf_ctrl_options {
* 2. create_ctrl() must be defined (even if it does nothing)
* 3. struct nvmf_transport_ops must be statically allocated in the
* modules .bss section so that a pure module_get on @module
- * prevents the memory from beeing freed.
+ * prevents the memory from being freed.
*/
struct nvmf_transport_ops {
struct list_head entry;
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 2257c3c96dd2..3e12d4683ac7 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -899,7 +899,7 @@ EXPORT_SYMBOL_GPL(nvme_fc_set_remoteport_devloss);
* may crash.
*
* As such:
- * Wrapper all the dma routines and check the dev pointer.
+ * Wrap all the dma routines and check the dev pointer.
*
* If simple mappings (return just a dma address, we'll noop them,
* returning a dma address of 0.
@@ -1363,7 +1363,7 @@ nvme_fc_disconnect_assoc_done(struct nvmefc_ls_req *lsreq, int status)
* down, and the related FC-NVME Association ID and Connection IDs
* become invalid.
*
- * The behavior of the fc-nvme initiator is such that it's
+ * The behavior of the fc-nvme initiator is such that its
* understanding of the association and connections will implicitly
* be torn down. The action is implicit as it may be due to a loss of
* connectivity with the fc-nvme target, so you may never get a
@@ -1410,9 +1410,8 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
}
static void
-nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
+nvme_fc_xmt_ls_rsp_free(struct nvmefc_ls_rcv_op *lsop)
{
- struct nvmefc_ls_rcv_op *lsop = lsrsp->nvme_fc_private;
struct nvme_fc_rport *rport = lsop->rport;
struct nvme_fc_lport *lport = rport->lport;
unsigned long flags;
@@ -1434,6 +1433,14 @@ nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
}
static void
+nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
+{
+ struct nvmefc_ls_rcv_op *lsop = lsrsp->nvme_fc_private;
+
+ nvme_fc_xmt_ls_rsp_free(lsop);
+}
+
+static void
nvme_fc_xmt_ls_rsp(struct nvmefc_ls_rcv_op *lsop)
{
struct nvme_fc_rport *rport = lsop->rport;
@@ -1450,7 +1457,7 @@ nvme_fc_xmt_ls_rsp(struct nvmefc_ls_rcv_op *lsop)
dev_warn(lport->dev,
"LLDD rejected LS RSP xmt: LS %d status %d\n",
w0->ls_cmd, ret);
- nvme_fc_xmt_ls_rsp_done(lsop->lsrsp);
+ nvme_fc_xmt_ls_rsp_free(lsop);
return;
}
}
@@ -1948,8 +1955,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
}
/*
- * For the linux implementation, if we have an unsuccesful
- * status, they blk-mq layer can typically be called with the
+ * For the linux implementation, if we have an unsuccessful
+ * status, the blk-mq layer can typically be called with the
* non-zero status and the content of the cqe isn't important.
*/
if (status)
@@ -2422,7 +2429,7 @@ static bool nvme_fc_terminate_exchange(struct request *req, void *data)
/*
* This routine runs through all outstanding commands on the association
- * and aborts them. This routine is typically be called by the
+ * and aborts them. This routine is typically called by the
* delete_association routine. It is also called due to an error during
* reconnect. In that scenario, it is most likely a command that initializes
* the controller, including fabric Connect commands on io queues, that
@@ -2472,7 +2479,7 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
* writing the registers for shutdown and polling (call
* nvme_disable_ctrl()). Given a bunch of i/o was potentially
* just aborted and we will wait on those contexts, and given
- * there was no indication of how live the controlelr is on the
+ * there was no indication of how live the controller is on the
* link, don't send more io to create more contexts for the
* shutdown. Let the controller fail via keepalive failure if
* its still present.
@@ -2615,7 +2622,7 @@ nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq,
* as part of the exchange. The CQE is the last thing for the io,
* which is transferred (explicitly or implicitly) with the RSP IU
* sent on the exchange. After the CQE is received, the FC exchange is
- * terminaed and the Exchange may be used on a different io.
+ * terminated and the Exchange may be used on a different io.
*
* The transport to LLDD api has the transport making a request for a
* new fcp io request to the LLDD. The LLDD then allocates a FC exchange
@@ -2770,7 +2777,7 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx,
* as WRITE ZEROES will return a non-zero rq payload_bytes yet
* there is no actual payload to be transferred.
* To get it right, key data transmission on there being 1 or
- * more physical segments in the sg list. If there is no
+ * more physical segments in the sg list. If there are no
* physical segments, there is no payload.
*/
if (blk_rq_nr_phys_segments(rq)) {
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index ca86d3bf7ea4..6b3ac8ae3f34 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -429,21 +429,14 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
pdu->result = le64_to_cpu(nvme_req(req)->result.u64);
/*
- * For iopoll, complete it directly. Note that using the uring_cmd
- * helper for this is safe only because we check blk_rq_is_poll().
- * As that returns false if we're NOT on a polled queue, then it's
- * safe to use the polled completion helper.
- *
- * Otherwise, move the completion to task work.
+ * IOPOLL could potentially complete this request directly, but
+ * if multiple rings are polling on the same queue, then it's possible
+ * for one ring to find completions for another ring. Punting the
+ * completion via task_work will always direct it to the right
+ * location, rather than potentially complete requests for ringA
+ * under iopoll invocations from ringB.
*/
- if (blk_rq_is_poll(req)) {
- if (pdu->bio)
- blk_rq_unmap_user(pdu->bio);
- io_uring_cmd_iopoll_done(ioucmd, pdu->result, pdu->status);
- } else {
- io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
- }
-
+ io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
return RQ_END_IO_FREE;
}
@@ -493,13 +486,15 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
d.timeout_ms = READ_ONCE(cmd->timeout_ms);
if (d.data_len && (ioucmd->flags & IORING_URING_CMD_FIXED)) {
- /* fixedbufs is only for non-vectored io */
- if (vec)
- return -EINVAL;
+ int ddir = nvme_is_write(&c) ? WRITE : READ;
- ret = io_uring_cmd_import_fixed(d.addr, d.data_len,
- nvme_is_write(&c) ? WRITE : READ, &iter, ioucmd,
- issue_flags);
+ if (vec)
+ ret = io_uring_cmd_import_fixed_vec(ioucmd,
+ u64_to_user_ptr(d.addr), d.data_len,
+ ddir, &iter, issue_flags);
+ else
+ ret = io_uring_cmd_import_fixed(d.addr, d.data_len,
+ ddir, &iter, ioucmd, issue_flags);
if (ret < 0)
return ret;
@@ -521,7 +516,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
if (d.data_len) {
ret = nvme_map_user_request(req, d.addr, d.data_len,
nvme_to_user_ptr(d.metadata), d.metadata_len,
- map_iter, vec);
+ map_iter, vec ? NVME_IOCTL_VEC : 0);
if (ret)
goto out_free_req;
}
@@ -727,7 +722,7 @@ int nvme_ns_head_ioctl(struct block_device *bdev, blk_mode_t mode,
/*
* Handle ioctls that apply to the controller instead of the namespace
- * seperately and drop the ns SRCU reference early. This avoids a
+ * separately and drop the ns SRCU reference early. This avoids a
* deadlock when deleting namespaces using the passthrough interface.
*/
if (is_ctrl_ioctl(cmd))
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index cf0ef4745564..3da980dc60d9 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -10,10 +10,61 @@
#include "nvme.h"
bool multipath = true;
-module_param(multipath, bool, 0444);
+static bool multipath_always_on;
+
+static int multipath_param_set(const char *val, const struct kernel_param *kp)
+{
+ int ret;
+ bool *arg = kp->arg;
+
+ ret = param_set_bool(val, kp);
+ if (ret)
+ return ret;
+
+ if (multipath_always_on && !*arg) {
+ pr_err("Can't disable multipath when multipath_always_on is configured.\n");
+ *arg = true;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct kernel_param_ops multipath_param_ops = {
+ .set = multipath_param_set,
+ .get = param_get_bool,
+};
+
+module_param_cb(multipath, &multipath_param_ops, &multipath, 0444);
MODULE_PARM_DESC(multipath,
"turn on native support for multiple controllers per subsystem");
+static int multipath_always_on_set(const char *val,
+ const struct kernel_param *kp)
+{
+ int ret;
+ bool *arg = kp->arg;
+
+ ret = param_set_bool(val, kp);
+ if (ret < 0)
+ return ret;
+
+ if (*arg)
+ multipath = true;
+
+ return 0;
+}
+
+static const struct kernel_param_ops multipath_always_on_ops = {
+ .set = multipath_always_on_set,
+ .get = param_get_bool,
+};
+
+module_param_cb(multipath_always_on, &multipath_always_on_ops,
+ &multipath_always_on, 0444);
+MODULE_PARM_DESC(multipath_always_on,
+ "create multipath node always except for private namespace with non-unique nsid; note that this also implicitly enables native multipath support");
+
static const char *nvme_iopolicy_names[] = {
[NVME_IOPOLICY_NUMA] = "numa",
[NVME_IOPOLICY_RR] = "round-robin",
@@ -442,7 +493,17 @@ static bool nvme_available_path(struct nvme_ns_head *head)
break;
}
}
- return false;
+
+ /*
+ * If "head->delayed_removal_secs" is configured (i.e., non-zero), do
+ * not immediately fail I/O. Instead, requeue the I/O for the configured
+ * duration, anticipating that if there's a transient link failure then
+ * it may recover within this time window. This parameter is exported to
+ * userspace via sysfs, and its default value is zero. It is internally
+ * mapped to NVME_NSHEAD_QUEUE_IF_NO_PATH. When delayed_removal_secs is
+ * non-zero, this flag is set to true. When zero, the flag is cleared.
+ */
+ return nvme_mpath_queue_if_no_path(head);
}
static void nvme_ns_head_submit_bio(struct bio *bio)
@@ -617,6 +678,40 @@ static void nvme_requeue_work(struct work_struct *work)
}
}
+static void nvme_remove_head(struct nvme_ns_head *head)
+{
+ if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
+ /*
+ * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
+ * to allow multipath to fail all I/O.
+ */
+ kblockd_schedule_work(&head->requeue_work);
+
+ nvme_cdev_del(&head->cdev, &head->cdev_device);
+ synchronize_srcu(&head->srcu);
+ del_gendisk(head->disk);
+ }
+ nvme_put_ns_head(head);
+}
+
+static void nvme_remove_head_work(struct work_struct *work)
+{
+ struct nvme_ns_head *head = container_of(to_delayed_work(work),
+ struct nvme_ns_head, remove_work);
+ bool remove = false;
+
+ mutex_lock(&head->subsys->lock);
+ if (list_empty(&head->list)) {
+ list_del_init(&head->entry);
+ remove = true;
+ }
+ mutex_unlock(&head->subsys->lock);
+ if (remove)
+ nvme_remove_head(head);
+
+ module_put(THIS_MODULE);
+}
+
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
struct queue_limits lim;
@@ -626,14 +721,25 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
spin_lock_init(&head->requeue_lock);
INIT_WORK(&head->requeue_work, nvme_requeue_work);
INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work);
+ INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work);
+ head->delayed_removal_secs = 0;
/*
- * Add a multipath node if the subsystems supports multiple controllers.
- * We also do this for private namespaces as the namespace sharing flag
- * could change after a rescan.
+ * If "multipath_always_on" is enabled, a multipath node is added
+ * regardless of whether the disk is single/multi ported, and whether
+ * the namespace is shared or private. If "multipath_always_on" is not
+ * enabled, a multipath node is added only if the subsystem supports
+ * multiple controllers and the "multipath" option is configured. In
+ * either case, for private namespaces, we ensure that the NSID is
+ * unique.
*/
- if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
- !nvme_is_unique_nsid(ctrl, head) || !multipath)
+ if (!multipath_always_on) {
+ if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
+ !multipath)
+ return 0;
+ }
+
+ if (!nvme_is_unique_nsid(ctrl, head))
return 0;
blk_set_stacking_limits(&lim);
@@ -654,12 +760,13 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
* controller's scan_work context. If a path error occurs here, the IO
* will wait until a path becomes available or all paths are torn down,
* but that action also occurs within scan_work, so it would deadlock.
- * Defer the partion scan to a different context that does not block
+ * Defer the partition scan to a different context that does not block
* scan_work.
*/
set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
sprintf(head->disk->disk_name, "nvme%dn%d",
ctrl->subsys->instance, head->instance);
+ nvme_tryget_ns_head(head);
return 0;
}
@@ -891,7 +998,7 @@ void nvme_mpath_update(struct nvme_ctrl *ctrl)
static void nvme_anatt_timeout(struct timer_list *t)
{
- struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);
+ struct nvme_ctrl *ctrl = timer_container_of(ctrl, t, anatt_timer);
dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
nvme_reset_ctrl(ctrl);
@@ -1016,6 +1123,49 @@ static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr
}
DEVICE_ATTR_RO(numa_nodes);
+static ssize_t delayed_removal_secs_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct nvme_ns_head *head = disk->private_data;
+ int ret;
+
+ mutex_lock(&head->subsys->lock);
+ ret = sysfs_emit(buf, "%u\n", head->delayed_removal_secs);
+ mutex_unlock(&head->subsys->lock);
+ return ret;
+}
+
+static ssize_t delayed_removal_secs_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct nvme_ns_head *head = disk->private_data;
+ unsigned int sec;
+ int ret;
+
+ ret = kstrtouint(buf, 0, &sec);
+ if (ret < 0)
+ return ret;
+
+ mutex_lock(&head->subsys->lock);
+ head->delayed_removal_secs = sec;
+ if (sec)
+ set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
+ else
+ clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
+ mutex_unlock(&head->subsys->lock);
+ /*
+ * Ensure that update to NVME_NSHEAD_QUEUE_IF_NO_PATH is seen
+ * by its reader.
+ */
+ synchronize_srcu(&head->srcu);
+
+ return count;
+}
+
+DEVICE_ATTR_RW(delayed_removal_secs);
+
static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
struct nvme_ana_group_desc *desc, void *data)
{
@@ -1050,7 +1200,8 @@ void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head)
*/
srcu_idx = srcu_read_lock(&head->srcu);
- list_for_each_entry_rcu(ns, &head->list, siblings) {
+ list_for_each_entry_srcu(ns, &head->list, siblings,
+ srcu_read_lock_held(&head->srcu)) {
/*
* Ensure that ns path disk node is already added otherwise we
* may get invalid kobj name for target
@@ -1137,23 +1288,46 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
#endif
}
-void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
+void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
+ bool remove = false;
+
if (!head->disk)
return;
- if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
- nvme_cdev_del(&head->cdev, &head->cdev_device);
+
+ mutex_lock(&head->subsys->lock);
+ /*
+ * We are called when all paths have been removed, and at that point
+ * head->list is expected to be empty. However, nvme_remove_ns() and
+ * nvme_init_ns_head() can run concurrently and so if head->delayed_
+ * removal_secs is configured, it is possible that by the time we reach
+ * this point, head->list may no longer be empty. Therefore, we recheck
+ * head->list here. If it is no longer empty then we skip enqueuing the
+ * delayed head removal work.
+ */
+ if (!list_empty(&head->list))
+ goto out;
+
+ if (head->delayed_removal_secs) {
/*
- * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
- * to allow multipath to fail all I/O.
+ * Ensure that no one could remove this module while the head
+ * remove work is pending.
*/
- synchronize_srcu(&head->srcu);
- kblockd_schedule_work(&head->requeue_work);
- del_gendisk(head->disk);
+ if (!try_module_get(THIS_MODULE))
+ goto out;
+ mod_delayed_work(nvme_wq, &head->remove_work,
+ head->delayed_removal_secs * HZ);
+ } else {
+ list_del_init(&head->entry);
+ remove = true;
}
+out:
+ mutex_unlock(&head->subsys->lock);
+ if (remove)
+ nvme_remove_head(head);
}
-void nvme_mpath_remove_disk(struct nvme_ns_head *head)
+void nvme_mpath_put_disk(struct nvme_ns_head *head)
{
if (!head->disk)
return;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 8fc4683418a3..cfd2b5b90b91 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -69,7 +69,7 @@ enum nvme_quirks {
NVME_QUIRK_IDENTIFY_CNS = (1 << 1),
/*
- * The controller deterministically returns O's on reads to
+ * The controller deterministically returns 0's on reads to
* logical blocks that deallocate was called on.
*/
NVME_QUIRK_DEALLOCATE_ZEROES = (1 << 2),
@@ -410,7 +410,6 @@ struct nvme_ctrl {
enum nvme_ctrl_type cntrltype;
enum nvme_dctype dctype;
- u16 awupf; /* 0's based value. */
};
static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
@@ -443,11 +442,11 @@ struct nvme_subsystem {
u8 cmic;
enum nvme_subsys_type subtype;
u16 vendor_id;
+ u16 awupf; /* 0's based value. */
struct ida ns_ida;
#ifdef CONFIG_NVME_MULTIPATH
enum nvme_iopolicy iopolicy;
#endif
- u32 atomic_bs;
};
/*
@@ -497,6 +496,9 @@ struct nvme_ns_head {
struct device cdev_device;
struct gendisk *disk;
+
+ u16 nr_plids;
+ u16 *plids;
#ifdef CONFIG_NVME_MULTIPATH
struct bio_list requeue_list;
spinlock_t requeue_lock;
@@ -504,7 +506,10 @@ struct nvme_ns_head {
struct work_struct partition_scan_work;
struct mutex lock;
unsigned long flags;
-#define NVME_NSHEAD_DISK_LIVE 0
+ struct delayed_work remove_work;
+ unsigned int delayed_removal_secs;
+#define NVME_NSHEAD_DISK_LIVE 0
+#define NVME_NSHEAD_QUEUE_IF_NO_PATH 1
struct nvme_ns __rcu *current_path[];
#endif
};
@@ -517,7 +522,7 @@ static inline bool nvme_ns_head_multipath(struct nvme_ns_head *head)
enum nvme_ns_features {
NVME_NS_EXT_LBAS = 1 << 0, /* support extended LBA format */
NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */
- NVME_NS_DEAC = 1 << 2, /* DEAC bit in Write Zeores supported */
+ NVME_NS_DEAC = 1 << 2, /* DEAC bit in Write Zeroes supported */
};
struct nvme_ns {
@@ -897,10 +902,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
int qid, nvme_submit_flags_t flags);
int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
- u32 *result);
+ void *result);
int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
- u32 *result);
+ void *result);
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
@@ -961,7 +966,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
void nvme_mpath_add_sysfs_link(struct nvme_ns_head *ns);
void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns);
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid);
-void nvme_mpath_remove_disk(struct nvme_ns_head *head);
+void nvme_mpath_put_disk(struct nvme_ns_head *head);
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl);
void nvme_mpath_update(struct nvme_ctrl *ctrl);
@@ -970,7 +975,7 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl);
bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
void nvme_mpath_revalidate_paths(struct nvme_ns *ns);
void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
-void nvme_mpath_shutdown_disk(struct nvme_ns_head *head);
+void nvme_mpath_remove_disk(struct nvme_ns_head *head);
void nvme_mpath_start_request(struct request *rq);
void nvme_mpath_end_request(struct request *rq);
@@ -987,12 +992,19 @@ extern struct device_attribute dev_attr_ana_grpid;
extern struct device_attribute dev_attr_ana_state;
extern struct device_attribute dev_attr_queue_depth;
extern struct device_attribute dev_attr_numa_nodes;
+extern struct device_attribute dev_attr_delayed_removal_secs;
extern struct device_attribute subsys_attr_iopolicy;
static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
{
return disk->fops == &nvme_ns_head_ops;
}
+static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head)
+{
+ if (test_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags))
+ return true;
+ return false;
+}
#else
#define multipath false
static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
@@ -1013,7 +1025,7 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,
static inline void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
}
-static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
+static inline void nvme_mpath_put_disk(struct nvme_ns_head *head)
{
}
static inline void nvme_mpath_add_sysfs_link(struct nvme_ns *ns)
@@ -1032,7 +1044,7 @@ static inline void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
}
-static inline void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
+static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
}
static inline void nvme_trace_bio_complete(struct request *req)
@@ -1080,6 +1092,10 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
{
return false;
}
+static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head)
+{
+ return false;
+}
#endif /* CONFIG_NVME_MULTIPATH */
int nvme_ns_get_unique_id(struct nvme_ns *ns, u8 id[16],
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f1dd804151b1..2c6d9506b172 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -7,7 +7,7 @@
#include <linux/acpi.h>
#include <linux/async.h>
#include <linux/blkdev.h>
-#include <linux/blk-mq.h>
+#include <linux/blk-mq-dma.h>
#include <linux/blk-integrity.h>
#include <linux/dmi.h>
#include <linux/init.h>
@@ -18,6 +18,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/nodemask.h>
#include <linux/once.h>
#include <linux/pci.h>
#include <linux/suspend.h>
@@ -26,7 +27,6 @@
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/io-64-nonatomic-hi-lo.h>
#include <linux/sed-opal.h>
-#include <linux/pci-p2pdma.h>
#include "trace.h"
#include "nvme.h"
@@ -34,16 +34,43 @@
#define SQ_SIZE(q) ((q)->q_depth << (q)->sqes)
#define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion))
-#define SGES_PER_PAGE (NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc))
+/* Optimisation for I/Os between 4k and 128k */
+#define NVME_SMALL_POOL_SIZE 256
/*
- * These can be higher, but we need to ensure that any command doesn't
- * require an sg allocation that needs more than a page of data.
+ * Arbitrary upper bound.
*/
-#define NVME_MAX_KB_SZ 8192
-#define NVME_MAX_SEGS 128
-#define NVME_MAX_META_SEGS 15
-#define NVME_MAX_NR_ALLOCATIONS 5
+#define NVME_MAX_BYTES SZ_8M
+#define NVME_MAX_NR_DESCRIPTORS 5
+
+/*
+ * For data SGLs we support a single descriptors worth of SGL entries.
+ * For PRPs, segments don't matter at all.
+ */
+#define NVME_MAX_SEGS \
+ (NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc))
+
+/*
+ * For metadata SGLs, only the small descriptor is supported, and the first
+ * entry is the segment descriptor, which for the data pointer sits in the SQE.
+ */
+#define NVME_MAX_META_SEGS \
+ ((NVME_SMALL_POOL_SIZE / sizeof(struct nvme_sgl_desc)) - 1)
+
+/*
+ * The last entry is used to link to the next descriptor.
+ */
+#define PRPS_PER_PAGE \
+ (((NVME_CTRL_PAGE_SIZE / sizeof(__le64))) - 1)
+
+/*
+ * I/O could be non-aligned both at the beginning and end.
+ */
+#define MAX_PRP_RANGE \
+ (NVME_MAX_BYTES + 2 * (NVME_CTRL_PAGE_SIZE - 1))
+
+static_assert(MAX_PRP_RANGE / NVME_CTRL_PAGE_SIZE <=
+ (1 /* prp1 */ + NVME_MAX_NR_DESCRIPTORS * PRPS_PER_PAGE));
static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0444);
@@ -81,7 +108,7 @@ static int io_queue_count_set(const char *val, const struct kernel_param *kp)
int ret;
ret = kstrtouint(val, 10, &n);
- if (ret != 0 || n > num_possible_cpus())
+ if (ret != 0 || n > blk_mq_num_possible_queues(0))
return -EINVAL;
return param_set_uint(val, kp);
}
@@ -112,6 +139,11 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
static void nvme_delete_io_queues(struct nvme_dev *dev);
static void nvme_update_attrs(struct nvme_dev *dev);
+struct nvme_descriptor_pools {
+ struct dma_pool *large;
+ struct dma_pool *small;
+};
+
/*
* Represents an NVM Express device. Each nvme_dev is a PCI function.
*/
@@ -121,8 +153,6 @@ struct nvme_dev {
struct blk_mq_tag_set admin_tagset;
u32 __iomem *dbs;
struct device *dev;
- struct dma_pool *prp_page_pool;
- struct dma_pool *prp_small_pool;
unsigned online_queues;
unsigned max_qid;
unsigned io_queues[HCTX_MAX_TYPES];
@@ -143,7 +173,7 @@ struct nvme_dev {
bool hmb;
struct sg_table *hmb_sgt;
- mempool_t *iod_mempool;
+ mempool_t *dmavec_mempool;
mempool_t *iod_meta_mempool;
/* shadow doorbell buffer support: */
@@ -162,6 +192,7 @@ struct nvme_dev {
unsigned int nr_allocated_queues;
unsigned int nr_write_queues;
unsigned int nr_poll_queues;
+ struct nvme_descriptor_pools descriptor_pools[];
};
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -191,6 +222,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
*/
struct nvme_queue {
struct nvme_dev *dev;
+ struct nvme_descriptor_pools descriptor_pools;
spinlock_t sq_lock;
void *sq_cmds;
/* only used for poll queues: */
@@ -219,30 +251,41 @@ struct nvme_queue {
struct completion delete_done;
};
-union nvme_descriptor {
- struct nvme_sgl_desc *sg_list;
- __le64 *prp_list;
+/* bits for iod->flags */
+enum nvme_iod_flags {
+ /* this command has been aborted by the timeout handler */
+ IOD_ABORTED = 1U << 0,
+
+ /* uses the small descriptor pool */
+ IOD_SMALL_DESCRIPTOR = 1U << 1,
+
+ /* single segment dma mapping */
+ IOD_SINGLE_SEGMENT = 1U << 2,
+};
+
+struct nvme_dma_vec {
+ dma_addr_t addr;
+ unsigned int len;
};
/*
* The nvme_iod describes the data in an I/O.
- *
- * The sg pointer contains the list of PRP/SGL chunk allocations in addition
- * to the actual struct scatterlist.
*/
struct nvme_iod {
struct nvme_request req;
struct nvme_command cmd;
- bool aborted;
- s8 nr_allocations; /* PRP list pool allocations. 0 means small
- pool in use */
- unsigned int dma_len; /* length of single DMA segment mapping */
- dma_addr_t first_dma;
+ u8 flags;
+ u8 nr_descriptors;
+
+ unsigned int total_len;
+ struct dma_iova_state dma_state;
+ void *descriptors[NVME_MAX_NR_DESCRIPTORS];
+ struct nvme_dma_vec *dma_vecs;
+ unsigned int nr_dma_vecs;
+
dma_addr_t meta_dma;
- struct sg_table sgt;
struct sg_table meta_sgt;
- union nvme_descriptor meta_list;
- union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS];
+ struct nvme_sgl_desc *meta_descriptor;
};
static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
@@ -385,40 +428,76 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db,
return true;
}
-/*
- * Will slightly overestimate the number of pages needed. This is OK
- * as it only leads to a small amount of wasted memory for the lifetime of
- * the I/O.
- */
-static __always_inline int nvme_pci_npages_prp(void)
+static struct nvme_descriptor_pools *
+nvme_setup_descriptor_pools(struct nvme_dev *dev, unsigned numa_node)
+{
+ struct nvme_descriptor_pools *pools = &dev->descriptor_pools[numa_node];
+ size_t small_align = NVME_SMALL_POOL_SIZE;
+
+ if (pools->small)
+ return pools; /* already initialized */
+
+ pools->large = dma_pool_create_node("nvme descriptor page", dev->dev,
+ NVME_CTRL_PAGE_SIZE, NVME_CTRL_PAGE_SIZE, 0, numa_node);
+ if (!pools->large)
+ return ERR_PTR(-ENOMEM);
+
+ if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512)
+ small_align = 512;
+
+ pools->small = dma_pool_create_node("nvme descriptor small", dev->dev,
+ NVME_SMALL_POOL_SIZE, small_align, 0, numa_node);
+ if (!pools->small) {
+ dma_pool_destroy(pools->large);
+ pools->large = NULL;
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return pools;
+}
+
+static void nvme_release_descriptor_pools(struct nvme_dev *dev)
{
- unsigned max_bytes = (NVME_MAX_KB_SZ * 1024) + NVME_CTRL_PAGE_SIZE;
- unsigned nprps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE);
- return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8);
+ unsigned i;
+
+ for (i = 0; i < nr_node_ids; i++) {
+ struct nvme_descriptor_pools *pools = &dev->descriptor_pools[i];
+
+ dma_pool_destroy(pools->large);
+ dma_pool_destroy(pools->small);
+ }
}
-static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
- unsigned int hctx_idx)
+static int nvme_init_hctx_common(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned qid)
{
struct nvme_dev *dev = to_nvme_dev(data);
- struct nvme_queue *nvmeq = &dev->queues[0];
+ struct nvme_queue *nvmeq = &dev->queues[qid];
+ struct nvme_descriptor_pools *pools;
+ struct blk_mq_tags *tags;
- WARN_ON(hctx_idx != 0);
- WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
+ tags = qid ? dev->tagset.tags[qid - 1] : dev->admin_tagset.tags[0];
+ WARN_ON(tags != hctx->tags);
+ pools = nvme_setup_descriptor_pools(dev, hctx->numa_node);
+ if (IS_ERR(pools))
+ return PTR_ERR(pools);
+ nvmeq->descriptor_pools = *pools;
hctx->driver_data = nvmeq;
return 0;
}
-static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
- unsigned int hctx_idx)
+static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned int hctx_idx)
{
- struct nvme_dev *dev = to_nvme_dev(data);
- struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
+ WARN_ON(hctx_idx != 0);
+ return nvme_init_hctx_common(hctx, data, 0);
+}
- WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
- hctx->driver_data = nvmeq;
- return 0;
+static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned int hctx_idx)
+{
+ return nvme_init_hctx_common(hctx, data, hctx_idx + 1);
}
static int nvme_pci_init_request(struct blk_mq_tag_set *set,
@@ -509,184 +588,302 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
spin_unlock(&nvmeq->sq_lock);
}
-static inline bool nvme_pci_metadata_use_sgls(struct nvme_dev *dev,
- struct request *req)
+enum nvme_use_sgl {
+ SGL_UNSUPPORTED,
+ SGL_SUPPORTED,
+ SGL_FORCED,
+};
+
+static inline bool nvme_pci_metadata_use_sgls(struct request *req)
{
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct nvme_dev *dev = nvmeq->dev;
+
if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl))
return false;
return req->nr_integrity_segments > 1 ||
nvme_req(req)->flags & NVME_REQ_USERCMD;
}
-static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
- int nseg)
+static inline enum nvme_use_sgl nvme_pci_use_sgls(struct nvme_dev *dev,
+ struct request *req)
{
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
- unsigned int avg_seg_size;
- avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
+ if (nvmeq->qid && nvme_ctrl_sgl_supported(&dev->ctrl)) {
+ if (nvme_req(req)->flags & NVME_REQ_USERCMD)
+ return SGL_FORCED;
+ if (req->nr_integrity_segments > 1)
+ return SGL_FORCED;
+ return SGL_SUPPORTED;
+ }
- if (!nvme_ctrl_sgl_supported(&dev->ctrl))
- return false;
- if (!nvmeq->qid)
- return false;
- if (nvme_pci_metadata_use_sgls(dev, req))
- return true;
- if (!sgl_threshold || avg_seg_size < sgl_threshold)
- return nvme_req(req)->flags & NVME_REQ_USERCMD;
- return true;
+ return SGL_UNSUPPORTED;
+}
+
+static unsigned int nvme_pci_avg_seg_size(struct request *req)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ unsigned int nseg;
+
+ if (blk_rq_dma_map_coalesce(&iod->dma_state))
+ nseg = 1;
+ else
+ nseg = blk_rq_nr_phys_segments(req);
+ return DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
+}
+
+static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq,
+ struct nvme_iod *iod)
+{
+ if (iod->flags & IOD_SMALL_DESCRIPTOR)
+ return nvmeq->descriptor_pools.small;
+ return nvmeq->descriptor_pools.large;
}
-static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
+static inline bool nvme_pci_cmd_use_sgl(struct nvme_command *cmd)
{
+ return cmd->common.flags &
+ (NVME_CMD_SGL_METABUF | NVME_CMD_SGL_METASEG);
+}
+
+static inline dma_addr_t nvme_pci_first_desc_dma_addr(struct nvme_command *cmd)
+{
+ if (nvme_pci_cmd_use_sgl(cmd))
+ return le64_to_cpu(cmd->common.dptr.sgl.addr);
+ return le64_to_cpu(cmd->common.dptr.prp2);
+}
+
+static void nvme_free_descriptors(struct request *req)
+{
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- dma_addr_t dma_addr = iod->first_dma;
+ dma_addr_t dma_addr = nvme_pci_first_desc_dma_addr(&iod->cmd);
int i;
- for (i = 0; i < iod->nr_allocations; i++) {
- __le64 *prp_list = iod->list[i].prp_list;
+ if (iod->nr_descriptors == 1) {
+ dma_pool_free(nvme_dma_pool(nvmeq, iod), iod->descriptors[0],
+ dma_addr);
+ return;
+ }
+
+ for (i = 0; i < iod->nr_descriptors; i++) {
+ __le64 *prp_list = iod->descriptors[i];
dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
- dma_pool_free(dev->prp_page_pool, prp_list, dma_addr);
+ dma_pool_free(nvmeq->descriptor_pools.large, prp_list,
+ dma_addr);
dma_addr = next_dma_addr;
}
}
-static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
+static void nvme_free_prps(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ unsigned int i;
- if (iod->dma_len) {
- dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
- rq_dma_dir(req));
- return;
+ for (i = 0; i < iod->nr_dma_vecs; i++)
+ dma_unmap_page(nvmeq->dev->dev, iod->dma_vecs[i].addr,
+ iod->dma_vecs[i].len, rq_dma_dir(req));
+ mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool);
+}
+
+static void nvme_free_sgls(struct request *req)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct device *dma_dev = nvmeq->dev->dev;
+ dma_addr_t sqe_dma_addr = le64_to_cpu(iod->cmd.common.dptr.sgl.addr);
+ unsigned int sqe_dma_len = le32_to_cpu(iod->cmd.common.dptr.sgl.length);
+ struct nvme_sgl_desc *sg_list = iod->descriptors[0];
+ enum dma_data_direction dir = rq_dma_dir(req);
+
+ if (iod->nr_descriptors) {
+ unsigned int nr_entries = sqe_dma_len / sizeof(*sg_list), i;
+
+ for (i = 0; i < nr_entries; i++)
+ dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr),
+ le32_to_cpu(sg_list[i].length), dir);
+ } else {
+ dma_unmap_page(dma_dev, sqe_dma_addr, sqe_dma_len, dir);
}
+}
- WARN_ON_ONCE(!iod->sgt.nents);
+static void nvme_unmap_data(struct request *req)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct device *dma_dev = nvmeq->dev->dev;
- dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
+ if (iod->flags & IOD_SINGLE_SEGMENT) {
+ static_assert(offsetof(union nvme_data_ptr, prp1) ==
+ offsetof(union nvme_data_ptr, sgl.addr));
+ dma_unmap_page(dma_dev, le64_to_cpu(iod->cmd.common.dptr.prp1),
+ iod->total_len, rq_dma_dir(req));
+ return;
+ }
- if (iod->nr_allocations == 0)
- dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list,
- iod->first_dma);
- else if (iod->nr_allocations == 1)
- dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list,
- iod->first_dma);
- else
- nvme_free_prps(dev, req);
- mempool_free(iod->sgt.sgl, dev->iod_mempool);
+ if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) {
+ if (nvme_pci_cmd_use_sgl(&iod->cmd))
+ nvme_free_sgls(req);
+ else
+ nvme_free_prps(req);
+ }
+
+ if (iod->nr_descriptors)
+ nvme_free_descriptors(req);
}
-static void nvme_print_sgl(struct scatterlist *sgl, int nents)
+static bool nvme_pci_prp_iter_next(struct request *req, struct device *dma_dev,
+ struct blk_dma_iter *iter)
{
- int i;
- struct scatterlist *sg;
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- for_each_sg(sgl, sg, nents, i) {
- dma_addr_t phys = sg_phys(sg);
- pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
- "dma_address:%pad dma_length:%d\n",
- i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
- sg_dma_len(sg));
+ if (iter->len)
+ return true;
+ if (!blk_rq_dma_map_iter_next(req, dma_dev, &iod->dma_state, iter))
+ return false;
+ if (!dma_use_iova(&iod->dma_state) && dma_need_unmap(dma_dev)) {
+ iod->dma_vecs[iod->nr_dma_vecs].addr = iter->addr;
+ iod->dma_vecs[iod->nr_dma_vecs].len = iter->len;
+ iod->nr_dma_vecs++;
}
+ return true;
}
-static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
- struct request *req, struct nvme_rw_command *cmnd)
+static blk_status_t nvme_pci_setup_data_prp(struct request *req,
+ struct blk_dma_iter *iter)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct dma_pool *pool;
- int length = blk_rq_payload_bytes(req);
- struct scatterlist *sg = iod->sgt.sgl;
- int dma_len = sg_dma_len(sg);
- u64 dma_addr = sg_dma_address(sg);
- int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ unsigned int length = blk_rq_payload_bytes(req);
+ dma_addr_t prp1_dma, prp2_dma = 0;
+ unsigned int prp_len, i;
__le64 *prp_list;
- dma_addr_t prp_dma;
- int nprps, i;
- length -= (NVME_CTRL_PAGE_SIZE - offset);
- if (length <= 0) {
- iod->first_dma = 0;
- goto done;
+ if (!dma_use_iova(&iod->dma_state) && dma_need_unmap(nvmeq->dev->dev)) {
+ iod->dma_vecs = mempool_alloc(nvmeq->dev->dmavec_mempool,
+ GFP_ATOMIC);
+ if (!iod->dma_vecs)
+ return BLK_STS_RESOURCE;
+ iod->dma_vecs[0].addr = iter->addr;
+ iod->dma_vecs[0].len = iter->len;
+ iod->nr_dma_vecs = 1;
}
- dma_len -= (NVME_CTRL_PAGE_SIZE - offset);
- if (dma_len) {
- dma_addr += (NVME_CTRL_PAGE_SIZE - offset);
- } else {
- sg = sg_next(sg);
- dma_addr = sg_dma_address(sg);
- dma_len = sg_dma_len(sg);
+ /*
+ * PRP1 always points to the start of the DMA transfers.
+ *
+ * This is the only PRP (except for the list entries) that could be
+ * non-aligned.
+ */
+ prp1_dma = iter->addr;
+ prp_len = min(length, NVME_CTRL_PAGE_SIZE -
+ (iter->addr & (NVME_CTRL_PAGE_SIZE - 1)));
+ iod->total_len += prp_len;
+ iter->addr += prp_len;
+ iter->len -= prp_len;
+ length -= prp_len;
+ if (!length)
+ goto done;
+
+ if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) {
+ if (WARN_ON_ONCE(!iter->status))
+ goto bad_sgl;
+ goto done;
}
+ /*
+ * PRP2 is usually a list, but can point to data if all data to be
+ * transferred fits into PRP1 + PRP2:
+ */
if (length <= NVME_CTRL_PAGE_SIZE) {
- iod->first_dma = dma_addr;
+ prp2_dma = iter->addr;
+ iod->total_len += length;
goto done;
}
- nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
- if (nprps <= (256 / 8)) {
- pool = dev->prp_small_pool;
- iod->nr_allocations = 0;
- } else {
- pool = dev->prp_page_pool;
- iod->nr_allocations = 1;
- }
+ if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) <=
+ NVME_SMALL_POOL_SIZE / sizeof(__le64))
+ iod->flags |= IOD_SMALL_DESCRIPTOR;
- prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
+ prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
+ &prp2_dma);
if (!prp_list) {
- iod->nr_allocations = -1;
- return BLK_STS_RESOURCE;
+ iter->status = BLK_STS_RESOURCE;
+ goto done;
}
- iod->list[0].prp_list = prp_list;
- iod->first_dma = prp_dma;
+ iod->descriptors[iod->nr_descriptors++] = prp_list;
+
i = 0;
for (;;) {
+ prp_list[i++] = cpu_to_le64(iter->addr);
+ prp_len = min(length, NVME_CTRL_PAGE_SIZE);
+ if (WARN_ON_ONCE(iter->len < prp_len))
+ goto bad_sgl;
+
+ iod->total_len += prp_len;
+ iter->addr += prp_len;
+ iter->len -= prp_len;
+ length -= prp_len;
+ if (!length)
+ break;
+
+ if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) {
+ if (WARN_ON_ONCE(!iter->status))
+ goto bad_sgl;
+ goto done;
+ }
+
+ /*
+ * If we've filled the entire descriptor, allocate a new that is
+ * pointed to be the last entry in the previous PRP list. To
+ * accommodate for that move the last actual entry to the new
+ * descriptor.
+ */
if (i == NVME_CTRL_PAGE_SIZE >> 3) {
__le64 *old_prp_list = prp_list;
- prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
- if (!prp_list)
- goto free_prps;
- iod->list[iod->nr_allocations++].prp_list = prp_list;
+ dma_addr_t prp_list_dma;
+
+ prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large,
+ GFP_ATOMIC, &prp_list_dma);
+ if (!prp_list) {
+ iter->status = BLK_STS_RESOURCE;
+ goto done;
+ }
+ iod->descriptors[iod->nr_descriptors++] = prp_list;
+
prp_list[0] = old_prp_list[i - 1];
- old_prp_list[i - 1] = cpu_to_le64(prp_dma);
+ old_prp_list[i - 1] = cpu_to_le64(prp_list_dma);
i = 1;
}
- prp_list[i++] = cpu_to_le64(dma_addr);
- dma_len -= NVME_CTRL_PAGE_SIZE;
- dma_addr += NVME_CTRL_PAGE_SIZE;
- length -= NVME_CTRL_PAGE_SIZE;
- if (length <= 0)
- break;
- if (dma_len > 0)
- continue;
- if (unlikely(dma_len < 0))
- goto bad_sgl;
- sg = sg_next(sg);
- dma_addr = sg_dma_address(sg);
- dma_len = sg_dma_len(sg);
}
+
done:
- cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sgt.sgl));
- cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
- return BLK_STS_OK;
-free_prps:
- nvme_free_prps(dev, req);
- return BLK_STS_RESOURCE;
+ /*
+ * nvme_unmap_data uses the DPT field in the SQE to tear down the
+ * mapping, so initialize it even for failures.
+ */
+ iod->cmd.common.dptr.prp1 = cpu_to_le64(prp1_dma);
+ iod->cmd.common.dptr.prp2 = cpu_to_le64(prp2_dma);
+ if (unlikely(iter->status))
+ nvme_unmap_data(req);
+ return iter->status;
+
bad_sgl:
- WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents),
- "Invalid SGL for payload:%d nents:%d\n",
- blk_rq_payload_bytes(req), iod->sgt.nents);
+ dev_err_once(nvmeq->dev->dev,
+ "Incorrectly formed request for payload:%d nents:%d\n",
+ blk_rq_payload_bytes(req), blk_rq_nr_phys_segments(req));
return BLK_STS_IOERR;
}
static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
- struct scatterlist *sg)
+ struct blk_dma_iter *iter)
{
- sge->addr = cpu_to_le64(sg_dma_address(sg));
- sge->length = cpu_to_le32(sg_dma_len(sg));
+ sge->addr = cpu_to_le64(iter->addr);
+ sge->length = cpu_to_le32(iter->len);
sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}
@@ -698,152 +895,131 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
}
-static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
- struct request *req, struct nvme_rw_command *cmd)
+static blk_status_t nvme_pci_setup_data_sgl(struct request *req,
+ struct blk_dma_iter *iter)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct dma_pool *pool;
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ unsigned int entries = blk_rq_nr_phys_segments(req);
struct nvme_sgl_desc *sg_list;
- struct scatterlist *sg = iod->sgt.sgl;
- unsigned int entries = iod->sgt.nents;
dma_addr_t sgl_dma;
- int i = 0;
+ unsigned int mapped = 0;
- /* setting the transfer type as SGL */
- cmd->flags = NVME_CMD_SGL_METABUF;
+ /* set the transfer type as SGL */
+ iod->cmd.common.flags = NVME_CMD_SGL_METABUF;
- if (entries == 1) {
- nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
+ if (entries == 1 || blk_rq_dma_map_coalesce(&iod->dma_state)) {
+ nvme_pci_sgl_set_data(&iod->cmd.common.dptr.sgl, iter);
+ iod->total_len += iter->len;
return BLK_STS_OK;
}
- if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
- pool = dev->prp_small_pool;
- iod->nr_allocations = 0;
- } else {
- pool = dev->prp_page_pool;
- iod->nr_allocations = 1;
- }
+ if (entries <= NVME_SMALL_POOL_SIZE / sizeof(*sg_list))
+ iod->flags |= IOD_SMALL_DESCRIPTOR;
- sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
- if (!sg_list) {
- iod->nr_allocations = -1;
+ sg_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
+ &sgl_dma);
+ if (!sg_list)
return BLK_STS_RESOURCE;
- }
+ iod->descriptors[iod->nr_descriptors++] = sg_list;
- iod->list[0].sg_list = sg_list;
- iod->first_dma = sgl_dma;
-
- nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
do {
- nvme_pci_sgl_set_data(&sg_list[i++], sg);
- sg = sg_next(sg);
- } while (--entries > 0);
+ if (WARN_ON_ONCE(mapped == entries)) {
+ iter->status = BLK_STS_IOERR;
+ break;
+ }
+ nvme_pci_sgl_set_data(&sg_list[mapped++], iter);
+ iod->total_len += iter->len;
+ } while (blk_rq_dma_map_iter_next(req, nvmeq->dev->dev, &iod->dma_state,
+ iter));
- return BLK_STS_OK;
+ nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped);
+ if (unlikely(iter->status))
+ nvme_unmap_data(req);
+ return iter->status;
}
-static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
- struct request *req, struct nvme_rw_command *cmnd,
- struct bio_vec *bv)
+static blk_status_t nvme_pci_setup_data_simple(struct request *req,
+ enum nvme_use_sgl use_sgl)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
- unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;
-
- iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
- if (dma_mapping_error(dev->dev, iod->first_dma))
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct bio_vec bv = req_bvec(req);
+ unsigned int prp1_offset = bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
+ bool prp_possible = prp1_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2;
+ dma_addr_t dma_addr;
+
+ if (!use_sgl && !prp_possible)
+ return BLK_STS_AGAIN;
+ if (is_pci_p2pdma_page(bv.bv_page))
+ return BLK_STS_AGAIN;
+
+ dma_addr = dma_map_bvec(nvmeq->dev->dev, &bv, rq_dma_dir(req), 0);
+ if (dma_mapping_error(nvmeq->dev->dev, dma_addr))
return BLK_STS_RESOURCE;
- iod->dma_len = bv->bv_len;
-
- cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
- if (bv->bv_len > first_prp_len)
- cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
- else
- cmnd->dptr.prp2 = 0;
- return BLK_STS_OK;
-}
-
-static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
- struct request *req, struct nvme_rw_command *cmnd,
- struct bio_vec *bv)
-{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ iod->total_len = bv.bv_len;
+ iod->flags |= IOD_SINGLE_SEGMENT;
+
+ if (use_sgl == SGL_FORCED || !prp_possible) {
+ iod->cmd.common.flags = NVME_CMD_SGL_METABUF;
+ iod->cmd.common.dptr.sgl.addr = cpu_to_le64(dma_addr);
+ iod->cmd.common.dptr.sgl.length = cpu_to_le32(bv.bv_len);
+ iod->cmd.common.dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
+ } else {
+ unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - prp1_offset;
- iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
- if (dma_mapping_error(dev->dev, iod->first_dma))
- return BLK_STS_RESOURCE;
- iod->dma_len = bv->bv_len;
+ iod->cmd.common.dptr.prp1 = cpu_to_le64(dma_addr);
+ iod->cmd.common.dptr.prp2 = 0;
+ if (bv.bv_len > first_prp_len)
+ iod->cmd.common.dptr.prp2 =
+ cpu_to_le64(dma_addr + first_prp_len);
+ }
- cmnd->flags = NVME_CMD_SGL_METABUF;
- cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma);
- cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len);
- cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
return BLK_STS_OK;
}
-static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
- struct nvme_command *cmnd)
+static blk_status_t nvme_map_data(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- blk_status_t ret = BLK_STS_RESOURCE;
- int rc;
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct nvme_dev *dev = nvmeq->dev;
+ enum nvme_use_sgl use_sgl = nvme_pci_use_sgls(dev, req);
+ struct blk_dma_iter iter;
+ blk_status_t ret;
+ /*
+ * Try to skip the DMA iterator for single segment requests, as that
+ * significantly improves performances for small I/O sizes.
+ */
if (blk_rq_nr_phys_segments(req) == 1) {
- struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
- struct bio_vec bv = req_bvec(req);
-
- if (!is_pci_p2pdma_page(bv.bv_page)) {
- if (!nvme_pci_metadata_use_sgls(dev, req) &&
- (bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) +
- bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
- return nvme_setup_prp_simple(dev, req,
- &cmnd->rw, &bv);
-
- if (nvmeq->qid && sgl_threshold &&
- nvme_ctrl_sgl_supported(&dev->ctrl))
- return nvme_setup_sgl_simple(dev, req,
- &cmnd->rw, &bv);
- }
+ ret = nvme_pci_setup_data_simple(req, use_sgl);
+ if (ret != BLK_STS_AGAIN)
+ return ret;
}
- iod->dma_len = 0;
- iod->sgt.sgl = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
- if (!iod->sgt.sgl)
- return BLK_STS_RESOURCE;
- sg_init_table(iod->sgt.sgl, blk_rq_nr_phys_segments(req));
- iod->sgt.orig_nents = blk_rq_map_sg(req, iod->sgt.sgl);
- if (!iod->sgt.orig_nents)
- goto out_free_sg;
+ if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
+ return iter.status;
- rc = dma_map_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req),
- DMA_ATTR_NO_WARN);
- if (rc) {
- if (rc == -EREMOTEIO)
- ret = BLK_STS_TARGET;
- goto out_free_sg;
- }
-
- if (nvme_pci_use_sgls(dev, req, iod->sgt.nents))
- ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
- else
- ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
- if (ret != BLK_STS_OK)
- goto out_unmap_sg;
- return BLK_STS_OK;
+ if (use_sgl == SGL_FORCED ||
+ (use_sgl == SGL_SUPPORTED &&
+ (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold)))
+ return nvme_pci_setup_data_sgl(req, &iter);
+ return nvme_pci_setup_data_prp(req, &iter);
+}
-out_unmap_sg:
- dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
-out_free_sg:
- mempool_free(iod->sgt.sgl, dev->iod_mempool);
- return ret;
+static void nvme_pci_sgl_set_data_sg(struct nvme_sgl_desc *sge,
+ struct scatterlist *sg)
+{
+ sge->addr = cpu_to_le64(sg_dma_address(sg));
+ sge->length = cpu_to_le32(sg_dma_len(sg));
+ sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}
-static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
- struct request *req)
+static blk_status_t nvme_pci_setup_meta_sgls(struct request *req)
{
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct nvme_dev *dev = nvmeq->dev;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct nvme_rw_command *cmnd = &iod->cmd.rw;
struct nvme_sgl_desc *sg_list;
struct scatterlist *sgl, *sg;
unsigned int entries;
@@ -865,27 +1041,28 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
if (rc)
goto out_free_sg;
- sg_list = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &sgl_dma);
+ sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC,
+ &sgl_dma);
if (!sg_list)
goto out_unmap_sg;
entries = iod->meta_sgt.nents;
- iod->meta_list.sg_list = sg_list;
+ iod->meta_descriptor = sg_list;
iod->meta_dma = sgl_dma;
- cmnd->flags = NVME_CMD_SGL_METASEG;
- cmnd->metadata = cpu_to_le64(sgl_dma);
+ iod->cmd.common.flags = NVME_CMD_SGL_METASEG;
+ iod->cmd.common.metadata = cpu_to_le64(sgl_dma);
sgl = iod->meta_sgt.sgl;
if (entries == 1) {
- nvme_pci_sgl_set_data(sg_list, sgl);
+ nvme_pci_sgl_set_data_sg(sg_list, sgl);
return BLK_STS_OK;
}
sgl_dma += sizeof(*sg_list);
nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries);
for_each_sg(sgl, sg, entries, i)
- nvme_pci_sgl_set_data(&sg_list[i + 1], sg);
+ nvme_pci_sgl_set_data_sg(&sg_list[i + 1], sg);
return BLK_STS_OK;
@@ -896,35 +1073,37 @@ out_free_sg:
return BLK_STS_RESOURCE;
}
-static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev,
- struct request *req)
+static blk_status_t nvme_pci_setup_meta_mptr(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct bio_vec bv = rq_integrity_vec(req);
- struct nvme_command *cmnd = &iod->cmd;
- iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
- if (dma_mapping_error(dev->dev, iod->meta_dma))
+ iod->meta_dma = dma_map_bvec(nvmeq->dev->dev, &bv, rq_dma_dir(req), 0);
+ if (dma_mapping_error(nvmeq->dev->dev, iod->meta_dma))
return BLK_STS_IOERR;
- cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
+ iod->cmd.common.metadata = cpu_to_le64(iod->meta_dma);
return BLK_STS_OK;
}
-static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req)
+static blk_status_t nvme_map_metadata(struct request *req)
{
- if (nvme_pci_metadata_use_sgls(dev, req))
- return nvme_pci_setup_meta_sgls(dev, req);
- return nvme_pci_setup_meta_mptr(dev, req);
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+ if ((iod->cmd.common.flags & NVME_CMD_SGL_METABUF) &&
+ nvme_pci_metadata_use_sgls(req))
+ return nvme_pci_setup_meta_sgls(req);
+ return nvme_pci_setup_meta_mptr(req);
}
-static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
+static blk_status_t nvme_prep_rq(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
blk_status_t ret;
- iod->aborted = false;
- iod->nr_allocations = -1;
- iod->sgt.nents = 0;
+ iod->flags = 0;
+ iod->nr_descriptors = 0;
+ iod->total_len = 0;
iod->meta_sgt.nents = 0;
ret = nvme_setup_cmd(req->q->queuedata, req);
@@ -932,13 +1111,13 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
return ret;
if (blk_rq_nr_phys_segments(req)) {
- ret = nvme_map_data(dev, req, &iod->cmd);
+ ret = nvme_map_data(req);
if (ret)
goto out_free_cmd;
}
if (blk_integrity_rq(req)) {
- ret = nvme_map_metadata(dev, req);
+ ret = nvme_map_metadata(req);
if (ret)
goto out_unmap_data;
}
@@ -947,7 +1126,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
return BLK_STS_OK;
out_unmap_data:
if (blk_rq_nr_phys_segments(req))
- nvme_unmap_data(dev, req);
+ nvme_unmap_data(req);
out_free_cmd:
nvme_cleanup_cmd(req);
return ret;
@@ -972,7 +1151,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
if (unlikely(!nvme_check_ready(&dev->ctrl, req, true)))
return nvme_fail_nonready_command(&dev->ctrl, req);
- ret = nvme_prep_rq(dev, req);
+ ret = nvme_prep_rq(req);
if (unlikely(ret))
return ret;
spin_lock(&nvmeq->sq_lock);
@@ -1010,7 +1189,7 @@ static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
return false;
- return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
+ return nvme_prep_rq(req) == BLK_STS_OK;
}
static void nvme_queue_rqs(struct rq_list *rqlist)
@@ -1036,10 +1215,11 @@ static void nvme_queue_rqs(struct rq_list *rqlist)
*rqlist = requeue_list;
}
-static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
- struct request *req)
+static __always_inline void nvme_unmap_metadata(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct nvme_dev *dev = nvmeq->dev;
if (!iod->meta_sgt.nents) {
dma_unmap_page(dev->dev, iod->meta_dma,
@@ -1048,22 +1228,18 @@ static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
return;
}
- dma_pool_free(dev->prp_small_pool, iod->meta_list.sg_list,
- iod->meta_dma);
+ dma_pool_free(nvmeq->descriptor_pools.small, iod->meta_descriptor,
+ iod->meta_dma);
dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
}
static __always_inline void nvme_pci_unmap_rq(struct request *req)
{
- struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
- struct nvme_dev *dev = nvmeq->dev;
-
if (blk_integrity_rq(req))
- nvme_unmap_metadata(dev, req);
-
+ nvme_unmap_metadata(req);
if (blk_rq_nr_phys_segments(req))
- nvme_unmap_data(dev, req);
+ nvme_unmap_data(req);
}
static void nvme_pci_complete_rq(struct request *req)
@@ -1490,7 +1666,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
* returned to the driver, or if this is the admin queue.
*/
opcode = nvme_req(req)->cmd->common.opcode;
- if (!nvmeq->qid || iod->aborted) {
+ if (!nvmeq->qid || (iod->flags & IOD_ABORTED)) {
dev_warn(dev->ctrl.device,
"I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",
req->tag, nvme_cid(req), opcode,
@@ -1503,7 +1679,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
atomic_inc(&dev->ctrl.abort_limit);
return BLK_EH_RESET_TIMER;
}
- iod->aborted = true;
+ iod->flags |= IOD_ABORTED;
cmd.abort.opcode = nvme_admin_abort_cmd;
cmd.abort.cid = nvme_cid(req);
@@ -1888,8 +2064,28 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
* might be pointing at!
*/
result = nvme_disable_ctrl(&dev->ctrl, false);
- if (result < 0)
- return result;
+ if (result < 0) {
+ struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+ /*
+ * The NVMe Controller Reset method did not get an expected
+ * CSTS.RDY transition, so something with the device appears to
+ * be stuck. Use the lower level and bigger hammer PCIe
+ * Function Level Reset to attempt restoring the device to its
+ * initial state, and try again.
+ */
+ result = pcie_reset_flr(pdev, false);
+ if (result < 0)
+ return result;
+
+ pci_restore_state(pdev);
+ result = nvme_disable_ctrl(&dev->ctrl, false);
+ if (result < 0)
+ return result;
+
+ dev_info(dev->ctrl.device,
+ "controller reset completed after pcie flr\n");
+ }
result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
if (result)
@@ -2031,8 +2227,6 @@ static void nvme_map_cmb(struct nvme_dev *dev)
if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) ==
(NVME_CMBSZ_WDS | NVME_CMBSZ_RDS))
pci_p2pmem_publish(pdev, true);
-
- nvme_update_attrs(dev);
}
static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
@@ -2263,7 +2457,7 @@ static ssize_t cmb_show(struct device *dev, struct device_attribute *attr,
{
struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
- return sysfs_emit(buf, "cmbloc : x%08x\ncmbsz : x%08x\n",
+ return sysfs_emit(buf, "cmbloc : 0x%08x\ncmbsz : 0x%08x\n",
ndev->cmbloc, ndev->cmbsz);
}
static DEVICE_ATTR_RO(cmb);
@@ -2450,7 +2644,8 @@ static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
*/
if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
return 1;
- return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues;
+ return blk_mq_num_possible_queues(0) + dev->nr_write_queues +
+ dev->nr_poll_queues;
}
static int nvme_setup_io_queues(struct nvme_dev *dev)
@@ -2842,45 +3037,16 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
return 0;
}
-static int nvme_setup_prp_pools(struct nvme_dev *dev)
-{
- size_t small_align = 256;
-
- dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
- NVME_CTRL_PAGE_SIZE,
- NVME_CTRL_PAGE_SIZE, 0);
- if (!dev->prp_page_pool)
- return -ENOMEM;
-
- if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512)
- small_align = 512;
-
- /* Optimisation for I/Os between 4k and 128k */
- dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
- 256, small_align, 0);
- if (!dev->prp_small_pool) {
- dma_pool_destroy(dev->prp_page_pool);
- return -ENOMEM;
- }
- return 0;
-}
-
-static void nvme_release_prp_pools(struct nvme_dev *dev)
-{
- dma_pool_destroy(dev->prp_page_pool);
- dma_pool_destroy(dev->prp_small_pool);
-}
-
static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
{
size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
- size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS;
+ size_t alloc_size = sizeof(struct nvme_dma_vec) * NVME_MAX_SEGS;
- dev->iod_mempool = mempool_create_node(1,
+ dev->dmavec_mempool = mempool_create_node(1,
mempool_kmalloc, mempool_kfree,
(void *)alloc_size, GFP_KERNEL,
dev_to_node(dev->dev));
- if (!dev->iod_mempool)
+ if (!dev->dmavec_mempool)
return -ENOMEM;
dev->iod_meta_mempool = mempool_create_node(1,
@@ -2889,10 +3055,9 @@ static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
dev_to_node(dev->dev));
if (!dev->iod_meta_mempool)
goto free;
-
return 0;
free:
- mempool_destroy(dev->iod_mempool);
+ mempool_destroy(dev->dmavec_mempool);
return -ENOMEM;
}
@@ -2969,12 +3134,14 @@ static void nvme_reset_work(struct work_struct *work)
if (result < 0)
goto out;
+ nvme_update_attrs(dev);
+
result = nvme_setup_io_queues(dev);
if (result)
goto out;
/*
- * Freeze and update the number of I/O queues as thos might have
+ * Freeze and update the number of I/O queues as those might have
* changed. If there are no I/O queues left after this reset, keep the
* controller around but remove all namespaces.
*/
@@ -3145,7 +3312,7 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
/*
* Exclude some Kingston NV1 and A2000 devices from
* NVME_QUIRK_SIMPLE_SUSPEND. Do a full suspend to save a
- * lot fo energy with s2idle sleep on some TUXEDO platforms.
+ * lot of energy with s2idle sleep on some TUXEDO platforms.
*/
if (dmi_match(DMI_BOARD_NAME, "NS5X_NS7XAU") ||
dmi_match(DMI_BOARD_NAME, "NS5x_7xAU") ||
@@ -3185,7 +3352,8 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
struct nvme_dev *dev;
int ret = -ENOMEM;
- dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
+ dev = kzalloc_node(struct_size(dev, descriptor_pools, nr_node_ids),
+ GFP_KERNEL, node);
if (!dev)
return ERR_PTR(-ENOMEM);
INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
@@ -3230,7 +3398,8 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
* over a single page.
*/
dev->ctrl.max_hw_sectors = min_t(u32,
- NVME_MAX_KB_SZ << 1, dma_opt_mapping_size(&pdev->dev) >> 9);
+ NVME_MAX_BYTES >> SECTOR_SHIFT,
+ dma_opt_mapping_size(&pdev->dev) >> 9);
dev->ctrl.max_segments = NVME_MAX_SEGS;
dev->ctrl.max_integrity_segments = 1;
return dev;
@@ -3260,13 +3429,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (result)
goto out_uninit_ctrl;
- result = nvme_setup_prp_pools(dev);
- if (result)
- goto out_dev_unmap;
-
result = nvme_pci_alloc_iod_mempool(dev);
if (result)
- goto out_release_prp_pools;
+ goto out_dev_unmap;
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
@@ -3305,6 +3470,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (result < 0)
goto out_disable;
+ nvme_update_attrs(dev);
+
result = nvme_setup_io_queues(dev);
if (result)
goto out_disable;
@@ -3340,10 +3507,8 @@ out_disable:
nvme_dbbuf_dma_free(dev);
nvme_free_queues(dev, 0);
out_release_iod_mempool:
- mempool_destroy(dev->iod_mempool);
+ mempool_destroy(dev->dmavec_mempool);
mempool_destroy(dev->iod_meta_mempool);
-out_release_prp_pools:
- nvme_release_prp_pools(dev);
out_dev_unmap:
nvme_dev_unmap(dev);
out_uninit_ctrl:
@@ -3406,9 +3571,9 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_dev_remove_admin(dev);
nvme_dbbuf_dma_free(dev);
nvme_free_queues(dev, 0);
- mempool_destroy(dev->iod_mempool);
+ mempool_destroy(dev->dmavec_mempool);
mempool_destroy(dev->iod_meta_mempool);
- nvme_release_prp_pools(dev);
+ nvme_release_descriptor_pools(dev);
nvme_dev_unmap(dev);
nvme_uninit_ctrl(&dev->ctrl);
}
@@ -3809,9 +3974,6 @@ static int __init nvme_init(void)
BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
- BUILD_BUG_ON(NVME_MAX_SEGS > SGES_PER_PAGE);
- BUILD_BUG_ON(sizeof(struct scatterlist) * NVME_MAX_SEGS > PAGE_SIZE);
- BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_ALLOCATIONS);
return pci_register_driver(&nvme_driver);
}
diff --git a/drivers/nvme/host/pr.c b/drivers/nvme/host/pr.c
index cf2d2c5039dd..ca6a74607b13 100644
--- a/drivers/nvme/host/pr.c
+++ b/drivers/nvme/host/pr.c
@@ -82,8 +82,6 @@ static int nvme_status_to_pr_err(int status)
return PR_STS_SUCCESS;
case NVME_SC_RESERVATION_CONFLICT:
return PR_STS_RESERVATION_CONFLICT;
- case NVME_SC_ONCS_NOT_SUPPORTED:
- return -EOPNOTSUPP;
case NVME_SC_BAD_ATTRIBUTES:
case NVME_SC_INVALID_OPCODE:
case NVME_SC_INVALID_FIELD:
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index b5a0295b5bf4..190a4cfa8a5e 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -221,7 +221,7 @@ static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev,
/*
* Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue
- * lifetime. It's safe, since any chage in the underlying RDMA device
+ * lifetime. It's safe, since any change in the underlying RDMA device
* will issue error recovery and queue re-creation.
*/
for (i = 0; i < ib_queue_size; i++) {
@@ -800,7 +800,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
/*
* Bind the async event SQE DMA mapping to the admin queue lifetime.
- * It's safe, since any chage in the underlying RDMA device will issue
+ * It's safe, since any change in the underlying RDMA device will issue
* error recovery and queue re-creation.
*/
error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe,
@@ -877,7 +877,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
/*
* Only start IO queues for which we have allocated the tagset
- * and limitted it to the available queues. On reconnects, the
+ * and limited it to the available queues. On reconnects, the
* queue number might have changed.
*/
nr_queues = min(ctrl->tag_set.nr_hw_queues + 1, ctrl->ctrl.queue_count);
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index a5bc3bb483d5..29430949ce2f 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -260,6 +260,7 @@ static struct attribute *nvme_ns_attrs[] = {
&dev_attr_ana_state.attr,
&dev_attr_queue_depth.attr,
&dev_attr_numa_nodes.attr,
+ &dev_attr_delayed_removal_secs.attr,
#endif
&dev_attr_io_passthru_err_log_enabled.attr,
NULL,
@@ -296,6 +297,12 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
if (nvme_disk_is_ns_head(dev_to_disk(dev)))
return 0;
}
+ if (a == &dev_attr_delayed_removal_secs.attr) {
+ struct gendisk *disk = dev_to_disk(dev);
+
+ if (!nvme_disk_is_ns_head(disk))
+ return 0;
+ }
#endif
return a->mode;
}
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index aba365f97cf6..c0fe8cfb7229 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -8,6 +8,7 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/err.h>
+#include <linux/crc32.h>
#include <linux/nvme-tcp.h>
#include <linux/nvme-keyring.h>
#include <net/sock.h>
@@ -16,7 +17,6 @@
#include <net/tls_prot.h>
#include <net/handshake.h>
#include <linux/blk-mq.h>
-#include <crypto/hash.h>
#include <net/busy_poll.h>
#include <trace/events/sock.h>
@@ -168,8 +168,8 @@ struct nvme_tcp_queue {
bool hdr_digest;
bool data_digest;
bool tls_enabled;
- struct ahash_request *rcv_hash;
- struct ahash_request *snd_hash;
+ u32 rcv_crc;
+ u32 snd_crc;
__le32 exp_ddgst;
__le32 recv_ddgst;
struct completion tls_complete;
@@ -403,7 +403,7 @@ static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
}
static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
- bool sync, bool last)
+ bool last)
{
struct nvme_tcp_queue *queue = req->queue;
bool empty;
@@ -417,7 +417,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
* are on the same cpu, so we don't introduce contention.
*/
if (queue->io_cpu == raw_smp_processor_id() &&
- sync && empty && mutex_trylock(&queue->send_mutex)) {
+ empty && mutex_trylock(&queue->send_mutex)) {
nvme_tcp_send_all(queue);
mutex_unlock(&queue->send_mutex);
}
@@ -452,36 +452,43 @@ nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
return NULL;
}
- list_del(&req->entry);
+ list_del_init(&req->entry);
+ init_llist_node(&req->lentry);
return req;
}
-static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
- __le32 *dgst)
+#define NVME_TCP_CRC_SEED (~0)
+
+static inline void nvme_tcp_ddgst_update(u32 *crcp,
+ struct page *page, size_t off, size_t len)
{
- ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
- crypto_ahash_final(hash);
+ page += off / PAGE_SIZE;
+ off %= PAGE_SIZE;
+ while (len) {
+ const void *vaddr = kmap_local_page(page);
+ size_t n = min(len, (size_t)PAGE_SIZE - off);
+
+ *crcp = crc32c(*crcp, vaddr + off, n);
+ kunmap_local(vaddr);
+ page++;
+ off = 0;
+ len -= n;
+ }
}
-static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
- struct page *page, off_t off, size_t len)
+static inline __le32 nvme_tcp_ddgst_final(u32 crc)
{
- struct scatterlist sg;
-
- sg_init_table(&sg, 1);
- sg_set_page(&sg, page, len, off);
- ahash_request_set_crypt(hash, &sg, NULL, len);
- crypto_ahash_update(hash);
+ return cpu_to_le32(~crc);
}
-static inline void nvme_tcp_hdgst(struct ahash_request *hash,
- void *pdu, size_t len)
+static inline __le32 nvme_tcp_hdgst(const void *pdu, size_t len)
{
- struct scatterlist sg;
+ return cpu_to_le32(~crc32c(NVME_TCP_CRC_SEED, pdu, len));
+}
- sg_init_one(&sg, pdu, len);
- ahash_request_set_crypt(hash, &sg, pdu + len, len);
- crypto_ahash_digest(hash);
+static inline void nvme_tcp_set_hdgst(void *pdu, size_t len)
+{
+ *(__le32 *)(pdu + len) = nvme_tcp_hdgst(pdu, len);
}
static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
@@ -499,8 +506,7 @@ static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
}
recv_digest = *(__le32 *)(pdu + hdr->hlen);
- nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
- exp_digest = *(__le32 *)(pdu + hdr->hlen);
+ exp_digest = nvme_tcp_hdgst(pdu, pdu_len);
if (recv_digest != exp_digest) {
dev_err(queue->ctrl->ctrl.device,
"header digest error: recv %#x expected %#x\n",
@@ -526,7 +532,7 @@ static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
nvme_tcp_queue_id(queue));
return -EPROTO;
}
- crypto_ahash_init(queue->rcv_hash);
+ queue->rcv_crc = NVME_TCP_CRC_SEED;
return 0;
}
@@ -560,6 +566,8 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
req->queue = queue;
nvme_req(rq)->ctrl = &ctrl->ctrl;
nvme_req(rq)->cmd = &pdu->cmd;
+ init_llist_node(&req->lentry);
+ INIT_LIST_HEAD(&req->entry);
return 0;
}
@@ -764,13 +772,23 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
return -EPROTO;
}
+ if (llist_on_list(&req->lentry) ||
+ !list_empty(&req->entry)) {
+ dev_err(queue->ctrl->ctrl.device,
+ "req %d unexpected r2t while processing request\n",
+ rq->tag);
+ return -EPROTO;
+ }
+
req->pdu_len = 0;
req->h2cdata_left = r2t_length;
req->h2cdata_offset = r2t_offset;
req->ttag = pdu->ttag;
nvme_tcp_setup_h2c_data_pdu(req);
- nvme_tcp_queue_request(req, false, true);
+
+ llist_add(&req->lentry, &queue->req_list);
+ queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
return 0;
}
@@ -926,8 +944,8 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
iov_iter_count(&req->iter));
if (queue->data_digest)
- ret = skb_copy_and_hash_datagram_iter(skb, *offset,
- &req->iter, recv_len, queue->rcv_hash);
+ ret = skb_copy_and_crc32c_datagram_iter(skb, *offset,
+ &req->iter, recv_len, &queue->rcv_crc);
else
ret = skb_copy_datagram_iter(skb, *offset,
&req->iter, recv_len);
@@ -945,7 +963,7 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
if (!queue->data_remaining) {
if (queue->data_digest) {
- nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
+ queue->exp_ddgst = nvme_tcp_ddgst_final(queue->rcv_crc);
queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
} else {
if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
@@ -1147,7 +1165,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
return ret;
if (queue->data_digest)
- nvme_tcp_ddgst_update(queue->snd_hash, page,
+ nvme_tcp_ddgst_update(&queue->snd_crc, page,
offset, ret);
/*
@@ -1161,8 +1179,8 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
/* fully successful last send in current PDU */
if (last && ret == len) {
if (queue->data_digest) {
- nvme_tcp_ddgst_final(queue->snd_hash,
- &req->ddgst);
+ req->ddgst =
+ nvme_tcp_ddgst_final(queue->snd_crc);
req->state = NVME_TCP_SEND_DDGST;
req->offset = 0;
} else {
@@ -1194,7 +1212,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
msg.msg_flags |= MSG_EOR;
if (queue->hdr_digest && !req->offset)
- nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+ nvme_tcp_set_hdgst(pdu, sizeof(*pdu));
bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
@@ -1207,7 +1225,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
if (inline_data) {
req->state = NVME_TCP_SEND_DATA;
if (queue->data_digest)
- crypto_ahash_init(queue->snd_hash);
+ queue->snd_crc = NVME_TCP_CRC_SEED;
} else {
nvme_tcp_done_send_req(queue);
}
@@ -1229,7 +1247,7 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
int ret;
if (queue->hdr_digest && !req->offset)
- nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+ nvme_tcp_set_hdgst(pdu, sizeof(*pdu));
if (!req->h2cdata_left)
msg.msg_flags |= MSG_SPLICE_PAGES;
@@ -1244,7 +1262,7 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
if (!len) {
req->state = NVME_TCP_SEND_DATA;
if (queue->data_digest)
- crypto_ahash_init(queue->snd_hash);
+ queue->snd_crc = NVME_TCP_CRC_SEED;
return 1;
}
req->offset += ret;
@@ -1348,7 +1366,7 @@ static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
queue->nr_cqe = 0;
consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
release_sock(sk);
- return consumed;
+ return consumed == -EAGAIN ? 0 : consumed;
}
static void nvme_tcp_io_work(struct work_struct *w)
@@ -1376,6 +1394,11 @@ static void nvme_tcp_io_work(struct work_struct *w)
else if (unlikely(result < 0))
return;
+ /* did we get some space after spending time in recv? */
+ if (nvme_tcp_queue_has_pending(queue) &&
+ sk_stream_is_writeable(queue->sock->sk))
+ pending = true;
+
if (!pending || !queue->rd_enabled)
return;
@@ -1384,41 +1407,6 @@ static void nvme_tcp_io_work(struct work_struct *w)
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}
-static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
-{
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
-
- ahash_request_free(queue->rcv_hash);
- ahash_request_free(queue->snd_hash);
- crypto_free_ahash(tfm);
-}
-
-static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
-{
- struct crypto_ahash *tfm;
-
- tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
-
- queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
- if (!queue->snd_hash)
- goto free_tfm;
- ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
-
- queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
- if (!queue->rcv_hash)
- goto free_snd_hash;
- ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
-
- return 0;
-free_snd_hash:
- ahash_request_free(queue->snd_hash);
-free_tfm:
- crypto_free_ahash(tfm);
- return -ENOMEM;
-}
-
static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
{
struct nvme_tcp_request *async = &ctrl->async_req;
@@ -1451,9 +1439,6 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
return;
- if (queue->hdr_digest || queue->data_digest)
- nvme_tcp_free_crypto(queue);
-
page_frag_cache_drain(&queue->pf_cache);
noreclaim_flag = memalloc_noreclaim_save();
@@ -1760,9 +1745,14 @@ static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl,
qid, ret);
tls_handshake_cancel(queue->sock->sk);
} else {
- dev_dbg(nctrl->device,
- "queue %d: TLS handshake complete, error %d\n",
- qid, queue->tls_err);
+ if (queue->tls_err) {
+ dev_err(nctrl->device,
+ "queue %d: TLS handshake complete, error %d\n",
+ qid, queue->tls_err);
+ } else {
+ dev_dbg(nctrl->device,
+ "queue %d: TLS handshake complete\n", qid);
+ }
ret = queue->tls_err;
}
return ret;
@@ -1867,21 +1857,13 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
queue->hdr_digest = nctrl->opts->hdr_digest;
queue->data_digest = nctrl->opts->data_digest;
- if (queue->hdr_digest || queue->data_digest) {
- ret = nvme_tcp_alloc_crypto(queue);
- if (ret) {
- dev_err(nctrl->device,
- "failed to allocate queue %d crypto\n", qid);
- goto err_sock;
- }
- }
rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
nvme_tcp_hdgst_len(queue);
queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
if (!queue->pdu) {
ret = -ENOMEM;
- goto err_crypto;
+ goto err_sock;
}
dev_dbg(nctrl->device, "connecting queue %d\n",
@@ -1914,9 +1896,6 @@ err_init_connect:
kernel_sock_shutdown(queue->sock, SHUT_RDWR);
err_rcv_pdu:
kfree(queue->pdu);
-err_crypto:
- if (queue->hdr_digest || queue->data_digest)
- nvme_tcp_free_crypto(queue);
err_sock:
/* ->sock will be released by fput() */
fput(queue->sock->file);
@@ -2200,7 +2179,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
/*
* Only start IO queues for which we have allocated the tagset
- * and limitted it to the available queues. On reconnects, the
+ * and limited it to the available queues. On reconnects, the
* queue number might have changed.
*/
nr_queues = min(ctrl->tagset->nr_hw_queues + 1, ctrl->queue_count);
@@ -2385,14 +2364,14 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
if (ret)
return ret;
- if (ctrl->opts && ctrl->opts->concat && !ctrl->tls_pskid) {
+ if (ctrl->opts->concat && !ctrl->tls_pskid) {
/* See comments for nvme_tcp_key_revoke_needed() */
dev_dbg(ctrl->device, "restart admin queue for secure concatenation\n");
nvme_stop_keep_alive(ctrl);
nvme_tcp_teardown_admin_queue(ctrl, false);
ret = nvme_tcp_configure_admin_queue(ctrl, false);
if (ret)
- return ret;
+ goto destroy_admin;
}
if (ctrl->icdoff) {
@@ -2636,8 +2615,10 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
ctrl->async_req.offset = 0;
ctrl->async_req.curr_bio = NULL;
ctrl->async_req.data_len = 0;
+ init_llist_node(&ctrl->async_req.lentry);
+ INIT_LIST_HEAD(&ctrl->async_req.entry);
- nvme_tcp_queue_request(&ctrl->async_req, true, true);
+ nvme_tcp_queue_request(&ctrl->async_req, true);
}
static void nvme_tcp_complete_timed_out(struct request *rq)
@@ -2789,7 +2770,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
nvme_start_request(rq);
- nvme_tcp_queue_request(req, true, bd->last);
+ nvme_tcp_queue_request(req, bd->last);
return BLK_STS_OK;
}