Diffstat (limited to 'drivers/nvme/host')
 drivers/nvme/host/apple.c     |  30
 drivers/nvme/host/auth.c      | 258
 drivers/nvme/host/core.c      | 329
 drivers/nvme/host/fc.c        |  59
 drivers/nvme/host/ioctl.c     | 118
 drivers/nvme/host/multipath.c |  29
 drivers/nvme/host/nvme.h      |  69
 drivers/nvme/host/pci.c       | 608
 drivers/nvme/host/rdma.c      |  42
 drivers/nvme/host/tcp.c       |  49
 10 files changed, 824 insertions(+), 767 deletions(-)
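
The pattern repeated across these files: the per-namespace queue start/stop helpers become tagset-wide quiesce/unquiesce helpers, controller shutdown folds into nvme_disable_ctrl(), and drivers call nvme_start_request() so multipath can account I/O. A condensed teardown sketch -- hypothetical driver code based on the apple.c hunks below, assuming kernel context -- showing how the renamed helpers fit together:

#include "nvme.h"	/* struct nvme_ctrl and the quiesce helpers */

static void example_disable(struct nvme_ctrl *ctrl, bool shutdown)
{
	/* Tagset-wide quiesce replaces the old per-namespace loop. */
	nvme_quiesce_io_queues(ctrl);

	/* One helper now covers both CC.EN clear and CC.SHN shutdown. */
	nvme_disable_ctrl(ctrl, shutdown);

	nvme_quiesce_admin_queue(ctrl);

	/* ... cancel or complete any remaining requests here ... */

	if (shutdown) {
		/* Unquiesce so blk-mq can fast-fail anything left over. */
		nvme_unquiesce_io_queues(ctrl);
		nvme_unquiesce_admin_queue(ctrl);
	}
}
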
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index ff8b083dc5c6..e36aeb50b4ed 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -763,7 +763,7 @@ static blk_status_t apple_nvme_queue_rq(struct blk_mq_hw_ctx *hctx, goto out_free_cmd; } - blk_mq_start_request(req); + nvme_start_request(req); apple_nvme_submit_cmd(q, cmnd); return BLK_STS_OK; @@ -821,7 +821,7 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown) if (!dead && shutdown && freeze) nvme_wait_freeze_timeout(&anv->ctrl, NVME_IO_TIMEOUT); - nvme_stop_queues(&anv->ctrl); + nvme_quiesce_io_queues(&anv->ctrl); if (!dead) { if (READ_ONCE(anv->ioq.enabled)) { @@ -829,15 +829,13 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown) apple_nvme_remove_cq(anv); } - if (shutdown) - nvme_shutdown_ctrl(&anv->ctrl); - nvme_disable_ctrl(&anv->ctrl); + nvme_disable_ctrl(&anv->ctrl, shutdown); } WRITE_ONCE(anv->ioq.enabled, false); WRITE_ONCE(anv->adminq.enabled, false); mb(); /* ensure that nvme_queue_rq() sees that enabled is cleared */ - nvme_stop_admin_queue(&anv->ctrl); + nvme_quiesce_admin_queue(&anv->ctrl); /* last chance to complete any requests before nvme_cancel_request */ spin_lock_irqsave(&anv->lock, flags); @@ -854,8 +852,8 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown) * deadlocking blk-mq hot-cpu notifier. */ if (shutdown) { - nvme_start_queues(&anv->ctrl); - nvme_start_admin_queue(&anv->ctrl); + nvme_unquiesce_io_queues(&anv->ctrl); + nvme_unquiesce_admin_queue(&anv->ctrl); } } @@ -1093,7 +1091,7 @@ static void apple_nvme_reset_work(struct work_struct *work) dev_dbg(anv->dev, "Starting admin queue"); apple_nvme_init_queue(&anv->adminq); - nvme_start_admin_queue(&anv->ctrl); + nvme_unquiesce_admin_queue(&anv->ctrl); if (!nvme_change_ctrl_state(&anv->ctrl, NVME_CTRL_CONNECTING)) { dev_warn(anv->ctrl.device, @@ -1102,7 +1100,7 @@ static void apple_nvme_reset_work(struct work_struct *work) goto out; } - ret = nvme_init_ctrl_finish(&anv->ctrl); + ret = nvme_init_ctrl_finish(&anv->ctrl, false); if (ret) goto out; @@ -1127,7 +1125,7 @@ static void apple_nvme_reset_work(struct work_struct *work) anv->ctrl.queue_count = nr_io_queues + 1; - nvme_start_queues(&anv->ctrl); + nvme_unquiesce_io_queues(&anv->ctrl); nvme_wait_freeze(&anv->ctrl); blk_mq_update_nr_hw_queues(&anv->tagset, 1); nvme_unfreeze(&anv->ctrl); @@ -1153,7 +1151,7 @@ out: nvme_change_ctrl_state(&anv->ctrl, NVME_CTRL_DELETING); nvme_get_ctrl(&anv->ctrl); apple_nvme_disable(anv, false); - nvme_kill_queues(&anv->ctrl); + nvme_mark_namespaces_dead(&anv->ctrl); if (!queue_work(nvme_wq, &anv->remove_work)) nvme_put_ctrl(&anv->ctrl); } @@ -1507,14 +1505,6 @@ static int apple_nvme_probe(struct platform_device *pdev) goto put_dev; } - if (!blk_get_queue(anv->ctrl.admin_q)) { - nvme_start_admin_queue(&anv->ctrl); - blk_mq_destroy_queue(anv->ctrl.admin_q); - anv->ctrl.admin_q = NULL; - ret = -ENODEV; - goto put_dev; - } - nvme_reset_ctrl(&anv->ctrl); async_schedule(apple_nvme_async_probe, anv); diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index c8a6db7c4498..bb0abbe4491c 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -13,6 +13,10 @@ #include "fabrics.h" #include <linux/nvme-auth.h> +#define CHAP_BUF_SIZE 4096 +static struct kmem_cache *nvme_chap_buf_cache; +static mempool_t *nvme_chap_buf_pool; + struct nvme_dhchap_queue_context { struct list_head entry; struct work_struct auth_work; @@ -20,7 +24,6 @@ struct nvme_dhchap_queue_context { 
struct crypto_shash *shash_tfm; struct crypto_kpp *dh_tfm; void *buf; - size_t buf_size; int qid; int error; u32 s1; @@ -47,6 +50,12 @@ struct nvme_dhchap_queue_context { #define nvme_auth_queue_from_qid(ctrl, qid) \ (qid == 0) ? (ctrl)->fabrics_q : (ctrl)->connect_q +static inline int ctrl_max_dhchaps(struct nvme_ctrl *ctrl) +{ + return ctrl->opts->nr_io_queues + ctrl->opts->nr_write_queues + + ctrl->opts->nr_poll_queues + 1; +} + static int nvme_auth_submit(struct nvme_ctrl *ctrl, int qid, void *data, size_t data_len, bool auth_send) { @@ -112,7 +121,7 @@ static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl, struct nvmf_auth_dhchap_negotiate_data *data = chap->buf; size_t size = sizeof(*data) + sizeof(union nvmf_auth_protocol); - if (chap->buf_size < size) { + if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return -EINVAL; } @@ -147,7 +156,7 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, const char *gid_name = nvme_auth_dhgroup_name(data->dhgid); const char *hmac_name, *kpp_name; - if (chap->buf_size < size) { + if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return NVME_SC_INVALID_FIELD; } @@ -197,12 +206,6 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, return NVME_SC_AUTH_REQUIRED; } - /* Reset host response if the hash had been changed */ - if (chap->hash_id != data->hashid) { - kfree(chap->host_response); - chap->host_response = NULL; - } - chap->hash_id = data->hashid; chap->hash_len = data->hl; dev_dbg(ctrl->device, "qid %d: selected hash %s\n", @@ -219,14 +222,6 @@ select_kpp: return NVME_SC_AUTH_REQUIRED; } - /* Clear host and controller key to avoid accidental reuse */ - kfree_sensitive(chap->host_key); - chap->host_key = NULL; - chap->host_key_len = 0; - kfree_sensitive(chap->ctrl_key); - chap->ctrl_key = NULL; - chap->ctrl_key_len = 0; - if (chap->dhgroup_id == data->dhgid && (data->dhgid == NVME_AUTH_DHGROUP_NULL || chap->dh_tfm)) { dev_dbg(ctrl->device, @@ -302,7 +297,7 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl, if (chap->host_key_len) size += chap->host_key_len; - if (chap->buf_size < size) { + if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return -EINVAL; } @@ -344,10 +339,10 @@ static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl, struct nvmf_auth_dhchap_success1_data *data = chap->buf; size_t size = sizeof(*data); - if (ctrl->ctrl_key) + if (chap->ctrl_key) size += chap->hash_len; - if (chap->buf_size < size) { + if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return NVME_SC_INVALID_FIELD; } @@ -521,6 +516,7 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl, ret = PTR_ERR(ctrl_response); return ret; } + ret = crypto_shash_setkey(chap->shash_tfm, ctrl_response, ctrl->ctrl_key->len); if (ret) { @@ -621,9 +617,6 @@ static int nvme_auth_dhchap_exponential(struct nvme_ctrl *ctrl, if (ret) { dev_dbg(ctrl->device, "failed to generate public key, error %d\n", ret); - kfree(chap->host_key); - chap->host_key = NULL; - chap->host_key_len = 0; chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return ret; } @@ -643,9 +636,6 @@ gen_sesskey: if (ret) { dev_dbg(ctrl->device, "failed to generate shared secret, error %d\n", ret); - kfree_sensitive(chap->sess_key); - chap->sess_key = NULL; - chap->sess_key_len = 0; chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return ret; } @@ 
-654,7 +644,7 @@ gen_sesskey: return 0; } -static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap) +static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap) { kfree_sensitive(chap->host_response); chap->host_response = NULL; @@ -674,24 +664,20 @@ static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap) chap->transaction = 0; memset(chap->c1, 0, sizeof(chap->c1)); memset(chap->c2, 0, sizeof(chap->c2)); + mempool_free(chap->buf, nvme_chap_buf_pool); + chap->buf = NULL; } -static void __nvme_auth_free(struct nvme_dhchap_queue_context *chap) +static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap) { - __nvme_auth_reset(chap); + nvme_auth_reset_dhchap(chap); if (chap->shash_tfm) crypto_free_shash(chap->shash_tfm); if (chap->dh_tfm) crypto_free_kpp(chap->dh_tfm); - kfree_sensitive(chap->ctrl_key); - kfree_sensitive(chap->host_key); - kfree_sensitive(chap->sess_key); - kfree_sensitive(chap->host_response); - kfree(chap->buf); - kfree(chap); } -static void __nvme_auth_work(struct work_struct *work) +static void nvme_queue_auth_work(struct work_struct *work) { struct nvme_dhchap_queue_context *chap = container_of(work, struct nvme_dhchap_queue_context, auth_work); @@ -699,6 +685,16 @@ static void __nvme_auth_work(struct work_struct *work) size_t tl; int ret = 0; + /* + * Allocate a large enough buffer for the entire negotiation: + * 4k is enough to ffdhe8192. + */ + chap->buf = mempool_alloc(nvme_chap_buf_pool, GFP_KERNEL); + if (!chap->buf) { + chap->error = -ENOMEM; + return; + } + chap->transaction = ctrl->transaction++; /* DH-HMAC-CHAP Step 1: send negotiate */ @@ -720,8 +716,9 @@ static void __nvme_auth_work(struct work_struct *work) dev_dbg(ctrl->device, "%s: qid %d receive challenge\n", __func__, chap->qid); - memset(chap->buf, 0, chap->buf_size); - ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false); + memset(chap->buf, 0, CHAP_BUF_SIZE); + ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, CHAP_BUF_SIZE, + false); if (ret) { dev_warn(ctrl->device, "qid %d failed to receive challenge, %s %d\n", @@ -757,11 +754,14 @@ static void __nvme_auth_work(struct work_struct *work) dev_dbg(ctrl->device, "%s: qid %d host response\n", __func__, chap->qid); + mutex_lock(&ctrl->dhchap_auth_mutex); ret = nvme_auth_dhchap_setup_host_response(ctrl, chap); if (ret) { + mutex_unlock(&ctrl->dhchap_auth_mutex); chap->error = ret; goto fail2; } + mutex_unlock(&ctrl->dhchap_auth_mutex); /* DH-HMAC-CHAP Step 3: send reply */ dev_dbg(ctrl->device, "%s: qid %d send reply\n", @@ -783,8 +783,9 @@ static void __nvme_auth_work(struct work_struct *work) dev_dbg(ctrl->device, "%s: qid %d receive success1\n", __func__, chap->qid); - memset(chap->buf, 0, chap->buf_size); - ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false); + memset(chap->buf, 0, CHAP_BUF_SIZE); + ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, CHAP_BUF_SIZE, + false); if (ret) { dev_warn(ctrl->device, "qid %d failed to receive success1, %s %d\n", @@ -801,16 +802,19 @@ static void __nvme_auth_work(struct work_struct *work) return; } + mutex_lock(&ctrl->dhchap_auth_mutex); if (ctrl->ctrl_key) { dev_dbg(ctrl->device, "%s: qid %d controller response\n", __func__, chap->qid); ret = nvme_auth_dhchap_setup_ctrl_response(ctrl, chap); if (ret) { + mutex_unlock(&ctrl->dhchap_auth_mutex); chap->error = ret; goto fail2; } } + mutex_unlock(&ctrl->dhchap_auth_mutex); ret = nvme_auth_process_dhchap_success1(ctrl, chap); if (ret) { @@ -819,7 +823,7 @@ static void 
__nvme_auth_work(struct work_struct *work) goto fail2; } - if (ctrl->ctrl_key) { /* DH-HMAC-CHAP Step 5: send success2 */ dev_dbg(ctrl->device, "%s: qid %d send success2\n", __func__, chap->qid); @@ -860,42 +864,8 @@ int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) return -ENOKEY; } - mutex_lock(&ctrl->dhchap_auth_mutex); - /* Check if the context is already queued */ - list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) { - WARN_ON(!chap->buf); - if (chap->qid == qid) { - dev_dbg(ctrl->device, "qid %d: re-using context\n", qid); - mutex_unlock(&ctrl->dhchap_auth_mutex); - flush_work(&chap->auth_work); - __nvme_auth_reset(chap); - queue_work(nvme_wq, &chap->auth_work); - return 0; - } - } - chap = kzalloc(sizeof(*chap), GFP_KERNEL); - if (!chap) { - mutex_unlock(&ctrl->dhchap_auth_mutex); - return -ENOMEM; - } - chap->qid = (qid == NVME_QID_ANY) ? 0 : qid; - chap->ctrl = ctrl; - - /* - * Allocate a large enough buffer for the entire negotiation: - * 4k should be enough to ffdhe8192. - */ - chap->buf_size = 4096; - chap->buf = kzalloc(chap->buf_size, GFP_KERNEL); - if (!chap->buf) { - mutex_unlock(&ctrl->dhchap_auth_mutex); - kfree(chap); - return -ENOMEM; - } - - INIT_WORK(&chap->auth_work, __nvme_auth_work); - list_add(&chap->entry, &ctrl->dhchap_auth_list); - mutex_unlock(&ctrl->dhchap_auth_mutex); + chap = &ctrl->dhchap_ctxs[qid]; + cancel_work_sync(&chap->auth_work); queue_work(nvme_wq, &chap->auth_work); return 0; } @@ -906,40 +876,28 @@ int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid) struct nvme_dhchap_queue_context *chap; int ret; - mutex_lock(&ctrl->dhchap_auth_mutex); - list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) { - if (chap->qid != qid) - continue; - mutex_unlock(&ctrl->dhchap_auth_mutex); - flush_work(&chap->auth_work); - ret = chap->error; - return ret; - } - mutex_unlock(&ctrl->dhchap_auth_mutex); - return -ENXIO; + chap = &ctrl->dhchap_ctxs[qid]; + flush_work(&chap->auth_work); + ret = chap->error; + /* clear sensitive info */ + nvme_auth_reset_dhchap(chap); + return ret; } EXPORT_SYMBOL_GPL(nvme_auth_wait); -void nvme_auth_reset(struct nvme_ctrl *ctrl) -{ - struct nvme_dhchap_queue_context *chap; - - mutex_lock(&ctrl->dhchap_auth_mutex); - list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) { - mutex_unlock(&ctrl->dhchap_auth_mutex); - flush_work(&chap->auth_work); - __nvme_auth_reset(chap); - } - mutex_unlock(&ctrl->dhchap_auth_mutex); -} -EXPORT_SYMBOL_GPL(nvme_auth_reset); - -static void nvme_dhchap_auth_work(struct work_struct *work) +static void nvme_ctrl_auth_work(struct work_struct *work) { struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, dhchap_auth_work); int ret, q; + /* + * If the ctrl is not connected, bail as reconnect will handle + * authentication. + */ + if (ctrl->state != NVME_CTRL_LIVE) + return; + /* Authenticate admin queue first */ ret = nvme_auth_negotiate(ctrl, 0); if (ret) { @@ -968,43 +926,75 @@ static void nvme_dhchap_auth_work(struct work_struct *work) * Failure is a soft-state; credentials remain valid until * the controller terminates the connection.
*/ + for (q = 1; q < ctrl->queue_count; q++) { + ret = nvme_auth_wait(ctrl, q); + if (ret) + dev_warn(ctrl->device, + "qid %d: authentication failed\n", q); + } } -void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) +int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) { - INIT_LIST_HEAD(&ctrl->dhchap_auth_list); - INIT_WORK(&ctrl->dhchap_auth_work, nvme_dhchap_auth_work); + struct nvme_dhchap_queue_context *chap; + int i, ret; + mutex_init(&ctrl->dhchap_auth_mutex); + INIT_WORK(&ctrl->dhchap_auth_work, nvme_ctrl_auth_work); if (!ctrl->opts) - return; - nvme_auth_generate_key(ctrl->opts->dhchap_secret, &ctrl->host_key); - nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret, &ctrl->ctrl_key); + return 0; + ret = nvme_auth_generate_key(ctrl->opts->dhchap_secret, + &ctrl->host_key); + if (ret) + return ret; + ret = nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret, + &ctrl->ctrl_key); + if (ret) + goto err_free_dhchap_secret; + + if (!ctrl->opts->dhchap_secret && !ctrl->opts->dhchap_ctrl_secret) + return ret; + + ctrl->dhchap_ctxs = kvcalloc(ctrl_max_dhchaps(ctrl), + sizeof(*chap), GFP_KERNEL); + if (!ctrl->dhchap_ctxs) { + ret = -ENOMEM; + goto err_free_dhchap_ctrl_secret; + } + + for (i = 0; i < ctrl_max_dhchaps(ctrl); i++) { + chap = &ctrl->dhchap_ctxs[i]; + chap->qid = i; + chap->ctrl = ctrl; + INIT_WORK(&chap->auth_work, nvme_queue_auth_work); + } + + return 0; +err_free_dhchap_ctrl_secret: + nvme_auth_free_key(ctrl->ctrl_key); + ctrl->ctrl_key = NULL; +err_free_dhchap_secret: + nvme_auth_free_key(ctrl->host_key); + ctrl->host_key = NULL; + return ret; } EXPORT_SYMBOL_GPL(nvme_auth_init_ctrl); void nvme_auth_stop(struct nvme_ctrl *ctrl) { - struct nvme_dhchap_queue_context *chap = NULL, *tmp; - cancel_work_sync(&ctrl->dhchap_auth_work); - mutex_lock(&ctrl->dhchap_auth_mutex); - list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry) - cancel_work_sync(&chap->auth_work); - mutex_unlock(&ctrl->dhchap_auth_mutex); } EXPORT_SYMBOL_GPL(nvme_auth_stop); void nvme_auth_free(struct nvme_ctrl *ctrl) { - struct nvme_dhchap_queue_context *chap = NULL, *tmp; + int i; - mutex_lock(&ctrl->dhchap_auth_mutex); - list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry) { - list_del_init(&chap->entry); - flush_work(&chap->auth_work); - __nvme_auth_free(chap); + if (ctrl->dhchap_ctxs) { + for (i = 0; i < ctrl_max_dhchaps(ctrl); i++) + nvme_auth_free_dhchap(&ctrl->dhchap_ctxs[i]); + kfree(ctrl->dhchap_ctxs); } - mutex_unlock(&ctrl->dhchap_auth_mutex); if (ctrl->host_key) { nvme_auth_free_key(ctrl->host_key); ctrl->host_key = NULL; @@ -1015,3 +1005,27 @@ void nvme_auth_free(struct nvme_ctrl *ctrl) } } EXPORT_SYMBOL_GPL(nvme_auth_free); + +int __init nvme_init_auth(void) +{ + nvme_chap_buf_cache = kmem_cache_create("nvme-chap-buf-cache", + CHAP_BUF_SIZE, 0, SLAB_HWCACHE_ALIGN, NULL); + if (!nvme_chap_buf_cache) + return -ENOMEM; + + nvme_chap_buf_pool = mempool_create(16, mempool_alloc_slab, + mempool_free_slab, nvme_chap_buf_cache); + if (!nvme_chap_buf_pool) + goto err_destroy_chap_buf_cache; + + return 0; +err_destroy_chap_buf_cache: + kmem_cache_destroy(nvme_chap_buf_cache); + return -ENOMEM; +} + +void __exit nvme_exit_auth(void) +{ + mempool_destroy(nvme_chap_buf_pool); + kmem_cache_destroy(nvme_chap_buf_cache); +} diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index da55ce45ac70..e26b085a007a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -384,6 +384,8 @@ static inline void nvme_end_req(struct request *req) nvme_log_error(req); 
nvme_end_req_zoned(req); nvme_trace_bio_complete(req); + if (req->cmd_flags & REQ_NVME_MPATH) + nvme_mpath_end_request(req); blk_mq_end_request(req, status); } @@ -851,8 +853,11 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, cmnd->write_zeroes.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + if (!(req->cmd_flags & REQ_NOUNMAP) && (ns->features & NVME_NS_DEAC)) + cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC); + if (nvme_ns_has_pi(ns)) { - cmnd->write_zeroes.control = cpu_to_le16(NVME_RW_PRINFO_PRACT); + cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT); switch (ns->pi_type) { case NVME_NS_DPS_PI_TYPE1: @@ -1118,11 +1123,12 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, nvme_unfreeze(ctrl); nvme_mpath_unfreeze(ctrl->subsys); mutex_unlock(&ctrl->subsys->lock); - nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); mutex_unlock(&ctrl->scan_lock); } - if (effects & NVME_CMD_EFFECTS_CCC) - nvme_init_ctrl_finish(ctrl); + if (effects & NVME_CMD_EFFECTS_CCC) { + dev_info(ctrl->device, +"controller capabilities changed, reset may be required to take effect.\n"); + } if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { nvme_queue_scan(ctrl); flush_work(&ctrl->scan_work); @@ -2003,6 +2009,14 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, } } + /* + * Only set the DEAC bit if the device guarantees that reads from + * deallocated data return zeroes. While the DEAC bit does not + * require that, it must be a no-op if reads from deallocated data + * do not return zeroes. + */ + if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3))) + ns->features |= NVME_NS_DEAC; set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info)); set_bit(NVME_NS_READY, &ns->flags); blk_mq_unfreeze_queue(ns->disk->queue); @@ -2179,7 +2193,7 @@ const struct pr_ops nvme_pr_ops = { }; #ifdef CONFIG_BLK_SED_OPAL -int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, +static int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, bool send) { struct nvme_ctrl *ctrl = data; @@ -2196,7 +2210,23 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, NVME_QID_ANY, 1, 0); } -EXPORT_SYMBOL_GPL(nvme_sec_submit); + +static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended) +{ + if (ctrl->oacs & NVME_CTRL_OACS_SEC_SUPP) { + if (!ctrl->opal_dev) + ctrl->opal_dev = init_opal_dev(ctrl, &nvme_sec_submit); + else if (was_suspended) + opal_unlock_from_suspend(ctrl->opal_dev); + } else { + free_opal_dev(ctrl->opal_dev); + ctrl->opal_dev = NULL; + } +} +#else +static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended) +{ +} #endif /* CONFIG_BLK_SED_OPAL */ #ifdef CONFIG_BLK_DEV_ZONED @@ -2221,16 +2251,17 @@ static const struct block_device_operations nvme_bdev_ops = { .pr_ops = &nvme_pr_ops, }; -static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled) +static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 mask, u32 val, + u32 timeout, const char *op) { - unsigned long timeout_jiffies = ((timeout + 1) * HZ / 2) + jiffies; - u32 csts, bit = enabled ? 
NVME_CSTS_RDY : 0; + unsigned long timeout_jiffies = jiffies + timeout * HZ; + u32 csts; int ret; while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { if (csts == ~0) return -ENODEV; - if ((csts & NVME_CSTS_RDY) == bit) + if ((csts & mask) == val) break; usleep_range(1000, 2000); @@ -2239,7 +2270,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled) if (time_after(jiffies, timeout_jiffies)) { dev_err(ctrl->device, "Device not ready; aborting %s, CSTS=0x%x\n", - enabled ? "initialisation" : "reset", csts); + op, csts); return -ENODEV; } } @@ -2247,27 +2278,29 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled) return ret; } -/* - * If the device has been passed off to us in an enabled state, just clear - * the enabled bit. The spec says we should set the 'shutdown notification - * bits', but doing so may cause the device to complete commands to the - * admin queue ... and we don't know what memory that might be pointing at! - */ -int nvme_disable_ctrl(struct nvme_ctrl *ctrl) +int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown) { int ret; ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; - ctrl->ctrl_config &= ~NVME_CC_ENABLE; + if (shutdown) + ctrl->ctrl_config |= NVME_CC_SHN_NORMAL; + else + ctrl->ctrl_config &= ~NVME_CC_ENABLE; ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); if (ret) return ret; + if (shutdown) { + return nvme_wait_ready(ctrl, NVME_CSTS_SHST_MASK, + NVME_CSTS_SHST_CMPLT, + ctrl->shutdown_timeout, "shutdown"); + } if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) msleep(NVME_QUIRK_DELAY_AMOUNT); - - return nvme_wait_ready(ctrl, NVME_CAP_TIMEOUT(ctrl->cap), false); + return nvme_wait_ready(ctrl, NVME_CSTS_RDY, 0, + (NVME_CAP_TIMEOUT(ctrl->cap) + 1) / 2, "reset"); } EXPORT_SYMBOL_GPL(nvme_disable_ctrl); @@ -2332,41 +2365,11 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); if (ret) return ret; - return nvme_wait_ready(ctrl, timeout, true); + return nvme_wait_ready(ctrl, NVME_CSTS_RDY, NVME_CSTS_RDY, + (timeout + 1) / 2, "initialisation"); } EXPORT_SYMBOL_GPL(nvme_enable_ctrl); -int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) -{ - unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ); - u32 csts; - int ret; - - ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; - ctrl->ctrl_config |= NVME_CC_SHN_NORMAL; - - ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); - if (ret) - return ret; - - while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { - if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT) - break; - - msleep(100); - if (fatal_signal_pending(current)) - return -EINTR; - if (time_after(jiffies, timeout)) { - dev_err(ctrl->device, - "Device shutdown incomplete; abort shutdown\n"); - return -ENODEV; - } - } - - return ret; -} -EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl); - static int nvme_configure_timestamp(struct nvme_ctrl *ctrl) { __le64 ts; @@ -3049,7 +3052,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) id = kzalloc(sizeof(*id), GFP_KERNEL); if (!id) - return 0; + return -ENOMEM; c.identify.opcode = nvme_admin_identify; c.identify.cns = NVME_ID_CNS_CS_CTRL; @@ -3095,10 +3098,6 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) if (!ctrl->identified) { unsigned int i; - ret = nvme_init_subsystem(ctrl, id); - if (ret) - goto out_free; - /* * Check for quirks. 
Quirk can depend on firmware version, * so, in principle, the set of quirks present can change @@ -3111,6 +3110,10 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) if (quirk_matches(id, &core_quirks[i])) ctrl->quirks |= core_quirks[i].quirks; } + + ret = nvme_init_subsystem(ctrl, id); + if (ret) + goto out_free; } memcpy(ctrl->subsys->firmware_rev, id->fr, sizeof(ctrl->subsys->firmware_rev)); @@ -3229,7 +3232,7 @@ out_free: * register in our nvme_ctrl structure. This should be called as soon as * the admin queue is fully up and running. */ -int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) +int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended) { int ret; @@ -3260,6 +3263,8 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) if (ret < 0) return ret; + nvme_configure_opal(ctrl, was_suspended); + if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) { /* * Do not return errors unless we are in a controller reset, @@ -3745,15 +3750,19 @@ static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev, memcpy(dhchap_secret, buf, count); nvme_auth_stop(ctrl); if (strcmp(dhchap_secret, opts->dhchap_secret)) { + struct nvme_dhchap_key *key, *host_key; int ret; - ret = nvme_auth_generate_key(dhchap_secret, &ctrl->host_key); + ret = nvme_auth_generate_key(dhchap_secret, &key); if (ret) return ret; kfree(opts->dhchap_secret); opts->dhchap_secret = dhchap_secret; - /* Key has changed; re-authentication with new key */ - nvme_auth_reset(ctrl); + host_key = ctrl->host_key; + mutex_lock(&ctrl->dhchap_auth_mutex); + ctrl->host_key = key; + mutex_unlock(&ctrl->dhchap_auth_mutex); + nvme_auth_free_key(host_key); } /* Start re-authentication */ dev_info(ctrl->device, "re-authenticating controller\n"); @@ -3795,15 +3804,19 @@ static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev, memcpy(dhchap_secret, buf, count); nvme_auth_stop(ctrl); if (strcmp(dhchap_secret, opts->dhchap_ctrl_secret)) { + struct nvme_dhchap_key *key, *ctrl_key; int ret; - ret = nvme_auth_generate_key(dhchap_secret, &ctrl->ctrl_key); + ret = nvme_auth_generate_key(dhchap_secret, &key); if (ret) return ret; kfree(opts->dhchap_ctrl_secret); opts->dhchap_ctrl_secret = dhchap_secret; - /* Key has changed; re-authentication with new key */ - nvme_auth_reset(ctrl); + ctrl_key = ctrl->ctrl_key; + mutex_lock(&ctrl->dhchap_auth_mutex); + ctrl->ctrl_key = key; + mutex_unlock(&ctrl->dhchap_auth_mutex); + nvme_auth_free_key(ctrl_key); } /* Start re-authentication */ dev_info(ctrl->device, "re-authenticating controller\n"); @@ -3875,10 +3888,11 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, return a->mode; } -static const struct attribute_group nvme_dev_attrs_group = { +const struct attribute_group nvme_dev_attrs_group = { .attrs = nvme_dev_attrs, .is_visible = nvme_dev_attrs_are_visible, }; +EXPORT_SYMBOL_GPL(nvme_dev_attrs_group); static const struct attribute_group *nvme_dev_attr_groups[] = { &nvme_dev_attrs_group, @@ -4304,7 +4318,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) mutex_unlock(&ns->ctrl->subsys->lock); /* guarantee not available in head->list */ - synchronize_rcu(); + synchronize_srcu(&ns->head->srcu); if (!nvme_ns_head_multipath(ns->head)) nvme_cdev_del(&ns->cdev, &ns->cdev_device); @@ -4333,10 +4347,6 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info) { int ret = NVME_SC_INVALID_NS | NVME_SC_DNR; - if (test_bit(NVME_NS_DEAD, &ns->flags)) - goto out; - - ret = NVME_SC_INVALID_NS | NVME_SC_DNR; if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) 
{ dev_err(ns->ctrl->device, "identifiers changed for nsid %d\n", ns->head->ns_id); @@ -4407,7 +4417,7 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, down_write(&ctrl->namespaces_rwsem); list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { - if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags)) + if (ns->head->ns_id > nsid) list_move_tail(&ns->list, &rm_list); } up_write(&ctrl->namespaces_rwsem); @@ -4424,9 +4434,6 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) u32 prev = 0; int ret = 0, i; - if (nvme_ctrl_limited_cns(ctrl)) - return -EOPNOTSUPP; - ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); if (!ns_list) return -ENOMEM; @@ -4534,8 +4541,18 @@ static void nvme_scan_work(struct work_struct *work) } mutex_lock(&ctrl->scan_lock); - if (nvme_scan_ns_list(ctrl) != 0) + if (nvme_ctrl_limited_cns(ctrl)) { nvme_scan_ns_sequential(ctrl); + } else { + /* + * Fall back to sequential scan if DNR is set to handle broken + * devices which should support Identify NS List (as per the VS + * they report) but don't actually support it. + */ + ret = nvme_scan_ns_list(ctrl); + if (ret > 0 && ret & NVME_SC_DNR) + nvme_scan_ns_sequential(ctrl); + } mutex_unlock(&ctrl->scan_lock); } @@ -4565,8 +4582,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) * removing the namespaces' disks; fail all the queues now to avoid * potentially having to clean up the failed sync later. */ - if (ctrl->state == NVME_CTRL_DEAD) - nvme_kill_queues(ctrl); + if (ctrl->state == NVME_CTRL_DEAD) { + nvme_mark_namespaces_dead(ctrl); + nvme_unquiesce_io_queues(ctrl); + } /* this is a no-op when called from the controller reset handler */ nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO); @@ -4692,7 +4711,7 @@ static void nvme_fw_act_work(struct work_struct *work) fw_act_timeout = jiffies + msecs_to_jiffies(admin_timeout * 1000); - nvme_stop_queues(ctrl); + nvme_quiesce_io_queues(ctrl); while (nvme_ctrl_pp_status(ctrl)) { if (time_after(jiffies, fw_act_timeout)) { dev_warn(ctrl->device, @@ -4706,7 +4725,7 @@ static void nvme_fw_act_work(struct work_struct *work) if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) return; - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); /* read FW slot information to clear the AER */ nvme_get_fw_slot_info(ctrl); @@ -4811,8 +4830,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, EXPORT_SYMBOL_GPL(nvme_complete_async_event); int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, - const struct blk_mq_ops *ops, unsigned int flags, - unsigned int cmd_size) + const struct blk_mq_ops *ops, unsigned int cmd_size) { int ret; @@ -4822,7 +4840,9 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, if (ctrl->ops->flags & NVME_F_FABRICS) set->reserved_tags = NVMF_RESERVED_TAGS; set->numa_node = ctrl->numa_node; - set->flags = flags; + set->flags = BLK_MQ_F_NO_SCHED; + if (ctrl->ops->flags & NVME_F_BLOCKING) + set->flags |= BLK_MQ_F_BLOCKING; set->cmd_size = cmd_size; set->driver_data = ctrl; set->nr_hw_queues = 1; @@ -4850,6 +4870,7 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, out_cleanup_admin_q: blk_mq_destroy_queue(ctrl->admin_q); + blk_put_queue(ctrl->admin_q); out_free_tagset: blk_mq_free_tag_set(ctrl->admin_tagset); return ret; @@ -4859,14 +4880,17 @@ EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set); void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl) { blk_mq_destroy_queue(ctrl->admin_q); - if (ctrl->ops->flags 
& NVME_F_FABRICS) + blk_put_queue(ctrl->admin_q); + if (ctrl->ops->flags & NVME_F_FABRICS) { blk_mq_destroy_queue(ctrl->fabrics_q); + blk_put_queue(ctrl->fabrics_q); + } blk_mq_free_tag_set(ctrl->admin_tagset); } EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set); int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, - const struct blk_mq_ops *ops, unsigned int flags, + const struct blk_mq_ops *ops, unsigned int nr_maps, unsigned int cmd_size) { int ret; @@ -4874,15 +4898,23 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, memset(set, 0, sizeof(*set)); set->ops = ops; set->queue_depth = ctrl->sqsize + 1; - set->reserved_tags = NVMF_RESERVED_TAGS; + /* + * Some Apple controllers require tags to be unique across admin and + * the (only) I/O queue, so reserve the first 32 tags of the I/O queue. + */ + if (ctrl->quirks & NVME_QUIRK_SHARED_TAGS) + set->reserved_tags = NVME_AQ_DEPTH; + else if (ctrl->ops->flags & NVME_F_FABRICS) + set->reserved_tags = NVMF_RESERVED_TAGS; set->numa_node = ctrl->numa_node; - set->flags = flags; + set->flags = BLK_MQ_F_SHOULD_MERGE; + if (ctrl->ops->flags & NVME_F_BLOCKING) + set->flags |= BLK_MQ_F_BLOCKING; set->cmd_size = cmd_size, set->driver_data = ctrl; set->nr_hw_queues = ctrl->queue_count - 1; set->timeout = NVME_IO_TIMEOUT; - if (ops->map_queues) - set->nr_maps = ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2; + set->nr_maps = nr_maps; ret = blk_mq_alloc_tag_set(set); if (ret) return ret; @@ -4893,6 +4925,8 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, ret = PTR_ERR(ctrl->connect_q); goto out_free_tag_set; } + blk_queue_flag_set(QUEUE_FLAG_SKIP_TAGSET_QUIESCE, + ctrl->connect_q); } ctrl->tagset = set; @@ -4906,8 +4940,10 @@ EXPORT_SYMBOL_GPL(nvme_alloc_io_tag_set); void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl) { - if (ctrl->ops->flags & NVME_F_FABRICS) + if (ctrl->ops->flags & NVME_F_FABRICS) { blk_mq_destroy_queue(ctrl->connect_q); + blk_put_queue(ctrl->connect_q); + } blk_mq_free_tag_set(ctrl->tagset); } EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set); @@ -4943,7 +4979,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) if (ctrl->queue_count > 1) { nvme_queue_scan(ctrl); - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); nvme_mpath_update(ctrl); } @@ -4988,6 +5024,7 @@ static void nvme_free_ctrl(struct device *dev) nvme_auth_stop(ctrl); nvme_auth_free(ctrl); __free_page(ctrl->discard_page); + free_opal_dev(ctrl->opal_dev); if (subsys) { mutex_lock(&nvme_subsystems_lock); @@ -5053,7 +5090,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ctrl->instance); ctrl->device->class = nvme_class; ctrl->device->parent = ctrl->dev; - ctrl->device->groups = nvme_dev_attr_groups; + if (ops->dev_attr_groups) + ctrl->device->groups = ops->dev_attr_groups; + else + ctrl->device->groups = nvme_dev_attr_groups; ctrl->device->release = nvme_free_ctrl; dev_set_drvdata(ctrl->device, ctrl); ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance); @@ -5077,9 +5117,13 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device)); nvme_mpath_init_ctrl(ctrl); - nvme_auth_init_ctrl(ctrl); + ret = nvme_auth_init_ctrl(ctrl); + if (ret) + goto out_free_cdev; return 0; +out_free_cdev: + cdev_device_del(&ctrl->cdev, ctrl->device); out_free_name: nvme_put_ctrl(ctrl); kfree_const(ctrl->device->kobj.name); @@ -5092,62 +5136,17 @@ out: } EXPORT_SYMBOL_GPL(nvme_init_ctrl); -static void
nvme_start_ns_queue(struct nvme_ns *ns) -{ - if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags)) - blk_mq_unquiesce_queue(ns->queue); -} - -static void nvme_stop_ns_queue(struct nvme_ns *ns) -{ - if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags)) - blk_mq_quiesce_queue(ns->queue); - else - blk_mq_wait_quiesce_done(ns->queue); -} - -/* - * Prepare a queue for teardown. - * - * This must forcibly unquiesce queues to avoid blocking dispatch, and only set - * the capacity to 0 after that to avoid blocking dispatchers that may be - * holding bd_butex. This will end buffered writers dirtying pages that can't - * be synced. - */ -static void nvme_set_queue_dying(struct nvme_ns *ns) -{ - if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) - return; - - blk_mark_disk_dead(ns->disk); - nvme_start_ns_queue(ns); - - set_capacity_and_notify(ns->disk, 0); -} - -/** - * nvme_kill_queues(): Ends all namespace queues - * @ctrl: the dead controller that needs to end - * - * Call this function when the driver determines it is unable to get the - * controller in a state capable of servicing IO. - */ -void nvme_kill_queues(struct nvme_ctrl *ctrl) +/* let I/O to all namespaces fail in preparation for surprise removal */ +void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; down_read(&ctrl->namespaces_rwsem); - - /* Forcibly unquiesce queues to avoid blocking dispatch */ - if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q)) - nvme_start_admin_queue(ctrl); - list_for_each_entry(ns, &ctrl->namespaces, list) - nvme_set_queue_dying(ns); - + blk_mark_disk_dead(ns->disk); up_read(&ctrl->namespaces_rwsem); } -EXPORT_SYMBOL_GPL(nvme_kill_queues); +EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead); void nvme_unfreeze(struct nvme_ctrl *ctrl) { @@ -5197,43 +5196,41 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_start_freeze); -void nvme_stop_queues(struct nvme_ctrl *ctrl) +void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl) { - struct nvme_ns *ns; - - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) - nvme_stop_ns_queue(ns); - up_read(&ctrl->namespaces_rwsem); + if (!ctrl->tagset) + return; + if (!test_and_set_bit(NVME_CTRL_STOPPED, &ctrl->flags)) + blk_mq_quiesce_tagset(ctrl->tagset); + else + blk_mq_wait_quiesce_done(ctrl->tagset); } -EXPORT_SYMBOL_GPL(nvme_stop_queues); +EXPORT_SYMBOL_GPL(nvme_quiesce_io_queues); -void nvme_start_queues(struct nvme_ctrl *ctrl) +void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl) { - struct nvme_ns *ns; - - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) - nvme_start_ns_queue(ns); - up_read(&ctrl->namespaces_rwsem); + if (!ctrl->tagset) + return; + if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags)) + blk_mq_unquiesce_tagset(ctrl->tagset); } -EXPORT_SYMBOL_GPL(nvme_start_queues); +EXPORT_SYMBOL_GPL(nvme_unquiesce_io_queues); -void nvme_stop_admin_queue(struct nvme_ctrl *ctrl) +void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl) { if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags)) blk_mq_quiesce_queue(ctrl->admin_q); else - blk_mq_wait_quiesce_done(ctrl->admin_q); + blk_mq_wait_quiesce_done(ctrl->admin_q->tag_set); } -EXPORT_SYMBOL_GPL(nvme_stop_admin_queue); +EXPORT_SYMBOL_GPL(nvme_quiesce_admin_queue); -void nvme_start_admin_queue(struct nvme_ctrl *ctrl) +void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl) { if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags)) blk_mq_unquiesce_queue(ctrl->admin_q); } 
-EXPORT_SYMBOL_GPL(nvme_start_admin_queue); +EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue); void nvme_sync_io_queues(struct nvme_ctrl *ctrl) { @@ -5344,8 +5341,13 @@ static int __init nvme_core_init(void) goto unregister_generic_ns; } + result = nvme_init_auth(); + if (result) + goto destroy_ns_chr; return 0; +destroy_ns_chr: + class_destroy(nvme_ns_chr_class); unregister_generic_ns: unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); destroy_subsys_class: @@ -5366,6 +5368,7 @@ out: static void __exit nvme_core_exit(void) { + nvme_exit_auth(); class_destroy(nvme_ns_chr_class); class_destroy(nvme_subsys_class); class_destroy(nvme_class); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 5d57a042dbca..4564f16a0b20 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1475,6 +1475,8 @@ nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp) fc_dma_unmap_single(lport->dev, lsop->rspdma, sizeof(*lsop->rspbuf), DMA_TO_DEVICE); + kfree(lsop->rspbuf); + kfree(lsop->rqstbuf); kfree(lsop); nvme_fc_rport_put(rport); @@ -1699,6 +1701,15 @@ restart: spin_unlock_irqrestore(&rport->lock, flags); } +static +void nvme_fc_rcv_ls_req_err_msg(struct nvme_fc_lport *lport, + struct fcnvme_ls_rqst_w0 *w0) +{ + dev_info(lport->dev, "RCV %s LS failed: No memory\n", + (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? + nvmefc_ls_names[w0->ls_cmd] : ""); +} + /** * nvme_fc_rcv_ls_req - transport entry point called by an LLDD * upon the reception of a NVME LS request. @@ -1751,20 +1762,20 @@ nvme_fc_rcv_ls_req(struct nvme_fc_remote_port *portptr, goto out_put; } - lsop = kzalloc(sizeof(*lsop) + - sizeof(union nvmefc_ls_requests) + - sizeof(union nvmefc_ls_responses), - GFP_KERNEL); + lsop = kzalloc(sizeof(*lsop), GFP_KERNEL); if (!lsop) { - dev_info(lport->dev, - "RCV %s LS failed: No memory\n", - (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ? - nvmefc_ls_names[w0->ls_cmd] : ""); + nvme_fc_rcv_ls_req_err_msg(lport, w0); ret = -ENOMEM; goto out_put; } - lsop->rqstbuf = (union nvmefc_ls_requests *)&lsop[1]; - lsop->rspbuf = (union nvmefc_ls_responses *)&lsop->rqstbuf[1]; + + lsop->rqstbuf = kzalloc(sizeof(*lsop->rqstbuf), GFP_KERNEL); + lsop->rspbuf = kzalloc(sizeof(*lsop->rspbuf), GFP_KERNEL); + if (!lsop->rqstbuf || !lsop->rspbuf) { + nvme_fc_rcv_ls_req_err_msg(lport, w0); + ret = -ENOMEM; + goto out_free; + } lsop->rspdma = fc_dma_map_single(lport->dev, lsop->rspbuf, sizeof(*lsop->rspbuf), @@ -1801,6 +1812,8 @@ out_unmap: fc_dma_unmap_single(lport->dev, lsop->rspdma, sizeof(*lsop->rspbuf), DMA_TO_DEVICE); out_free: + kfree(lsop->rspbuf); + kfree(lsop->rqstbuf); kfree(lsop); out_put: nvme_fc_rport_put(rport); @@ -2391,7 +2404,7 @@ nvme_fc_ctrl_free(struct kref *ref) list_del(&ctrl->ctrl_list); spin_unlock_irqrestore(&ctrl->rport->lock, flags); - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); nvme_remove_admin_tag_set(&ctrl->ctrl); kfree(ctrl->queues); @@ -2492,20 +2505,20 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) * (but with error status). 
*/ if (ctrl->ctrl.queue_count > 1) { - nvme_stop_queues(&ctrl->ctrl); + nvme_quiesce_io_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); blk_mq_tagset_wait_completed_request(&ctrl->tag_set); if (start_queues) - nvme_start_queues(&ctrl->ctrl); + nvme_unquiesce_io_queues(&ctrl->ctrl); } /* * Other transports, which don't have link-level contexts bound * to sqe's, would try to gracefully shutdown the controller by * writing the registers for shutdown and polling (call - * nvme_shutdown_ctrl()). Given a bunch of i/o was potentially + * nvme_disable_ctrl()). Given a bunch of i/o was potentially * just aborted and we will wait on those contexts, and given * there was no indication of how live the controller is on the * link, don't send more io to create more contexts for the @@ -2516,13 +2529,13 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) /* * clean up the admin queue. Same thing as above. */ - nvme_stop_admin_queue(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); if (start_queues) - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); } static void @@ -2732,7 +2745,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, atomic_set(&op->state, FCPOP_STATE_ACTIVE); if (!(op->flags & FCOP_FLAGS_AEN)) - blk_mq_start_request(op->rq); + nvme_start_request(op->rq); cmdiu->csn = cpu_to_be32(atomic_inc_return(&queue->csn)); ret = ctrl->lport->ops->fcp_io(&ctrl->lport->localport, @@ -2903,7 +2916,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) nvme_fc_init_io_queues(ctrl); ret = nvme_alloc_io_tag_set(&ctrl->ctrl, &ctrl->tag_set, - &nvme_fc_mq_ops, BLK_MQ_F_SHOULD_MERGE, + &nvme_fc_mq_ops, 1, struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, ctrl->lport->ops->fcprqst_priv_sz)); if (ret) @@ -3104,9 +3117,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments << (ilog2(SZ_4K) - 9); - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); - ret = nvme_init_ctrl_finish(&ctrl->ctrl); + ret = nvme_init_ctrl_finish(&ctrl->ctrl, false); if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) goto out_disconnect_admin_queue; @@ -3250,10 +3263,10 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) nvme_fc_free_queue(&ctrl->queues[0]); /* re-enable the admin_q so anything new can fast fail */ - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); /* resume the io queues so that things will fast fail */ - nvme_start_queues(&ctrl->ctrl); + nvme_unquiesce_io_queues(&ctrl->ctrl); nvme_fc_ctlr_inactive_on_rport(ctrl); } @@ -3509,7 +3522,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, nvme_fc_init_queue(ctrl, 0); ret = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, - &nvme_fc_admin_mq_ops, BLK_MQ_F_NO_SCHED, + &nvme_fc_admin_mq_ops, struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv, ctrl->lport->ops->fcprqst_priv_sz)); if (ret) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 81f5550b670d..9ddda571f046 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -8,6 +8,50 @@ #include <linux/io_uring.h> #include "nvme.h" +static bool nvme_cmd_allowed(struct nvme_ns *ns,
struct nvme_command *c, + fmode_t mode) +{ + if (capable(CAP_SYS_ADMIN)) + return true; + + /* + * Do not allow unprivileged processes to send vendor specific or fabrics + * commands as we can't be sure about their effects. + */ + if (c->common.opcode >= nvme_cmd_vendor_start || + c->common.opcode == nvme_fabrics_command) + return false; + + /* + * Do not allow unprivileged passthrough of admin commands except + * for a subset of identify commands that contain information required + * to form proper I/O commands in userspace and do not expose any + * potentially sensitive information. + */ + if (!ns) { + if (c->common.opcode == nvme_admin_identify) { + switch (c->identify.cns) { + case NVME_ID_CNS_NS: + case NVME_ID_CNS_CS_NS: + case NVME_ID_CNS_NS_CS_INDEP: + case NVME_ID_CNS_CS_CTRL: + case NVME_ID_CNS_CTRL: + return true; + } + } + return false; + } + + /* + * Only allow I/O commands that transfer data to the controller if the + * special file is open for writing, but always allow I/O commands that + * transfer data from the controller. + */ + if (nvme_is_write(c)) + return mode & FMODE_WRITE; + return true; +} + /* * Convert integer values from ioctl structures to user pointers, silently * ignoring the upper bits in the compat case to match behaviour of 32-bit @@ -261,7 +305,7 @@ static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, } static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd __user *ucmd) + struct nvme_passthru_cmd __user *ucmd, fmode_t mode) { struct nvme_passthru_cmd cmd; struct nvme_command c; @@ -269,8 +313,6 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u64 result; int status; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; if (copy_from_user(&cmd, ucmd, sizeof(cmd))) return -EFAULT; if (cmd.flags) @@ -291,6 +333,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.cdw14 = cpu_to_le32(cmd.cdw14); c.common.cdw15 = cpu_to_le32(cmd.cdw15); + if (!nvme_cmd_allowed(ns, &c, mode)) + return -EACCES; + if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); @@ -308,15 +353,14 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, } static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd64 __user *ucmd, bool vec) + struct nvme_passthru_cmd64 __user *ucmd, bool vec, + fmode_t mode) { struct nvme_passthru_cmd64 cmd; struct nvme_command c; unsigned timeout = 0; int status; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; if (copy_from_user(&cmd, ucmd, sizeof(cmd))) return -EFAULT; if (cmd.flags) @@ -337,6 +381,9 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.cdw14 = cpu_to_le32(cmd.cdw14); c.common.cdw15 = cpu_to_le32(cmd.cdw15); + if (!nvme_cmd_allowed(ns, &c, mode)) + return -EACCES; + if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); @@ -483,9 +530,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, void *meta = NULL; int ret; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - c.common.opcode = READ_ONCE(cmd->opcode); c.common.flags = READ_ONCE(cmd->flags); if (c.common.flags) @@ -507,6 +551,9 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14)); c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15)); + if (!nvme_cmd_allowed(ns, &c, ioucmd->file->f_mode)) + return -EACCES; + d.metadata = READ_ONCE(cmd->metadata); d.addr = READ_ONCE(cmd->addr); d.data_len = 
READ_ONCE(cmd->data_len); @@ -570,13 +617,13 @@ static bool is_ctrl_ioctl(unsigned int cmd) } static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd, - void __user *argp) + void __user *argp, fmode_t mode) { switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ctrl, NULL, argp); + return nvme_user_cmd(ctrl, NULL, argp, mode); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp, false); + return nvme_user_cmd64(ctrl, NULL, argp, false, mode); default: return sed_ioctl(ctrl->opal_dev, cmd, argp); } @@ -601,14 +648,14 @@ struct nvme_user_io32 { #endif /* COMPAT_FOR_U64_ALIGNMENT */ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp) + void __user *argp, fmode_t mode) { switch (cmd) { case NVME_IOCTL_ID: force_successful_syscall_return(); return ns->head->ns_id; case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(ns->ctrl, ns, argp); + return nvme_user_cmd(ns->ctrl, ns, argp, mode); /* * struct nvme_user_io can have different padding on some 32-bit ABIs. * Just accept the compat version as all fields that are used are the @@ -620,19 +667,20 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, argp); case NVME_IOCTL_IO64_CMD: - return nvme_user_cmd64(ns->ctrl, ns, argp, false); + return nvme_user_cmd64(ns->ctrl, ns, argp, false, mode); case NVME_IOCTL_IO64_CMD_VEC: - return nvme_user_cmd64(ns->ctrl, ns, argp, true); + return nvme_user_cmd64(ns->ctrl, ns, argp, true, mode); default: return -ENOTTY; } } -static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg) +static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg, + fmode_t mode) { - if (is_ctrl_ioctl(cmd)) - return nvme_ctrl_ioctl(ns->ctrl, cmd, arg); - return nvme_ns_ioctl(ns, cmd, arg); + if (is_ctrl_ioctl(cmd)) + return nvme_ctrl_ioctl(ns->ctrl, cmd, arg, mode); + return nvme_ns_ioctl(ns, cmd, arg, mode); } int nvme_ioctl(struct block_device *bdev, fmode_t mode, @@ -640,7 +688,7 @@ int nvme_ioctl(struct block_device *bdev, fmode_t mode, { struct nvme_ns *ns = bdev->bd_disk->private_data; - return __nvme_ioctl(ns, cmd, (void __user *)arg); + return __nvme_ioctl(ns, cmd, (void __user *)arg, mode); } long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) @@ -648,7 +696,7 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct nvme_ns *ns = container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev); - return __nvme_ioctl(ns, cmd, (void __user *)arg); + return __nvme_ioctl(ns, cmd, (void __user *)arg, file->f_mode); } static int nvme_uring_cmd_checks(unsigned int issue_flags) @@ -716,7 +764,8 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, } #ifdef CONFIG_NVME_MULTIPATH static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp, struct nvme_ns_head *head, int srcu_idx) + void __user *argp, struct nvme_ns_head *head, int srcu_idx, + fmode_t mode) __releases(&head->srcu) { struct nvme_ctrl *ctrl = ns->ctrl; @@ -724,7 +773,7 @@ static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, nvme_get_ctrl(ns->ctrl); srcu_read_unlock(&head->srcu, srcu_idx); - ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp); + ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp, mode); nvme_put_ctrl(ctrl); return ret; @@ -749,9 +798,10 @@ int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, * deadlock when deleting namespaces using the passthrough interface. 
*/ if (is_ctrl_ioctl(cmd)) - return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); + return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, + mode); - ret = nvme_ns_ioctl(ns, cmd, argp); + ret = nvme_ns_ioctl(ns, cmd, argp, mode); out_unlock: srcu_read_unlock(&head->srcu, srcu_idx); return ret; @@ -773,9 +823,10 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, goto out_unlock; if (is_ctrl_ioctl(cmd)) - return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); + return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, + file->f_mode); - ret = nvme_ns_ioctl(ns, cmd, argp); + ret = nvme_ns_ioctl(ns, cmd, argp, file->f_mode); out_unlock: srcu_read_unlock(&head->srcu, srcu_idx); return ret; @@ -849,7 +900,8 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) return ret; } -static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) +static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, + fmode_t mode) { struct nvme_ns *ns; int ret; @@ -873,7 +925,7 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) kref_get(&ns->kref); up_read(&ctrl->namespaces_rwsem); - ret = nvme_user_cmd(ctrl, ns, argp); + ret = nvme_user_cmd(ctrl, ns, argp, mode); nvme_put_ns(ns); return ret; @@ -890,11 +942,11 @@ long nvme_dev_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ctrl, NULL, argp); + return nvme_user_cmd(ctrl, NULL, argp, file->f_mode); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp, false); + return nvme_user_cmd64(ctrl, NULL, argp, false, file->f_mode); case NVME_IOCTL_IO_CMD: - return nvme_dev_user_cmd(ctrl, argp); + return nvme_dev_user_cmd(ctrl, argp, file->f_mode); case NVME_IOCTL_RESET: if (!capable(CAP_SYS_ADMIN)) return -EACCES; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 93e2138a8b42..c03093b6813c 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -114,6 +114,31 @@ void nvme_failover_req(struct request *req) kblockd_schedule_work(&ns->head->requeue_work); } +void nvme_mpath_start_request(struct request *rq) +{ + struct nvme_ns *ns = rq->q->queuedata; + struct gendisk *disk = ns->head->disk; + + if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq)) + return; + + nvme_req(rq)->flags |= NVME_MPATH_IO_STATS; + nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, + blk_rq_bytes(rq) >> SECTOR_SHIFT, + req_op(rq), jiffies); +} +EXPORT_SYMBOL_GPL(nvme_mpath_start_request); + +void nvme_mpath_end_request(struct request *rq) +{ + struct nvme_ns *ns = rq->q->queuedata; + + if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS)) + return; + bdev_end_io_acct(ns->head->disk->part0, req_op(rq), + nvme_req(rq)->start_time); +} + void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; @@ -174,11 +199,14 @@ void nvme_mpath_revalidate_paths(struct nvme_ns *ns) struct nvme_ns_head *head = ns->head; sector_t capacity = get_capacity(head->disk); int node; + int srcu_idx; + srcu_idx = srcu_read_lock(&head->srcu); list_for_each_entry_rcu(ns, &head->list, siblings) { if (capacity != get_capacity(ns->disk)) clear_bit(NVME_NS_READY, &ns->flags); } + srcu_read_unlock(&head->srcu, srcu_idx); for_each_node(node) rcu_assign_pointer(head->current_path[node], NULL); @@ -503,6 +531,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue); 
blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue); + blk_queue_flag_set(QUEUE_FLAG_IO_STAT, head->disk->queue); /* * This assumes all controllers that refer to a namespace either * support poll queues or not. That is not a strict guarantee, diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a29877217ee6..6bbb73ef8b25 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -162,6 +162,9 @@ struct nvme_request { u8 retries; u8 flags; u16 status; +#ifdef CONFIG_NVME_MULTIPATH + unsigned long start_time; +#endif struct nvme_ctrl *ctrl; }; @@ -173,6 +176,7 @@ struct nvme_request { enum { NVME_REQ_CANCELLED = (1 << 0), NVME_REQ_USERCMD = (1 << 1), + NVME_MPATH_IO_STATS = (1 << 2), }; static inline struct nvme_request *nvme_req(struct request *req) @@ -237,6 +241,7 @@ enum nvme_ctrl_flags { NVME_CTRL_FAILFAST_EXPIRED = 0, NVME_CTRL_ADMIN_Q_STOPPED = 1, NVME_CTRL_STARTED_ONCE = 2, + NVME_CTRL_STOPPED = 3, }; struct nvme_ctrl { @@ -336,8 +341,8 @@ struct nvme_ctrl { #ifdef CONFIG_NVME_AUTH struct work_struct dhchap_auth_work; - struct list_head dhchap_auth_list; struct mutex dhchap_auth_mutex; + struct nvme_dhchap_queue_context *dhchap_ctxs; struct nvme_dhchap_key *host_key; struct nvme_dhchap_key *ctrl_key; u16 transaction; @@ -454,6 +459,7 @@ static inline bool nvme_ns_head_multipath(struct nvme_ns_head *head) enum nvme_ns_features { NVME_NS_EXT_LBAS = 1 << 0, /* support extended LBA format */ NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */ + NVME_NS_DEAC = 1 << 2, /* DEAC bit in Write Zeroes supported */ }; struct nvme_ns { @@ -483,11 +489,9 @@ struct nvme_ns { unsigned long features; unsigned long flags; #define NVME_NS_REMOVING 0 -#define NVME_NS_DEAD 1 #define NVME_NS_ANA_PENDING 2 #define NVME_NS_FORCE_RO 3 #define NVME_NS_READY 4 -#define NVME_NS_STOPPED 5 struct cdev cdev; struct device cdev_device; @@ -508,6 +512,9 @@ struct nvme_ctrl_ops { unsigned int flags; #define NVME_F_FABRICS (1 << 0) #define NVME_F_METADATA_SUPPORTED (1 << 1) +#define NVME_F_BLOCKING (1 << 2) + + const struct attribute_group **dev_attr_groups; int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); @@ -728,37 +735,32 @@ void nvme_cancel_tagset(struct nvme_ctrl *ctrl); void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl); bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, enum nvme_ctrl_state new_state); -int nvme_disable_ctrl(struct nvme_ctrl *ctrl); +int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown); int nvme_enable_ctrl(struct nvme_ctrl *ctrl); -int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, const struct nvme_ctrl_ops *ops, unsigned long quirks); void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); void nvme_start_ctrl(struct nvme_ctrl *ctrl); void nvme_stop_ctrl(struct nvme_ctrl *ctrl); -int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl); +int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended); int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, - const struct blk_mq_ops *ops, unsigned int flags, - unsigned int cmd_size); + const struct blk_mq_ops *ops, unsigned int cmd_size); void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl); int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, - const struct blk_mq_ops *ops, unsigned int flags, + const struct blk_mq_ops *ops, unsigned int nr_maps,
unsigned int cmd_size); void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); -int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, - bool send); - void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, volatile union nvme_result *res); -void nvme_stop_queues(struct nvme_ctrl *ctrl); -void nvme_start_queues(struct nvme_ctrl *ctrl); -void nvme_stop_admin_queue(struct nvme_ctrl *ctrl); -void nvme_start_admin_queue(struct nvme_ctrl *ctrl); -void nvme_kill_queues(struct nvme_ctrl *ctrl); +void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl); +void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl); +void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl); +void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl); +void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl); void nvme_sync_queues(struct nvme_ctrl *ctrl); void nvme_sync_io_queues(struct nvme_ctrl *ctrl); void nvme_unfreeze(struct nvme_ctrl *ctrl); @@ -857,6 +859,7 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); extern const struct attribute_group *nvme_ns_id_attr_groups[]; extern const struct pr_ops nvme_pr_ops; extern const struct block_device_operations nvme_ns_head_ops; +extern const struct attribute_group nvme_dev_attrs_group; struct nvme_ns *nvme_find_path(struct nvme_ns_head *head); #ifdef CONFIG_NVME_MULTIPATH @@ -883,6 +886,8 @@ bool nvme_mpath_clear_current_path(struct nvme_ns *ns); void nvme_mpath_revalidate_paths(struct nvme_ns *ns); void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl); void nvme_mpath_shutdown_disk(struct nvme_ns_head *head); +void nvme_mpath_start_request(struct request *rq); +void nvme_mpath_end_request(struct request *rq); static inline void nvme_trace_bio_complete(struct request *req) { @@ -968,6 +973,12 @@ static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) static inline void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys) { } +static inline void nvme_mpath_start_request(struct request *rq) +{ +} +static inline void nvme_mpath_end_request(struct request *rq) +{ +} #endif /* CONFIG_NVME_MULTIPATH */ int nvme_revalidate_zones(struct nvme_ns *ns); @@ -1013,20 +1024,38 @@ static inline void nvme_hwmon_exit(struct nvme_ctrl *ctrl) } #endif +static inline void nvme_start_request(struct request *rq) +{ + if (rq->cmd_flags & REQ_NVME_MPATH) + nvme_mpath_start_request(rq); + blk_mq_start_request(rq); +} + static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl) { return ctrl->sgls & ((1 << 0) | (1 << 1)); } #ifdef CONFIG_NVME_AUTH -void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl); +int __init nvme_init_auth(void); +void __exit nvme_exit_auth(void); +int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl); void nvme_auth_stop(struct nvme_ctrl *ctrl); int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid); int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid); -void nvme_auth_reset(struct nvme_ctrl *ctrl); void nvme_auth_free(struct nvme_ctrl *ctrl); #else -static inline void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) {}; +static inline int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) +{ + return 0; +} +static inline int __init nvme_init_auth(void) +{ + return 0; +} +static inline void __exit nvme_exit_auth(void) +{ +} static inline void nvme_auth_stop(struct nvme_ctrl *ctrl) {}; static inline int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) { diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f4335519399d..f0f8027644bb 100644 --- 
a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -15,6 +15,7 @@ #include <linux/init.h> #include <linux/interrupt.h> #include <linux/io.h> +#include <linux/kstrtox.h> #include <linux/memremap.h> #include <linux/mm.h> #include <linux/module.h> @@ -108,7 +109,7 @@ struct nvme_dev; struct nvme_queue; static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); -static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode); +static void nvme_delete_io_queues(struct nvme_dev *dev); /* * Represents an NVM Express device. Each nvme_dev is a PCI function. @@ -130,7 +131,6 @@ struct nvme_dev { u32 db_stride; void __iomem *bar; unsigned long bar_mapped_size; - struct work_struct remove_work; struct mutex shutdown_lock; bool subsystem; u64 cmb_size; @@ -158,8 +158,6 @@ struct nvme_dev { unsigned int nr_allocated_queues; unsigned int nr_write_queues; unsigned int nr_poll_queues; - - bool attrs_added; }; static int io_queue_depth_set(const char *val, const struct kernel_param *kp) @@ -241,10 +239,13 @@ static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev) return dev->nr_allocated_queues * 8 * dev->db_stride; } -static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) +static void nvme_dbbuf_dma_alloc(struct nvme_dev *dev) { unsigned int mem_size = nvme_dbbuf_size(dev); + if (!(dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP)) + return; + if (dev->dbbuf_dbs) { /* * Clear the dbbuf memory so the driver doesn't observe stale @@ -252,25 +253,27 @@ static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) */ memset(dev->dbbuf_dbs, 0, mem_size); memset(dev->dbbuf_eis, 0, mem_size); - return 0; + return; } dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size, &dev->dbbuf_dbs_dma_addr, GFP_KERNEL); if (!dev->dbbuf_dbs) - return -ENOMEM; + goto fail; dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size, &dev->dbbuf_eis_dma_addr, GFP_KERNEL); - if (!dev->dbbuf_eis) { - dma_free_coherent(dev->dev, mem_size, - dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr); - dev->dbbuf_dbs = NULL; - return -ENOMEM; - } + if (!dev->dbbuf_eis) + goto fail_free_dbbuf_dbs; + return; - return 0; +fail_free_dbbuf_dbs: + dma_free_coherent(dev->dev, mem_size, dev->dbbuf_dbs, + dev->dbbuf_dbs_dma_addr); + dev->dbbuf_dbs = NULL; +fail: + dev_warn(dev->dev, "unable to allocate dma for dbbuf\n"); } static void nvme_dbbuf_dma_free(struct nvme_dev *dev) @@ -392,18 +395,10 @@ static int nvme_pci_npages_sgl(void) PAGE_SIZE); } -static size_t nvme_pci_iod_alloc_size(void) -{ - size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl()); - - return sizeof(__le64 *) * npages + - sizeof(struct scatterlist) * NVME_MAX_SEGS; -} - static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { - struct nvme_dev *dev = data; + struct nvme_dev *dev = to_nvme_dev(data); struct nvme_queue *nvmeq = &dev->queues[0]; WARN_ON(hctx_idx != 0); @@ -416,7 +411,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { - struct nvme_dev *dev = data; + struct nvme_dev *dev = to_nvme_dev(data); struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1]; WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags); @@ -428,7 +423,7 @@ static int nvme_pci_init_request(struct blk_mq_tag_set *set, struct request *req, unsigned int hctx_idx, unsigned int numa_node) { - struct nvme_dev *dev = set->driver_data; + struct nvme_dev *dev = to_nvme_dev(set->driver_data); struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 
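The to_nvme_dev(set->driver_data) and to_nvme_dev(data) conversions above reflect that the shared tag-set helpers now store the nvme_ctrl pointer in driver_data rather than the driver-private structure; the PCI driver recovers its nvme_dev via container_of, matching the helper that already exists in pci.c:

static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
{
	return container_of(ctrl, struct nvme_dev, ctrl);
}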
nvme_req(req)->ctrl = &dev->ctrl; @@ -447,7 +442,7 @@ static int queue_irq_offset(struct nvme_dev *dev) static void nvme_pci_map_queues(struct blk_mq_tag_set *set) { - struct nvme_dev *dev = set->driver_data; + struct nvme_dev *dev = to_nvme_dev(set->driver_data); int i, qoff, offset; offset = queue_irq_offset(dev); @@ -797,6 +792,8 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev, cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma); if (bv->bv_len > first_prp_len) cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len); + else + cmnd->dptr.prp2 = 0; return BLK_STS_OK; } @@ -912,7 +909,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) goto out_unmap_data; } - blk_mq_start_request(req); + nvme_start_request(req); return BLK_STS_OK; out_unmap_data: nvme_unmap_data(dev, req); @@ -1472,24 +1469,21 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) } } -/** - * nvme_suspend_queue - put queue into suspended state - * @nvmeq: queue to suspend - */ -static int nvme_suspend_queue(struct nvme_queue *nvmeq) +static void nvme_suspend_queue(struct nvme_dev *dev, unsigned int qid) { + struct nvme_queue *nvmeq = &dev->queues[qid]; + if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags)) - return 1; + return; /* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */ mb(); nvmeq->dev->online_queues--; if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) - nvme_stop_admin_queue(&nvmeq->dev->ctrl); + nvme_quiesce_admin_queue(&nvmeq->dev->ctrl); if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags)) - pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq); - return 0; + pci_free_irq(to_pci_dev(dev->dev), nvmeq->cq_vector, nvmeq); } static void nvme_suspend_io_queues(struct nvme_dev *dev) @@ -1497,19 +1491,7 @@ static void nvme_suspend_io_queues(struct nvme_dev *dev) int i; for (i = dev->ctrl.queue_count - 1; i > 0; i--) - nvme_suspend_queue(&dev->queues[i]); -} - -static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) -{ - struct nvme_queue *nvmeq = &dev->queues[0]; - - if (shutdown) - nvme_shutdown_ctrl(&dev->ctrl); - else - nvme_disable_ctrl(&dev->ctrl); - - nvme_poll_irqdisable(nvmeq); + nvme_suspend_queue(dev, i); } /* @@ -1746,44 +1728,11 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev) * user requests may be waiting on a stopped queue. Start the * queue to flush these to completion. 
*/ - nvme_start_admin_queue(&dev->ctrl); - blk_mq_destroy_queue(dev->ctrl.admin_q); - blk_mq_free_tag_set(&dev->admin_tagset); + nvme_unquiesce_admin_queue(&dev->ctrl); + nvme_remove_admin_tag_set(&dev->ctrl); } } -static int nvme_pci_alloc_admin_tag_set(struct nvme_dev *dev) -{ - struct blk_mq_tag_set *set = &dev->admin_tagset; - - set->ops = &nvme_mq_admin_ops; - set->nr_hw_queues = 1; - - set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; - set->timeout = NVME_ADMIN_TIMEOUT; - set->numa_node = dev->ctrl.numa_node; - set->cmd_size = sizeof(struct nvme_iod); - set->flags = BLK_MQ_F_NO_SCHED; - set->driver_data = dev; - - if (blk_mq_alloc_tag_set(set)) - return -ENOMEM; - dev->ctrl.admin_tagset = set; - - dev->ctrl.admin_q = blk_mq_init_queue(set); - if (IS_ERR(dev->ctrl.admin_q)) { - blk_mq_free_tag_set(set); - dev->ctrl.admin_q = NULL; - return -ENOMEM; - } - if (!blk_get_queue(dev->ctrl.admin_q)) { - nvme_dev_remove_admin(dev); - dev->ctrl.admin_q = NULL; - return -ENODEV; - } - return 0; -} - static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) { return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride); @@ -1827,7 +1776,14 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); - result = nvme_disable_ctrl(&dev->ctrl); + /* + * If the device has been passed off to us in an enabled state, just + * clear the enabled bit. The spec says we should set the 'shutdown + * notification bits', but doing so may cause the device to complete + * commands to the admin queue ... and we don't know what memory that + * might be pointing at! + */ + result = nvme_disable_ctrl(&dev->ctrl, false); if (result < 0) return result; @@ -2110,6 +2066,9 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) u32 enable_bits = NVME_HOST_MEM_ENABLE; int ret; + if (!dev->ctrl.hmpre) + return 0; + preferred = min(preferred, max); if (min > max) { dev_warn(dev->ctrl.device, @@ -2190,7 +2149,7 @@ static ssize_t hmb_store(struct device *dev, struct device_attribute *attr, bool new; int ret; - if (strtobool(buf, &new) < 0) + if (kstrtobool(buf, &new) < 0) return -EINVAL; if (new == ndev->hmb) @@ -2238,11 +2197,17 @@ static struct attribute *nvme_pci_attrs[] = { NULL, }; -static const struct attribute_group nvme_pci_attr_group = { +static const struct attribute_group nvme_pci_dev_attrs_group = { .attrs = nvme_pci_attrs, .is_visible = nvme_pci_attrs_are_visible, }; +static const struct attribute_group *nvme_pci_dev_attr_groups[] = { + &nvme_dev_attrs_group, + &nvme_pci_dev_attrs_group, + NULL, +}; + /* * nirqs is the number of interrupts available for write and read * queues. The core already reserved an interrupt for the admin queue. 
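With nvme_pci_dev_attr_groups[] chaining the generic nvme_dev_attrs_group in front of the PCI-specific group, the driver no longer has to register sysfs attributes by hand from the reset path; it only points the new dev_attr_groups field of nvme_ctrl_ops at the array. A sketch of how the core side is assumed to consume it when creating the controller device (the nvme_dev_attr_groups fallback name is an assumption here):

	/* in nvme_init_ctrl(), roughly: */
	ctrl->device->groups = ops->dev_attr_groups ?
		ops->dev_attr_groups : nvme_dev_attr_groups;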
@@ -2317,12 +2282,6 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); } -static void nvme_disable_io_queues(struct nvme_dev *dev) -{ - if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq)) - __nvme_disable_io_queues(dev, nvme_admin_delete_cq); -} - static unsigned int nvme_max_io_queues(struct nvme_dev *dev) { /* @@ -2430,7 +2389,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (dev->online_queues - 1 < dev->max_qid) { nr_io_queues = dev->online_queues - 1; - nvme_disable_io_queues(dev); + nvme_delete_io_queues(dev); result = nvme_setup_io_queues_trylock(dev); if (result) return result; @@ -2493,7 +2452,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) return 0; } -static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode) +static bool __nvme_delete_io_queues(struct nvme_dev *dev, u8 opcode) { int nr_queues = dev->online_queues - 1, sent = 0; unsigned long timeout; @@ -2521,40 +2480,19 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode) return true; } -static void nvme_pci_alloc_tag_set(struct nvme_dev *dev) +static void nvme_delete_io_queues(struct nvme_dev *dev) { - struct blk_mq_tag_set * set = &dev->tagset; - int ret; + if (__nvme_delete_io_queues(dev, nvme_admin_delete_sq)) + __nvme_delete_io_queues(dev, nvme_admin_delete_cq); +} - set->ops = &nvme_mq_ops; - set->nr_hw_queues = dev->online_queues - 1; - set->nr_maps = 1; - if (dev->io_queues[HCTX_TYPE_READ]) - set->nr_maps = 2; +static unsigned int nvme_pci_nr_maps(struct nvme_dev *dev) +{ if (dev->io_queues[HCTX_TYPE_POLL]) - set->nr_maps = 3; - set->timeout = NVME_IO_TIMEOUT; - set->numa_node = dev->ctrl.numa_node; - set->queue_depth = min_t(unsigned, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; - set->cmd_size = sizeof(struct nvme_iod); - set->flags = BLK_MQ_F_SHOULD_MERGE; - set->driver_data = dev; - - /* - * Some Apple controllers requires tags to be unique - * across admin and IO queue, so reserve the first 32 - * tags of the IO queue. 
- */ - if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) - set->reserved_tags = NVME_AQ_DEPTH; - - ret = blk_mq_alloc_tag_set(set); - if (ret) { - dev_warn(dev->ctrl.device, - "IO queues tagset allocation failed %d\n", ret); - return; - } - dev->ctrl.tagset = set; + return 3; + if (dev->io_queues[HCTX_TYPE_READ]) + return 2; + return 1; } static void nvme_pci_update_nr_queues(struct nvme_dev *dev) @@ -2645,7 +2583,8 @@ static int nvme_pci_enable(struct nvme_dev *dev) pci_enable_pcie_error_reporting(pdev); pci_save_state(pdev); - return 0; + + return nvme_pci_configure_admin_queue(dev); disable: pci_disable_device(pdev); @@ -2659,57 +2598,53 @@ static void nvme_dev_unmap(struct nvme_dev *dev) pci_release_mem_regions(to_pci_dev(dev->dev)); } -static void nvme_pci_disable(struct nvme_dev *dev) +static bool nvme_pci_ctrl_is_dead(struct nvme_dev *dev) { struct pci_dev *pdev = to_pci_dev(dev->dev); + u32 csts; - pci_free_irq_vectors(pdev); + if (!pci_is_enabled(pdev) || !pci_device_is_present(pdev)) + return true; + if (pdev->error_state != pci_channel_io_normal) + return true; - if (pci_is_enabled(pdev)) { - pci_disable_pcie_error_reporting(pdev); - pci_disable_device(pdev); - } + csts = readl(dev->bar + NVME_REG_CSTS); + return (csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY); } static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) { - bool dead = true, freeze = false; struct pci_dev *pdev = to_pci_dev(dev->dev); + bool dead; mutex_lock(&dev->shutdown_lock); - if (pci_is_enabled(pdev)) { - u32 csts; - - if (pci_device_is_present(pdev)) - csts = readl(dev->bar + NVME_REG_CSTS); - else - csts = ~0; - - if (dev->ctrl.state == NVME_CTRL_LIVE || - dev->ctrl.state == NVME_CTRL_RESETTING) { - freeze = true; + dead = nvme_pci_ctrl_is_dead(dev); + if (dev->ctrl.state == NVME_CTRL_LIVE || + dev->ctrl.state == NVME_CTRL_RESETTING) { + if (pci_is_enabled(pdev)) nvme_start_freeze(&dev->ctrl); - } - dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) || - pdev->error_state != pci_channel_io_normal); + /* + * Give the controller a chance to complete all entered requests + * if doing a safe shutdown. + */ + if (!dead && shutdown) + nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); } - /* - * Give the controller a chance to complete all entered requests if - * doing a safe shutdown. - */ - if (!dead && shutdown && freeze) - nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); - - nvme_stop_queues(&dev->ctrl); + nvme_quiesce_io_queues(&dev->ctrl); if (!dead && dev->ctrl.queue_count > 0) { - nvme_disable_io_queues(dev); - nvme_disable_admin_queue(dev, shutdown); + nvme_delete_io_queues(dev); + nvme_disable_ctrl(&dev->ctrl, shutdown); + nvme_poll_irqdisable(&dev->queues[0]); } nvme_suspend_io_queues(dev); - nvme_suspend_queue(&dev->queues[0]); - nvme_pci_disable(dev); + nvme_suspend_queue(dev, 0); + pci_free_irq_vectors(pdev); + if (pci_is_enabled(pdev)) { + pci_disable_pcie_error_reporting(pdev); + pci_disable_device(pdev); + } nvme_reap_pending_cqes(dev); nvme_cancel_tagset(&dev->ctrl); @@ -2721,9 +2656,9 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) * deadlocking blk-mq hot-cpu notifier. 
*/ if (shutdown) { - nvme_start_queues(&dev->ctrl); + nvme_unquiesce_io_queues(&dev->ctrl); if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) - nvme_start_admin_queue(&dev->ctrl); + nvme_unquiesce_admin_queue(&dev->ctrl); } mutex_unlock(&dev->shutdown_lock); } @@ -2760,42 +2695,40 @@ static void nvme_release_prp_pools(struct nvme_dev *dev) dma_pool_destroy(dev->prp_small_pool); } +static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) +{ + size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl()); + size_t alloc_size = sizeof(__le64 *) * npages + + sizeof(struct scatterlist) * NVME_MAX_SEGS; + + WARN_ON_ONCE(alloc_size > PAGE_SIZE); + dev->iod_mempool = mempool_create_node(1, + mempool_kmalloc, mempool_kfree, + (void *)alloc_size, GFP_KERNEL, + dev_to_node(dev->dev)); + if (!dev->iod_mempool) + return -ENOMEM; + return 0; +} + static void nvme_free_tagset(struct nvme_dev *dev) { if (dev->tagset.tags) - blk_mq_free_tag_set(&dev->tagset); + nvme_remove_io_tag_set(&dev->ctrl); dev->ctrl.tagset = NULL; } +/* pairs with nvme_pci_alloc_dev */ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) { struct nvme_dev *dev = to_nvme_dev(ctrl); - nvme_dbbuf_dma_free(dev); nvme_free_tagset(dev); - if (dev->ctrl.admin_q) - blk_put_queue(dev->ctrl.admin_q); - free_opal_dev(dev->ctrl.opal_dev); - mempool_destroy(dev->iod_mempool); put_device(dev->dev); kfree(dev->queues); kfree(dev); } -static void nvme_remove_dead_ctrl(struct nvme_dev *dev) -{ - /* - * Set state to deleting now to avoid blocking nvme_wait_reset(), which - * may be holding this pci_dev's device lock. - */ - nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); - nvme_get_ctrl(&dev->ctrl); - nvme_dev_disable(dev, false); - nvme_kill_queues(&dev->ctrl); - if (!queue_work(nvme_wq, &dev->remove_work)) - nvme_put_ctrl(&dev->ctrl); -} - static void nvme_reset_work(struct work_struct *work) { struct nvme_dev *dev = @@ -2806,8 +2739,7 @@ static void nvme_reset_work(struct work_struct *work) if (dev->ctrl.state != NVME_CTRL_RESETTING) { dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n", dev->ctrl.state); - result = -ENODEV; - goto out; + return; } /* @@ -2822,34 +2754,7 @@ static void nvme_reset_work(struct work_struct *work) result = nvme_pci_enable(dev); if (result) goto out_unlock; - - result = nvme_pci_configure_admin_queue(dev); - if (result) - goto out_unlock; - - if (!dev->ctrl.admin_q) { - result = nvme_pci_alloc_admin_tag_set(dev); - if (result) - goto out_unlock; - } else { - nvme_start_admin_queue(&dev->ctrl); - } - - dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1); - - /* - * Limit the max command size to prevent iod->sg allocations going - * over a single page. - */ - dev->ctrl.max_hw_sectors = min_t(u32, - NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9); - dev->ctrl.max_segments = NVME_MAX_SEGS; - - /* - * Don't limit the IOMMU merged segment size. - */ - dma_set_max_seg_size(dev->dev, 0xffffffff); - + nvme_unquiesce_admin_queue(&dev->ctrl); mutex_unlock(&dev->shutdown_lock); /* @@ -2863,62 +2768,37 @@ static void nvme_reset_work(struct work_struct *work) goto out; } - /* - * We do not support an SGL for metadata (yet), so we are limited to a - * single integrity segment for the separate metadata pointer. 
-	 */
-	dev->ctrl.max_integrity_segments = 1;
-
-	result = nvme_init_ctrl_finish(&dev->ctrl);
+	result = nvme_init_ctrl_finish(&dev->ctrl, was_suspend);
 	if (result)
 		goto out;
 
-	if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
-		if (!dev->ctrl.opal_dev)
-			dev->ctrl.opal_dev =
-				init_opal_dev(&dev->ctrl, &nvme_sec_submit);
-		else if (was_suspend)
-			opal_unlock_from_suspend(dev->ctrl.opal_dev);
-	} else {
-		free_opal_dev(dev->ctrl.opal_dev);
-		dev->ctrl.opal_dev = NULL;
-	}
+	nvme_dbbuf_dma_alloc(dev);
 
-	if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
-		result = nvme_dbbuf_dma_alloc(dev);
-		if (result)
-			dev_warn(dev->dev,
-				 "unable to allocate dma for dbbuf\n");
-	}
-
-	if (dev->ctrl.hmpre) {
-		result = nvme_setup_host_mem(dev);
-		if (result < 0)
-			goto out;
-	}
+	result = nvme_setup_host_mem(dev);
+	if (result < 0)
+		goto out;
 
 	result = nvme_setup_io_queues(dev);
 	if (result)
 		goto out;
 
 	/*
-	 * Keep the controller around but remove all namespaces if we don't have
-	 * any working I/O queue.
+	 * Freeze and update the number of I/O queues as those might have
+	 * changed.  If there are no I/O queues left after this reset, keep the
+	 * controller around but remove all namespaces.
 	 */
-	if (dev->online_queues < 2) {
-		dev_warn(dev->ctrl.device, "IO queues not created\n");
-		nvme_kill_queues(&dev->ctrl);
-		nvme_remove_namespaces(&dev->ctrl);
-		nvme_free_tagset(dev);
-	} else {
-		nvme_start_queues(&dev->ctrl);
+	if (dev->online_queues > 1) {
+		nvme_unquiesce_io_queues(&dev->ctrl);
 		nvme_wait_freeze(&dev->ctrl);
-		if (!dev->ctrl.tagset)
-			nvme_pci_alloc_tag_set(dev);
-		else
-			nvme_pci_update_nr_queues(dev);
+		nvme_pci_update_nr_queues(dev);
 		nvme_dbbuf_set(dev);
 		nvme_unfreeze(&dev->ctrl);
+	} else {
+		dev_warn(dev->ctrl.device, "IO queues lost\n");
+		nvme_mark_namespaces_dead(&dev->ctrl);
+		nvme_unquiesce_io_queues(&dev->ctrl);
+		nvme_remove_namespaces(&dev->ctrl);
+		nvme_free_tagset(dev);
 	}
 
 	/*
@@ -2932,30 +2812,22 @@ static void nvme_reset_work(struct work_struct *work)
 		goto out;
 	}
 
-	if (!dev->attrs_added && !sysfs_create_group(&dev->ctrl.device->kobj,
-			&nvme_pci_attr_group))
-		dev->attrs_added = true;
-
 	nvme_start_ctrl(&dev->ctrl);
 	return;
 
  out_unlock:
 	mutex_unlock(&dev->shutdown_lock);
  out:
-	if (result)
-		dev_warn(dev->ctrl.device,
-			 "Removing after probe failure status: %d\n", result);
-	nvme_remove_dead_ctrl(dev);
-}
-
-static void nvme_remove_dead_ctrl_work(struct work_struct *work)
-{
-	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
-	struct pci_dev *pdev = to_pci_dev(dev->dev);
-
-	if (pci_get_drvdata(pdev))
-		device_release_driver(&pdev->dev);
-	nvme_put_ctrl(&dev->ctrl);
+	/*
+	 * Set state to deleting now to avoid blocking nvme_wait_reset(), which
+	 * may be holding this pci_dev's device lock.
+ */ + dev_warn(dev->ctrl.device, "Disabling device after reset failure: %d\n", + result); + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); + nvme_dev_disable(dev, true); + nvme_mark_namespaces_dead(&dev->ctrl); + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD); } static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) @@ -3008,6 +2880,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .name = "pcie", .module = THIS_MODULE, .flags = NVME_F_METADATA_SUPPORTED, + .dev_attr_groups = nvme_pci_dev_attr_groups, .reg_read32 = nvme_pci_reg_read32, .reg_write32 = nvme_pci_reg_write32, .reg_read64 = nvme_pci_reg_read64, @@ -3077,29 +2950,22 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) return 0; } -static void nvme_async_probe(void *data, async_cookie_t cookie) +static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, + const struct pci_device_id *id) { - struct nvme_dev *dev = data; - - flush_work(&dev->ctrl.reset_work); - flush_work(&dev->ctrl.scan_work); - nvme_put_ctrl(&dev->ctrl); -} - -static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) -{ - int node, result = -ENOMEM; - struct nvme_dev *dev; unsigned long quirks = id->driver_data; - size_t alloc_size; + int node = dev_to_node(&pdev->dev); + struct nvme_dev *dev; + int ret = -ENOMEM; - node = dev_to_node(&pdev->dev); if (node == NUMA_NO_NODE) set_dev_node(&pdev->dev, first_memory_node); dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) - return -ENOMEM; + return NULL; + INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); + mutex_init(&dev->shutdown_lock); dev->nr_write_queues = write_queues; dev->nr_poll_queues = poll_queues; @@ -3107,25 +2973,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev->queues = kcalloc_node(dev->nr_allocated_queues, sizeof(struct nvme_queue), GFP_KERNEL, node); if (!dev->queues) - goto free; + goto out_free_dev; dev->dev = get_device(&pdev->dev); - pci_set_drvdata(pdev, dev); - - result = nvme_dev_map(dev); - if (result) - goto put_pci; - - INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); - INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); - mutex_init(&dev->shutdown_lock); - - result = nvme_setup_prp_pools(dev); - if (result) - goto unmap; quirks |= check_vendor_combination_bug(pdev); - if (!noacpi && acpi_storage_d3(&pdev->dev)) { /* * Some systems use a bios work around to ask for D3 on @@ -3135,46 +2987,131 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) "platform quirk: setting simple suspend\n"); quirks |= NVME_QUIRK_SIMPLE_SUSPEND; } + ret = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, + quirks); + if (ret) + goto out_put_device; + + dma_set_min_align_mask(&pdev->dev, NVME_CTRL_PAGE_SIZE - 1); + dma_set_max_seg_size(&pdev->dev, 0xffffffff); /* - * Double check that our mempool alloc size will cover the biggest - * command we support. + * Limit the max command size to prevent iod->sg allocations going + * over a single page. 
*/ - alloc_size = nvme_pci_iod_alloc_size(); - WARN_ON_ONCE(alloc_size > PAGE_SIZE); + dev->ctrl.max_hw_sectors = min_t(u32, + NVME_MAX_KB_SZ << 1, dma_max_mapping_size(&pdev->dev) >> 9); + dev->ctrl.max_segments = NVME_MAX_SEGS; - dev->iod_mempool = mempool_create_node(1, mempool_kmalloc, - mempool_kfree, - (void *) alloc_size, - GFP_KERNEL, node); - if (!dev->iod_mempool) { - result = -ENOMEM; - goto release_pools; - } + /* + * There is no support for SGLs for metadata (yet), so we are limited to + * a single integrity segment for the separate metadata pointer. + */ + dev->ctrl.max_integrity_segments = 1; + return dev; + +out_put_device: + put_device(dev->dev); + kfree(dev->queues); +out_free_dev: + kfree(dev); + return ERR_PTR(ret); +} - result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, - quirks); +static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct nvme_dev *dev; + int result = -ENOMEM; + + dev = nvme_pci_alloc_dev(pdev, id); + if (!dev) + return -ENOMEM; + + result = nvme_dev_map(dev); if (result) - goto release_mempool; + goto out_uninit_ctrl; + + result = nvme_setup_prp_pools(dev); + if (result) + goto out_dev_unmap; + + result = nvme_pci_alloc_iod_mempool(dev); + if (result) + goto out_release_prp_pools; dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); - nvme_reset_ctrl(&dev->ctrl); - async_schedule(nvme_async_probe, dev); + result = nvme_pci_enable(dev); + if (result) + goto out_release_iod_mempool; + + result = nvme_alloc_admin_tag_set(&dev->ctrl, &dev->admin_tagset, + &nvme_mq_admin_ops, sizeof(struct nvme_iod)); + if (result) + goto out_disable; + + /* + * Mark the controller as connecting before sending admin commands to + * allow the timeout handler to do the right thing. 
+ */ + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) { + dev_warn(dev->ctrl.device, + "failed to mark controller CONNECTING\n"); + result = -EBUSY; + goto out_disable; + } + + result = nvme_init_ctrl_finish(&dev->ctrl, false); + if (result) + goto out_disable; + + nvme_dbbuf_dma_alloc(dev); + + result = nvme_setup_host_mem(dev); + if (result < 0) + goto out_disable; + + result = nvme_setup_io_queues(dev); + if (result) + goto out_disable; + if (dev->online_queues > 1) { + nvme_alloc_io_tag_set(&dev->ctrl, &dev->tagset, &nvme_mq_ops, + nvme_pci_nr_maps(dev), sizeof(struct nvme_iod)); + nvme_dbbuf_set(dev); + } + + if (!dev->ctrl.tagset) + dev_warn(dev->ctrl.device, "IO queues not created\n"); + + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { + dev_warn(dev->ctrl.device, + "failed to mark controller live state\n"); + result = -ENODEV; + goto out_disable; + } + + pci_set_drvdata(pdev, dev); + + nvme_start_ctrl(&dev->ctrl); + nvme_put_ctrl(&dev->ctrl); return 0; - release_mempool: +out_disable: + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); + nvme_dev_disable(dev, true); + nvme_free_host_mem(dev); + nvme_dev_remove_admin(dev); + nvme_dbbuf_dma_free(dev); + nvme_free_queues(dev, 0); +out_release_iod_mempool: mempool_destroy(dev->iod_mempool); - release_pools: +out_release_prp_pools: nvme_release_prp_pools(dev); - unmap: +out_dev_unmap: nvme_dev_unmap(dev); - put_pci: - put_device(dev->dev); - free: - kfree(dev->queues); - kfree(dev); +out_uninit_ctrl: + nvme_uninit_ctrl(&dev->ctrl); return result; } @@ -3206,13 +3143,6 @@ static void nvme_shutdown(struct pci_dev *pdev) nvme_disable_prepare_reset(dev, true); } -static void nvme_remove_attrs(struct nvme_dev *dev) -{ - if (dev->attrs_added) - sysfs_remove_group(&dev->ctrl.device->kobj, - &nvme_pci_attr_group); -} - /* * The driver's remove may be called on a device in a partially initialized * state. This function must not have any dependencies on the device state in @@ -3234,10 +3164,11 @@ static void nvme_remove(struct pci_dev *pdev) nvme_stop_ctrl(&dev->ctrl); nvme_remove_namespaces(&dev->ctrl); nvme_dev_disable(dev, true); - nvme_remove_attrs(dev); nvme_free_host_mem(dev); nvme_dev_remove_admin(dev); + nvme_dbbuf_dma_free(dev); nvme_free_queues(dev, 0); + mempool_destroy(dev->iod_mempool); nvme_release_prp_pools(dev); nvme_dev_unmap(dev); nvme_uninit_ctrl(&dev->ctrl); @@ -3574,11 +3505,12 @@ static struct pci_driver nvme_driver = { .probe = nvme_probe, .remove = nvme_remove, .shutdown = nvme_shutdown, -#ifdef CONFIG_PM_SLEEP .driver = { - .pm = &nvme_dev_pm_ops, - }, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, +#ifdef CONFIG_PM_SLEEP + .pm = &nvme_dev_pm_ops, #endif + }, .sriov_configure = pci_sriov_configure_simple, .err_handler = &nvme_err_handler, }; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 6e079abb22ee..bbad26b82b56 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -798,7 +798,9 @@ static int nvme_rdma_alloc_tag_set(struct nvme_ctrl *ctrl) NVME_RDMA_METADATA_SGL_SIZE; return nvme_alloc_io_tag_set(ctrl, &to_rdma_ctrl(ctrl)->tag_set, - &nvme_rdma_mq_ops, BLK_MQ_F_SHOULD_MERGE, cmd_size); + &nvme_rdma_mq_ops, + ctrl->opts->nr_poll_queues ? 
HCTX_MAX_TYPES : 2, + cmd_size); } static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl) @@ -846,7 +848,6 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, if (new) { error = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, &nvme_rdma_admin_mq_ops, - BLK_MQ_F_NO_SCHED, sizeof(struct nvme_rdma_request) + NVME_RDMA_DATA_SGL_SIZE); if (error) @@ -869,16 +870,16 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, else ctrl->ctrl.max_integrity_segments = 0; - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); - error = nvme_init_ctrl_finish(&ctrl->ctrl); + error = nvme_init_ctrl_finish(&ctrl->ctrl, false); if (error) goto out_quiesce_queue; return 0; out_quiesce_queue: - nvme_stop_admin_queue(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); out_stop_queue: nvme_rdma_stop_queue(&ctrl->queues[0]); @@ -922,7 +923,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) goto out_cleanup_tagset; if (!new) { - nvme_start_queues(&ctrl->ctrl); + nvme_unquiesce_io_queues(&ctrl->ctrl); if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) { /* * If we timed out waiting for freeze we are likely to @@ -949,7 +950,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) return 0; out_wait_freeze_timed_out: - nvme_stop_queues(&ctrl->ctrl); + nvme_quiesce_io_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); out_cleanup_tagset: @@ -964,12 +965,12 @@ out_free_io_queues: static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, bool remove) { - nvme_stop_admin_queue(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_cancel_admin_tagset(&ctrl->ctrl); if (remove) { - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); nvme_remove_admin_tag_set(&ctrl->ctrl); } nvme_rdma_destroy_admin_queue(ctrl); @@ -980,12 +981,12 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, { if (ctrl->ctrl.queue_count > 1) { nvme_start_freeze(&ctrl->ctrl); - nvme_stop_queues(&ctrl->ctrl); + nvme_quiesce_io_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); nvme_cancel_tagset(&ctrl->ctrl); if (remove) { - nvme_start_queues(&ctrl->ctrl); + nvme_unquiesce_io_queues(&ctrl->ctrl); nvme_remove_io_tag_set(&ctrl->ctrl); } nvme_rdma_free_io_queues(ctrl); @@ -1106,7 +1107,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) destroy_io: if (ctrl->ctrl.queue_count > 1) { - nvme_stop_queues(&ctrl->ctrl); + nvme_quiesce_io_queues(&ctrl->ctrl); nvme_sync_io_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); nvme_cancel_tagset(&ctrl->ctrl); @@ -1115,7 +1116,7 @@ destroy_io: nvme_rdma_free_io_queues(ctrl); } destroy_admin: - nvme_stop_admin_queue(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); blk_sync_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_cancel_admin_tagset(&ctrl->ctrl); @@ -1153,13 +1154,13 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) struct nvme_rdma_ctrl *ctrl = container_of(work, struct nvme_rdma_ctrl, err_work); - nvme_auth_stop(&ctrl->ctrl); nvme_stop_keep_alive(&ctrl->ctrl); flush_work(&ctrl->ctrl.async_event_work); nvme_rdma_teardown_io_queues(ctrl, false); - nvme_start_queues(&ctrl->ctrl); + nvme_unquiesce_io_queues(&ctrl->ctrl); 
nvme_unquiesce_io_queues(&ctrl->ctrl);
nvme_rdma_teardown_admin_queue(ctrl, false); - nvme_start_admin_queue(&ctrl->ctrl); + nvme_unquiesce_admin_queue(&ctrl->ctrl); + nvme_auth_stop(&ctrl->ctrl); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we started ctrl delete */ @@ -2040,7 +2041,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, if (ret) goto unmap_qe; - blk_mq_start_request(rq); + nvme_start_request(rq); if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && queue->pi_support && @@ -2207,11 +2208,8 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) { nvme_rdma_teardown_io_queues(ctrl, shutdown); - nvme_stop_admin_queue(&ctrl->ctrl); - if (shutdown) - nvme_shutdown_ctrl(&ctrl->ctrl); - else - nvme_disable_ctrl(&ctrl->ctrl); + nvme_quiesce_admin_queue(&ctrl->ctrl); + nvme_disable_ctrl(&ctrl->ctrl, shutdown); nvme_rdma_teardown_admin_queue(ctrl, shutdown); } diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 9b47dcb2a7d9..b69b89166b6b 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -301,7 +301,7 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req, if (!iov_iter_count(&req->iter) && req->data_sent < req->data_len) { req->curr_bio = req->curr_bio->bi_next; - nvme_tcp_init_iter(req, WRITE); + nvme_tcp_init_iter(req, ITER_SOURCE); } } @@ -781,7 +781,7 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, nvme_tcp_init_recv_ctx(queue); return -EIO; } - nvme_tcp_init_iter(req, READ); + nvme_tcp_init_iter(req, ITER_DEST); } /* we can read only from what is left in this bio */ @@ -1867,7 +1867,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) if (new) { ret = nvme_alloc_io_tag_set(ctrl, &to_tcp_ctrl(ctrl)->tag_set, &nvme_tcp_mq_ops, - BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING, + ctrl->opts->nr_poll_queues ? 
HCTX_MAX_TYPES : 2, sizeof(struct nvme_tcp_request)); if (ret) goto out_free_io_queues; @@ -1884,7 +1884,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) goto out_cleanup_connect_q; if (!new) { - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) { /* * If we timed out waiting for freeze we are likely to @@ -1911,7 +1911,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) return 0; out_wait_freeze_timed_out: - nvme_stop_queues(ctrl); + nvme_quiesce_io_queues(ctrl); nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); out_cleanup_connect_q: @@ -1942,7 +1942,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) if (new) { error = nvme_alloc_admin_tag_set(ctrl, &to_tcp_ctrl(ctrl)->admin_tag_set, - &nvme_tcp_admin_mq_ops, BLK_MQ_F_BLOCKING, + &nvme_tcp_admin_mq_ops, sizeof(struct nvme_tcp_request)); if (error) goto out_free_queue; @@ -1956,16 +1956,16 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) if (error) goto out_stop_queue; - nvme_start_admin_queue(ctrl); + nvme_unquiesce_admin_queue(ctrl); - error = nvme_init_ctrl_finish(ctrl); + error = nvme_init_ctrl_finish(ctrl, false); if (error) goto out_quiesce_queue; return 0; out_quiesce_queue: - nvme_stop_admin_queue(ctrl); + nvme_quiesce_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); out_stop_queue: nvme_tcp_stop_queue(ctrl, 0); @@ -1981,12 +1981,12 @@ out_free_queue: static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, bool remove) { - nvme_stop_admin_queue(ctrl); + nvme_quiesce_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); nvme_cancel_admin_tagset(ctrl); if (remove) - nvme_start_admin_queue(ctrl); + nvme_unquiesce_admin_queue(ctrl); nvme_tcp_destroy_admin_queue(ctrl, remove); } @@ -1995,14 +1995,14 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, { if (ctrl->queue_count <= 1) return; - nvme_stop_admin_queue(ctrl); + nvme_quiesce_admin_queue(ctrl); nvme_start_freeze(ctrl); - nvme_stop_queues(ctrl); + nvme_quiesce_io_queues(ctrl); nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); nvme_cancel_tagset(ctrl); if (remove) - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); nvme_tcp_destroy_io_queues(ctrl, remove); } @@ -2083,14 +2083,14 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) destroy_io: if (ctrl->queue_count > 1) { - nvme_stop_queues(ctrl); + nvme_quiesce_io_queues(ctrl); nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); nvme_cancel_tagset(ctrl); nvme_tcp_destroy_io_queues(ctrl, new); } destroy_admin: - nvme_stop_admin_queue(ctrl); + nvme_quiesce_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); nvme_cancel_admin_tagset(ctrl); @@ -2128,14 +2128,14 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) struct nvme_tcp_ctrl, err_work); struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; - nvme_auth_stop(ctrl); nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); nvme_tcp_teardown_io_queues(ctrl, false); /* unquiesce to fail fast pending requests */ - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); nvme_tcp_teardown_admin_queue(ctrl, false); - nvme_start_admin_queue(ctrl); + nvme_unquiesce_admin_queue(ctrl); + nvme_auth_stop(ctrl); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we started ctrl delete */ @@ -2150,11 +2150,8 @@ static void 
nvme_tcp_error_recovery_work(struct work_struct *work)
 static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
 {
 	nvme_tcp_teardown_io_queues(ctrl, shutdown);
-	nvme_stop_admin_queue(ctrl);
-	if (shutdown)
-		nvme_shutdown_ctrl(ctrl);
-	else
-		nvme_disable_ctrl(ctrl);
+	nvme_quiesce_admin_queue(ctrl);
+	nvme_disable_ctrl(ctrl, shutdown);
 	nvme_tcp_teardown_admin_queue(ctrl, shutdown);
 }
 
@@ -2414,7 +2411,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (unlikely(ret))
 		return ret;
 
-	blk_mq_start_request(rq);
+	nvme_start_request(rq);
 
 	nvme_tcp_queue_request(req, true, bd->last);
 
@@ -2523,7 +2520,7 @@ static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
 static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
 	.name			= "tcp",
 	.module			= THIS_MODULE,
-	.flags			= NVME_F_FABRICS,
+	.flags			= NVME_F_FABRICS | NVME_F_BLOCKING,
 	.reg_read32		= nvmf_reg_read32,
 	.reg_read64		= nvmf_reg_read64,
 	.reg_write32		= nvmf_reg_write32,
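A closing note on the tag-set flags: BLK_MQ_F_BLOCKING is no longer passed by the transports; nvme_tcp_ctrl_ops advertises NVME_F_BLOCKING instead, and the core helper is assumed to translate it back when allocating the I/O tag set, roughly:

int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
		const struct blk_mq_ops *ops, unsigned int nr_maps,
		unsigned int cmd_size)
{
	/* abridged sketch: reserved-tag setup and error unwinding omitted */
	memset(set, 0, sizeof(*set));
	set->ops = ops;
	set->queue_depth = ctrl->sqsize + 1;
	set->numa_node = ctrl->numa_node;
	set->flags = BLK_MQ_F_SHOULD_MERGE;
	if (ctrl->ops->flags & NVME_F_BLOCKING)
		set->flags |= BLK_MQ_F_BLOCKING;
	set->cmd_size = cmd_size;
	set->driver_data = ctrl;
	set->nr_hw_queues = ctrl->queue_count - 1;
	set->timeout = NVME_IO_TIMEOUT;
	set->nr_maps = nr_maps;
	return blk_mq_alloc_tag_set(set);
}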