From 165a5d4fbe5c9e09d7cf82ff431dd74a8d6c0b75 Mon Sep 17 00:00:00 2001 From: Maximilian Heyne Date: Thu, 14 May 2026 10:32:49 +0200 Subject: nvme: Let the blocklayer set timeouts for requests When initializing an nvme request which is about to be sent to the block layer, we do not need to initialize its timeout. If it's left uninitialized at 0 the block layer will use the request queue's timeout in blk_add_timer (via nvme_start_request which is called from nvme_*_queue_rq). These timeouts are set up to either NVME_IO_TIMEOUT or NVME_ADMIN_TIMEOUT when the request queues were created. Because the io_timeout of the IO queues can be modified via sysfs, the following situation can occur: 1) NVME_IO_TIMEOUT = 30 (default module parameter) 2) nvme1n1 is probed. IO queues default timeout is 30 s 3) manually change the IO timeout to 90 s echo 90000 > /sys/class/nvme/nvme1/nvme1n1/queue/io_timeout 4) Any call of __submit_sync_cmd on nvme1n1 to an IO queue will issue commands with the 30 s timeout instead of the wanted 90 s which might be more suitable for this device. Commit 470e900c8036 ("nvme: refactor nvme_alloc_request") silently changed the behavior for ioctls already because it unconditionally overrides the request's timeout that was set in nvme_init_request. If it was unset by the user of the ioctl it will be overridden with 0 meaning the block layer will pick the request queue's IO timeout. Following up on that, this patch further improves the consistency of IO timeout usage. However, there are still uses of NVME_IO_TIMEOUT which could be inconsistent with what is set in the device's request_queue by the user. 
Reviewed-by: Mohamed Khalfella Reviewed-by: Christoph Hellwig Reviewed-by: Daniel Wagner Reviewed-by: Hannes Reinecke Signed-off-by: Maximilian Heyne Signed-off-by: Maurizio Lombardi Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dc388e24caad..89948d0acf18 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -729,10 +729,8 @@ void nvme_init_request(struct request *req, struct nvme_command *cmd) struct nvme_ns *ns = req->q->disk->private_data; logging_enabled = ns->head->passthru_err_log_enabled; - req->timeout = NVME_IO_TIMEOUT; } else { /* no queuedata implies admin queue */ logging_enabled = nr->ctrl->passthru_err_log_enabled; - req->timeout = NVME_ADMIN_TIMEOUT; } if (!logging_enabled) -- cgit v1.2.3 From 23b6d2cbf75ff15647efbb7c0e5c03bd7ed1fe1a Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Thu, 14 May 2026 10:32:50 +0200 Subject: nvme: remove redundant timeout argument from nvme_wait_freeze_timeout All callers of nvme_wait_freeze_timeout() currently pass the exact same NVME_IO_TIMEOUT default as their timeout argument. Remove it and use a local variable. Reviewed-by: Daniel Wagner Reviewed-by: Mohamed Khalfella Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Maurizio Lombardi Signed-off-by: Keith Busch --- drivers/nvme/host/apple.c | 2 +- drivers/nvme/host/core.c | 3 ++- drivers/nvme/host/nvme.h | 2 +- drivers/nvme/host/pci.c | 2 +- drivers/nvme/host/rdma.c | 2 +- drivers/nvme/host/tcp.c | 2 +- 6 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 423c9c628e7b..e77c47408102 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -858,7 +858,7 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown) * doing a safe shutdown. 
*/ if (!dead && shutdown && freeze) - nvme_wait_freeze_timeout(&anv->ctrl, NVME_IO_TIMEOUT); + nvme_wait_freeze_timeout(&anv->ctrl); nvme_quiesce_io_queues(&anv->ctrl); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 89948d0acf18..f9fe7bb65ec6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -5246,8 +5246,9 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_unfreeze); -int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) +int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl) { + unsigned long timeout = NVME_IO_TIMEOUT; struct nvme_ns *ns; int srcu_idx; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ccd5e05dac98..6f9ecb4948f4 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -900,7 +900,7 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl); void nvme_sync_io_queues(struct nvme_ctrl *ctrl); void nvme_unfreeze(struct nvme_ctrl *ctrl); void nvme_wait_freeze(struct nvme_ctrl *ctrl); -int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); +int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl); void nvme_start_freeze(struct nvme_ctrl *ctrl); static inline enum req_op nvme_req_op(struct nvme_command *cmd) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 9fd04cd7c5cb..2dc1074f9984 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3276,7 +3276,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) * if doing a safe shutdown. 
*/ if (!dead && shutdown) - nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); + nvme_wait_freeze_timeout(&dev->ctrl); } nvme_quiesce_io_queues(&dev->ctrl); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index f77c960f7632..bf73135c1439 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -888,7 +888,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) if (!new) { nvme_start_freeze(&ctrl->ctrl); nvme_unquiesce_io_queues(&ctrl->ctrl); - if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) { + if (!nvme_wait_freeze_timeout(&ctrl->ctrl)) { /* * If we timed out waiting for freeze we are likely to * be stuck. Fail the controller initialization just diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 15d36d6a728e..0552aa8a1150 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2208,7 +2208,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) if (!new) { nvme_start_freeze(ctrl); nvme_unquiesce_io_queues(ctrl); - if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) { + if (!nvme_wait_freeze_timeout(ctrl)) { /* * If we timed out waiting for freeze we are likely to * be stuck. Fail the controller initialization just -- cgit v1.2.3 From 61b99f24f0d56867d83b49f890790dd01ddd7675 Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Thu, 14 May 2026 10:32:51 +0200 Subject: nvme: add sysfs attribute to change admin timeout per nvme controller Currently, there is no method to adjust the timeout values on a per-controller basis with nvme admin queues. Add an admin_timeout attribute to nvme so that different nvme controllers which may have different timeout requirements can have custom admin timeouts set. The admin timeout is also applied to the fabrics queue (fabrics_q). The fabrics queue is utilized for fabric-specific administrative and control operations, such as Connect and Property Get/Set commands. 
Reviewed-by: Daniel Wagner Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Mohamed Khalfella Reviewed-by: Christoph Hellwig Signed-off-by: Maurizio Lombardi Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 1 + drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/pci.c | 2 +- drivers/nvme/host/sysfs.c | 41 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 44 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f9fe7bb65ec6..20df7c12c718 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -5140,6 +5140,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; ctrl->ka_last_check_time = jiffies; + ctrl->admin_timeout = NVME_ADMIN_TIMEOUT; BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) > PAGE_SIZE); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 6f9ecb4948f4..7923533cce00 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -370,6 +370,7 @@ struct nvme_ctrl { u16 mtfa; u32 ctrl_config; u32 queue_count; + u32 admin_timeout; u64 cap; u32 max_hw_sectors; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2dc1074f9984..35affda088f4 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3094,7 +3094,7 @@ static bool __nvme_delete_io_queues(struct nvme_dev *dev, u8 opcode) unsigned long timeout; retry: - timeout = NVME_ADMIN_TIMEOUT; + timeout = dev->ctrl.admin_timeout; while (nr_queues > 0) { if (nvme_delete_queue(&dev->queues[nr_queues], opcode)) break; diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index e59758616f27..3b39b64cd9da 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -623,6 +623,46 @@ static ssize_t quirks_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(quirks); +static ssize_t 
nvme_admin_timeout_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%u\n", + jiffies_to_msecs(ctrl->admin_timeout)); +} + +static ssize_t nvme_admin_timeout_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + u32 timeout; + int err; + + /* + * Wait until the controller reaches the LIVE state to be sure that + * admin_q and fabrics_q are properly initialized. + */ + if (!test_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags)) + return -EBUSY; + + err = kstrtou32(buf, 10, &timeout); + if (err || !timeout) + return -EINVAL; + + ctrl->admin_timeout = msecs_to_jiffies(timeout); + + blk_queue_rq_timeout(ctrl->admin_q, ctrl->admin_timeout); + if (ctrl->fabrics_q) + blk_queue_rq_timeout(ctrl->fabrics_q, ctrl->admin_timeout); + + return count; +} + +static DEVICE_ATTR(admin_timeout, S_IRUGO | S_IWUSR, + nvme_admin_timeout_show, nvme_admin_timeout_store); + #ifdef CONFIG_NVME_HOST_AUTH static ssize_t nvme_ctrl_dhchap_secret_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -765,6 +805,7 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_cntrltype.attr, &dev_attr_dctype.attr, &dev_attr_quirks.attr, + &dev_attr_admin_timeout.attr, #ifdef CONFIG_NVME_HOST_AUTH &dev_attr_dhchap_secret.attr, &dev_attr_dhchap_ctrl_secret.attr, -- cgit v1.2.3 From 97960b93d32a0230362c2f4dce021e98421c5a91 Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Thu, 14 May 2026 10:32:52 +0200 Subject: nvme: add sysfs attribute to change IO timeout per controller Currently, there is no method to adjust the timeout values on a per controller basis with nvme I/O queues. Add an io_timeout attribute to nvme so that different nvme controllers which may have different timeout requirements can have custom I/O timeouts set. The I/O timeout is also applied to the connect queue (connect_q). 
In NVMe over Fabrics, the connect queue is utilized specifically to issue Connect commands that establish the I/O queues. Reviewed-by: Mohamed Khalfella Reviewed-by: Daniel Wagner Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Maurizio Lombardi Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 4 +++- drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/sysfs.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 20df7c12c718..b14aae0a4217 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4203,6 +4203,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) mutex_unlock(&ctrl->namespaces_lock); goto out_unlink_ns; } + blk_queue_rq_timeout(ns->queue, ctrl->io_timeout); nvme_ns_add_to_ctrl_list(ns); mutex_unlock(&ctrl->namespaces_lock); synchronize_srcu(&ctrl->srcu); @@ -5141,6 +5142,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; ctrl->ka_last_check_time = jiffies; ctrl->admin_timeout = NVME_ADMIN_TIMEOUT; + ctrl->io_timeout = NVME_IO_TIMEOUT; BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) > PAGE_SIZE); @@ -5249,7 +5251,7 @@ EXPORT_SYMBOL_GPL(nvme_unfreeze); int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl) { - unsigned long timeout = NVME_IO_TIMEOUT; + unsigned long timeout = ctrl->io_timeout; struct nvme_ns *ns; int srcu_idx; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 7923533cce00..9ccaed0b9dbf 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -371,6 +371,7 @@ struct nvme_ctrl { u32 ctrl_config; u32 queue_count; u32 admin_timeout; + u32 io_timeout; u64 cap; u32 max_hw_sectors; diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 3b39b64cd9da..b682c1a4b23f 100644 --- a/drivers/nvme/host/sysfs.c +++ 
b/drivers/nvme/host/sysfs.c @@ -663,6 +663,52 @@ static ssize_t nvme_admin_timeout_store(struct device *dev, static DEVICE_ATTR(admin_timeout, S_IRUGO | S_IWUSR, nvme_admin_timeout_show, nvme_admin_timeout_store); +static ssize_t nvme_io_timeout_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%u\n", jiffies_to_msecs(ctrl->io_timeout)); +} + +static ssize_t nvme_io_timeout_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvme_ns *ns; + u32 timeout; + int err; + + /* + * Wait until the controller reaches the LIVE state to be sure that + * connect_q is properly initialized. + */ + if (!test_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags)) + return -EBUSY; + + err = kstrtou32(buf, 10, &timeout); + if (err || !timeout) + return -EINVAL; + + /* Take the namespaces_lock to avoid racing against nvme_alloc_ns() */ + mutex_lock(&ctrl->namespaces_lock); + + ctrl->io_timeout = msecs_to_jiffies(timeout); + list_for_each_entry(ns, &ctrl->namespaces, list) + blk_queue_rq_timeout(ns->queue, ctrl->io_timeout); + + mutex_unlock(&ctrl->namespaces_lock); + + if (ctrl->connect_q) + blk_queue_rq_timeout(ctrl->connect_q, ctrl->io_timeout); + + return count; +} + +static DEVICE_ATTR(io_timeout, S_IRUGO | S_IWUSR, + nvme_io_timeout_show, nvme_io_timeout_store); + #ifdef CONFIG_NVME_HOST_AUTH static ssize_t nvme_ctrl_dhchap_secret_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -806,6 +852,7 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_dctype.attr, &dev_attr_quirks.attr, &dev_attr_admin_timeout.attr, + &dev_attr_io_timeout.attr, #ifdef CONFIG_NVME_HOST_AUTH &dev_attr_dhchap_secret.attr, &dev_attr_dhchap_ctrl_secret.attr, -- cgit v1.2.3 From f702badaf7d31dc3dea6c66da92b5f35fadd89dc Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Thu, 14 May 2026 10:32:53 
+0200 Subject: nvme-core: align fabrics_q teardown with admin_q in nvme_free_ctrl Currently, the final reference for the fabrics admin queue (fabrics_q) is dropped inside nvme_remove_admin_tag_set(). However, the primary admin queue (admin_q) defers dropping its final reference until nvme_free_ctrl(). Move the blk_put_queue() call for fabrics_q from nvme_remove_admin_tag_set() to nvme_free_ctrl(). This aligns the lifecycle management of both admin queues, ensuring they are freed symmetrically when the controller is finally torn down. Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Daniel Wagner Signed-off-by: Maurizio Lombardi Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b14aae0a4217..a6fe2cfb1ab1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4932,10 +4932,8 @@ void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl) */ nvme_stop_keep_alive(ctrl); blk_mq_destroy_queue(ctrl->admin_q); - if (ctrl->ops->flags & NVME_F_FABRICS) { + if (ctrl->fabrics_q) blk_mq_destroy_queue(ctrl->fabrics_q); - blk_put_queue(ctrl->fabrics_q); - } blk_mq_free_tag_set(ctrl->admin_tagset); } EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set); @@ -5077,6 +5075,8 @@ static void nvme_free_ctrl(struct device *dev) if (ctrl->admin_q) blk_put_queue(ctrl->admin_q); + if (ctrl->fabrics_q) + blk_put_queue(ctrl->fabrics_q); if (!subsys || ctrl->instance != subsys->instance) ida_free(&nvme_instance_ida, ctrl->instance); nvme_free_cels(ctrl); -- cgit v1.2.3 From 233bbeb4a47cbead8c0471c0b8daec141033eae4 Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Thu, 14 May 2026 10:32:54 +0200 Subject: nvmet-loop: do not alloc admin tag set during reset Currently, resetting a loopback controller unconditionally invokes nvme_alloc_admin_tag_set() inside nvme_loop_configure_admin_queue(). 
Doing so drops the old queue and allocates a new one. Consequently, this reverts the admin queue's timeout (q->rq_timeout) back to the module default (NVME_ADMIN_TIMEOUT), completely wiping out any custom timeout values the user may have configured via sysfs and potentially racing against the sysfs nvme_admin_timeout_store() function that may dereference the admin_q pointer during the RESETTING state. Decouple the admin tag set lifecycle from the admin queue configuration and destruction paths, which are executed during resets. Specifically: * Move nvme_alloc_admin_tag_set() into nvme_loop_create_ctrl() so it is only allocated once during the initial controller creation. * Defer the destruction of the admin tag set to nvme_loop_delete_ctrl_host() and the terminal error-handling paths of nvme_loop_reset_ctrl_work() and nvme_loop_create_ctrl(). Reviewed-by: Daniel Wagner Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Maurizio Lombardi Signed-off-by: Keith Busch --- drivers/nvme/target/loop.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index d98d0cdc5d6f..070d16068e6b 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -274,7 +274,6 @@ static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl) nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); nvmet_cq_put(&ctrl->queues[0].nvme_cq); - nvme_remove_admin_tag_set(&ctrl->ctrl); } static void nvme_loop_free_ctrl(struct nvme_ctrl *nctrl) @@ -375,25 +374,18 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) } ctrl->ctrl.queue_count = 1; - error = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, - &nvme_loop_admin_mq_ops, - sizeof(struct nvme_loop_iod) + - NVME_INLINE_SG_CNT * sizeof(struct scatterlist)); - if (error) - goto out_free_sq; - /* reset stopped state for the fresh admin queue */ 
clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->ctrl.flags); error = nvmf_connect_admin_queue(&ctrl->ctrl); if (error) - goto out_cleanup_tagset; + goto out_free_sq; set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); error = nvme_enable_ctrl(&ctrl->ctrl); if (error) - goto out_cleanup_tagset; + goto out_free_sq; ctrl->ctrl.max_hw_sectors = (NVME_LOOP_MAX_SEGMENTS - 1) << PAGE_SECTORS_SHIFT; @@ -402,14 +394,12 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) error = nvme_init_ctrl_finish(&ctrl->ctrl, false); if (error) - goto out_cleanup_tagset; + goto out_free_sq; return 0; -out_cleanup_tagset: - clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); - nvme_remove_admin_tag_set(&ctrl->ctrl); out_free_sq: + clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); nvmet_cq_put(&ctrl->queues[0].nvme_cq); return error; @@ -432,6 +422,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) static void nvme_loop_delete_ctrl_host(struct nvme_ctrl *ctrl) { nvme_loop_shutdown_ctrl(to_loop_ctrl(ctrl)); + nvme_remove_admin_tag_set(ctrl); } static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) @@ -494,6 +485,7 @@ out_destroy_admin: nvme_cancel_admin_tagset(&ctrl->ctrl); nvme_loop_destroy_admin_queue(ctrl); out_disable: + nvme_remove_admin_tag_set(&ctrl->ctrl); dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); nvme_uninit_ctrl(&ctrl->ctrl); } @@ -594,10 +586,17 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, if (!ctrl->queues) goto out_uninit_ctrl; - ret = nvme_loop_configure_admin_queue(ctrl); + ret = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set, + &nvme_loop_admin_mq_ops, + sizeof(struct nvme_loop_iod) + + NVME_INLINE_SG_CNT * sizeof(struct scatterlist)); if (ret) goto out_free_queues; + ret = nvme_loop_configure_admin_queue(ctrl); + if (ret) + goto out_remove_admin_tagset; + if (opts->queue_size > ctrl->ctrl.maxcmd) { /* warn if maxcmd is lower 
than queue_size */ dev_warn(ctrl->ctrl.device, @@ -633,6 +632,8 @@ out_remove_admin_queue: nvme_quiesce_admin_queue(&ctrl->ctrl); nvme_cancel_admin_tagset(&ctrl->ctrl); nvme_loop_destroy_admin_queue(ctrl); +out_remove_admin_tagset: + nvme_remove_admin_tag_set(&ctrl->ctrl); out_free_queues: kfree(ctrl->queues); out_uninit_ctrl: -- cgit v1.2.3 From 00d7b33351aac0ea55d17167561e12bbeca73138 Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Thu, 14 May 2026 10:32:55 +0200 Subject: nvme-core: warn on allocating admin tag set with existing queue Currently, nvme_alloc_admin_tag_set() silently drops and releases the existing admin_q if it is called on a controller that already had one (e.g., during a controller reset). However, transport drivers should not be reallocating the admin tag set and queue during a reset. Dropping the old queue and allocating a new one destroys user-configured timeouts and may race against nvme_admin_timeout_store(). Since all transport drivers are now expected to preserve the admin queue across resets, calling nvme_alloc_admin_tag_set() when ctrl->admin_q is already populated is a bug. Remove the silent cleanup and replace it with a WARN_ON_ONCE() to explicitly catch any transport drivers that violate this lifecycle rule. Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Daniel Wagner Signed-off-by: Maurizio Lombardi Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a6fe2cfb1ab1..72c50d5e938d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4889,12 +4889,7 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, if (ret) return ret; - /* - * If a previous admin queue exists (e.g., from before a reset), - * put it now before allocating a new one to avoid orphaning it. 
- */ - if (ctrl->admin_q) - blk_put_queue(ctrl->admin_q); + WARN_ON_ONCE(ctrl->admin_q); ctrl->admin_q = blk_mq_alloc_queue(set, NULL, NULL); if (IS_ERR(ctrl->admin_q)) { -- cgit v1.2.3 From c8cdecdb47d3191146ab6a90b422d3271bc1ef89 Mon Sep 17 00:00:00 2001 From: Chao Shi Date: Fri, 15 May 2026 14:58:53 -0400 Subject: nvme: core: reject invalid LBA data size from Identify Namespace nvme_update_ns_info_block() trusts id->lbaf[lbaf].ds from the controller and assigns it directly to ns->head->lba_shift without bounds checking. nvme_lba_to_sect() then does: return lba << (head->lba_shift - SECTOR_SHIFT); When called with lba = le64_to_cpu(id->nsze) to compute the device capacity, an attacker-controlled controller can choose ds < 9 or a combination of (ds, nsze) that makes the left shift overflow sector_t. The former is a C undefined behaviour that UBSAN reports as a BUG; the latter silently yields a bogus capacity that the block layer then trusts for bounds checking. Validate ds against SECTOR_SHIFT and use check_shl_overflow() to compute capacity so that any (ds, nsze) combination that would overflow sector_t is rejected. The namespace is skipped with -ENODEV instead of crashing the kernel. This is reachable by a malicious NVMe device, a buggy firmware, or an attacker-controlled NVMe-oF target. The check is performed before queue_limits_start_update() and blk_mq_freeze_queue(), so the error path is a plain `goto out` with no cleanup needed. Stack trace (UBSAN, ds < 9 variant): RIP: nvme_lba_to_sect drivers/nvme/host/nvme.h:699 [inline] RIP: nvme_update_ns_info_block.cold+0x5/0x7 Call Trace: nvme_update_ns_info+0x175/0xd90 drivers/nvme/host/core.c:2467 nvme_validate_ns drivers/nvme/host/core.c:4299 [inline] nvme_scan_ns drivers/nvme/host/core.c:4350 nvme_scan_ns_async+0xa5/0xe0 drivers/nvme/host/core.c:4383 async_run_entry_fn process_one_work worker_thread kthread Found by Syzkaller. 
Acked-by: Sungwoo Kim Acked-by: Dave Tian Acked-by: Weidong Zhu Signed-off-by: Chao Shi Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 72c50d5e938d..10f154529334 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2407,12 +2407,22 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, goto out; } + if (id->lbaf[lbaf].ds < SECTOR_SHIFT || + check_shl_overflow(le64_to_cpu(id->nsze), + id->lbaf[lbaf].ds - SECTOR_SHIFT, + &capacity)) { + dev_warn_once(ns->ctrl->device, + "invalid LBA data size %u, skipping namespace\n", + id->lbaf[lbaf].ds); + ret = -ENODEV; + goto out; + } + lim = queue_limits_start_update(ns->disk->queue); memflags = blk_mq_freeze_queue(ns->disk->queue); ns->head->lba_shift = id->lbaf[lbaf].ds; ns->head->nuse = le64_to_cpu(id->nuse); - capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze)); nvme_set_ctrl_limits(ns->ctrl, &lim, false); nvme_configure_metadata(ns->ctrl, ns->head, id, nvm, info); nvme_set_chunk_sectors(ns, id, &lim); -- cgit v1.2.3 From 6022a5330fa2eabce7f20a23200e14a771640f1a Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Thu, 21 May 2026 17:37:16 +0200 Subject: nvme-core: fix unsigned comparison warning in nvme_wait_freeze_timeout The timeout variable in nvme_wait_freeze_timeout() is an unsigned type. Checking if it is <= 0 triggers a compiler warning because an unsigned variable can never be negative. Fix this warning by changing the type to long. 
Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202605211257.STzj2Ujv-lkp@intel.com/ Fixes: 23b6d2cbf75f ("nvme: remove redundant timeout argument from nvme_wait_freeze_timeout") Signed-off-by: Maurizio Lombardi Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 10f154529334..fb14a208febe 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -5256,7 +5256,7 @@ EXPORT_SYMBOL_GPL(nvme_unfreeze); int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl) { - unsigned long timeout = ctrl->io_timeout; + long timeout = ctrl->io_timeout; struct nvme_ns *ns; int srcu_idx; -- cgit v1.2.3 From 4dae393956093c807212918fd91a8fc70df15338 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 26 May 2026 17:22:22 +0800 Subject: nvmet-tcp: fix page fragment cache leak in error path In nvmet_tcp_alloc_queue(), when a connection is closed during the allocation process (e.g., nvmet_tcp_set_queue_sock() returns -ENOTCONN), the error handling jumps to out_destroy_sq and then to out_ida_remove without draining the page fragment cache. Although nvmet_tcp_free_cmd() is called in some error paths to release individual page fragments, the underlying page cache reference held by queue->pf_cache is never released. The first allocation using pf_cache is the call to nvmet_tcp_alloc_cmd() for queue->connect, which happens after ida_alloc() returns successfully. This results in a page leak each time a connection fails during allocation, which could lead to memory exhaustion over time if connections are repeatedly opened and closed. Fix this by calling page_frag_cache_drain() before freeing the queue structure in the out_ida_remove label. 
Fixes: 872d26a391da ("nvmet-tcp: add NVMe over TCP target driver") Reviewed-by: Christoph Hellwig Signed-off-by: Geliang Tang Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 164a564ba3b4..93b3c6134240 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1997,6 +1997,12 @@ out_free_connect: nvmet_tcp_free_cmd(&queue->connect); out_ida_remove: ida_free(&nvmet_tcp_queue_ida, queue->idx); + /* + * Drain the page fragment cache if any allocations were done. + * The first allocation using pf_cache is nvmet_tcp_alloc_cmd() + * for queue->connect after ida_alloc(). + */ + page_frag_cache_drain(&queue->pf_cache); out_sock: fput(queue->sock->file); out_free_queue: -- cgit v1.2.3 From 7ef789703e2b91775dcb36b2efa46325be31a2a0 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 26 May 2026 17:28:05 +0800 Subject: nvmet-tcp: check return value of nvmet_tcp_set_queue_sock The return value of nvmet_tcp_set_queue_sock() is currently ignored in nvmet_tcp_tls_handshake_done(). If it fails (e.g., due to the socket not being in TCP_ESTABLISHED state), the socket callbacks will not be properly set, leading to queue and socket leakage. Fix this by capturing the return value and calling nvmet_tcp_schedule_release_queue() on failure to ensure proper cleanup. 
Fixes: 675b453e0241 ("nvmet-tcp: enable TLS handshake upcall") Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Signed-off-by: Geliang Tang Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 93b3c6134240..3568fa9a0905 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1842,10 +1842,11 @@ static void nvmet_tcp_tls_handshake_done(void *data, int status, if (!status) status = nvmet_tcp_tls_key_lookup(queue, peerid); + if (!status) + status = nvmet_tcp_set_queue_sock(queue); + if (status) nvmet_tcp_schedule_release_queue(queue); - else - nvmet_tcp_set_queue_sock(queue); kref_put(&queue->kref, nvmet_tcp_release_queue); } -- cgit v1.2.3 From 5ab7c84f218b08908bf7768e5669d15e89595a02 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 13 May 2026 09:50:30 +0000 Subject: nvme: use DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE for multipath_sysfs Use DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE instead of DEFINE_SYSFS_GROUP_VISIBLE, which means that we can drop multipath_sysfs_attr_visible(). Incidentally, multipath_sysfs_attr_visible() should have returned a umode_t. This idea was suggested by Ben Marzinski elsewhere. 
Reviewed-by: Christoph Hellwig Signed-off-by: John Garry Signed-off-by: Keith Busch --- drivers/nvme/host/sysfs.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index b682c1a4b23f..1f471f2cfd25 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -335,14 +335,7 @@ static bool multipath_sysfs_group_visible(struct kobject *kobj) return nvme_disk_is_ns_head(dev_to_disk(dev)); } - -static bool multipath_sysfs_attr_visible(struct kobject *kobj, - struct attribute *attr, int n) -{ - return false; -} - -DEFINE_SYSFS_GROUP_VISIBLE(multipath_sysfs) +DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(multipath_sysfs) const struct attribute_group nvme_ns_mpath_attr_group = { .name = "multipath", -- cgit v1.2.3 From 001e57554de81aa79c25c18fd53911d8a415c304 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Wed, 27 May 2026 11:50:00 +0530 Subject: nvme-multipath: fix flex array size in struct nvme_ns_head struct nvme_ns_head contains a flexible array member, current_path[], which is indexed using the NUMA node ID: head->current_path[numa_node_id()] The structure is currently allocated as: size = sizeof(struct nvme_ns_head) + (num_possible_nodes() * sizeof(struct nvme_ns *)); head = kzalloc(size, GFP_KERNEL); This allocation assumes that NUMA node IDs are sequential and densely packed from 0 .. num_possible_nodes() - 1. While this assumption holds on many systems, it is not always true on some architectures such as powerpc. On some powerpc systems, NUMA node IDs can be sparse. For example: NUMA: NUMA node(s): 6 NUMA node0 CPU(s): 80-159 NUMA node8 CPU(s): 0-79 NUMA node252 CPU(s): NUMA node253 CPU(s): NUMA node254 CPU(s): NUMA node255 CPU(s): That is, the possible/online NUMA node IDs are: 0, 8, 252, 253, 254, 255 In this case: num_possible_nodes() = 6 So memory is allocated for only 6 entries in current_path[]. However, the array is later indexed using the actual NUMA node ID. 
As a result, accesses such as: head->current_path[8] or head->current_path[252] go out of bounds, leading to the following KASAN splat: ================================================================== BUG: KASAN: slab-out-of-bounds in nvme_mpath_revalidate_paths+0x22c/0x290 [nvme_core] Write of size 8 at addr c00020003bda35b8 by task kworker/u641:2/1997 CPU: 1 UID: 0 PID: 1997 Comm: kworker/u641:2 Not tainted 7.1.0-rc5-dirty #14 PREEMPT(lazy) Hardware name: 8335-GTH POWER9 0x4e1202 opal:skiboot-v6.5.3-35-g1851b2a06 PowerNV Workqueue: async async_run_entry_fn Call Trace: [c000200037fa7510] [c0000000021c23d4] dump_stack_lvl+0x88/0xdc (unreliable) [c000200037fa7540] [c0000000009fda90] print_report+0x22c/0x67c [c000200037fa7630] [c0000000009fd508] kasan_report+0x108/0x220 [c000200037fa7740] [c0000000009fff48] __asan_store8+0xe8/0x120 [c000200037fa7760] [c008000018e76474] nvme_mpath_revalidate_paths+0x22c/0x290 [nvme_core] [c000200037fa7800] [c008000018e6556c] nvme_update_ns_info+0x4a4/0x5e0 [nvme_core] [c000200037fa7a50] [c008000018e66270] nvme_alloc_ns+0x6d8/0x1a70 [nvme_core] [c000200037fa7c20] [c008000018e679fc] nvme_scan_ns+0x3f4/0x630 [nvme_core] [c000200037fa7d10] [c00000000031f22c] async_run_entry_fn+0x9c/0x3a0 [c000200037fa7db0] [c0000000002fa544] process_one_work+0x414/0xa10 [c000200037fa7ec0] [c0000000002fbf00] worker_thread+0x320/0x640 [c000200037fa7f80] [c00000000030d0f8] kthread+0x278/0x290 [c000200037fa7fe0] [c00000000000ded8] start_kernel_thread+0x14/0x18 Allocated by task 1997 on cpu 1 at 35.928317s: The buggy address belongs to the object at c00020003bda3000 which belongs to the cache kmalloc-rnd-15-2k of size 2048 The buggy address is located 16 bytes to the right of allocated 1448-byte region [c00020003bda3000, c00020003bda35a8) The buggy address belongs to the physical page: Memory state around the buggy address: c00020003bda3480: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 c00020003bda3500: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 
>c00020003bda3580: 00 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc ^ c00020003bda3600: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc c00020003bda3680: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ================================================================== Fix this by allocating the flexible array using nr_node_ids instead of num_possible_nodes(). Since nr_node_ids represents the maximum possible NUMA node IDs, indexing current_path[] using numa_node_id() becomes safe even on systems with sparse node IDs. Fixes: f333444708f8 ("nvme: take node locality into account when selecting a path") Tested-by: Mukesh Kumar Chaurasiya (IBM) Reviewed-by: Mukesh Kumar Chaurasiya (IBM) Reviewed-by: Hannes Reinecke Reviewed-by: John Garry Reviewed-by: Christoph Hellwig Signed-off-by: Nilay Shroff Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fb14a208febe..5d8af8aa472e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3930,7 +3930,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, int ret = -ENOMEM; #ifdef CONFIG_NVME_MULTIPATH - size += num_possible_nodes() * sizeof(struct nvme_ns *); + size += nr_node_ids * sizeof(struct nvme_ns *); #endif head = kzalloc(size, GFP_KERNEL); -- cgit v1.2.3 From badc53620fe813b3a9f727ef9526f98567c2c898 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Wed, 27 May 2026 08:45:44 +0000 Subject: nvme: target: rdma: fix ndev refcount leak on queue connect nvmet_rdma_queue_connect() calls nvmet_rdma_find_get_device() which acquires a reference on the returned ndev via kref_get(). On the path where the host queue backlog is exceeded and the function returns NVME_SC_CONNECT_CTRL_BUSY, reference of ndev is not released, leaking the kref. Fix this by adding a goto to the existing put_device label before the early return. 
Fixes: 31deaeb11ba7 ("nvmet-rdma: avoid circular locking dependency on install_queue()") Cc: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Wentao Liang Signed-off-by: Keith Busch --- drivers/nvme/target/rdma.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index e6e2c3f9afdf..ac26f4f774c4 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1598,8 +1598,10 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, pending++; } mutex_unlock(&nvmet_rdma_queue_mutex); - if (pending > NVMET_RDMA_BACKLOG) - return NVME_SC_CONNECT_CTRL_BUSY; + if (pending > NVMET_RDMA_BACKLOG) { + ret = NVME_SC_CONNECT_CTRL_BUSY; + goto put_device; + } } ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn); -- cgit v1.2.3 From a192b8cfa447e1b3701a13434a31c392b2e7ed29 Mon Sep 17 00:00:00 2001 From: Mateusz Nowicki Date: Sat, 23 May 2026 08:28:16 +0000 Subject: nvme-pci: fix out-of-bounds access in nvme_setup_descriptor_pools nvme_setup_descriptor_pools() indexes dev->descriptor_pools[] using the numa_node forwarded from hctx->numa_node by its single caller, nvme_init_hctx_common(). On a non-NUMA kernel hctx->numa_node is NUMA_NO_NODE (-1). Because the parameter was declared 'unsigned', the value becomes UINT_MAX and the index walks off the array (sized to nr_node_ids), faulting during nvme_alloc_ns() and leaving the namespace without a /dev node. Reproduces on any NVMe controller probed by a CONFIG_NUMA=n kernel: BUG: unable to handle page fault for address: ffff889101603d38 RIP: 0010:nvme_init_hctx_common+0x5a/0x190 [nvme] Call Trace: nvme_init_hctx+0x10/0x20 [nvme] nvme_alloc_ns+0x9e/0xa10 [nvme_core] nvme_scan_ns+0x301/0x3b0 [nvme_core] nvme_scan_ns_async+0x23/0x30 [nvme_core] Switch the parameter to int and fall back to node 0 when it is NUMA_NO_NODE; node 0 is always present. 
Fixes: d977506f8863 ("nvme-pci: make PRP list DMA pools per-NUMA-node") Link: https://lore.kernel.org/r/20260309062840.2937858-2-iam@sung-woo.kim Reported-by: Sung-woo Kim Reviewed-by: Christoph Hellwig Signed-off-by: Mateusz Nowicki Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 35affda088f4..d20d8722ad96 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -587,11 +587,16 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db, } static struct nvme_descriptor_pools * -nvme_setup_descriptor_pools(struct nvme_dev *dev, unsigned numa_node) +nvme_setup_descriptor_pools(struct nvme_dev *dev, int numa_node) { - struct nvme_descriptor_pools *pools = &dev->descriptor_pools[numa_node]; + struct nvme_descriptor_pools *pools; size_t small_align = NVME_SMALL_POOL_SIZE; + if (numa_node == NUMA_NO_NODE) + numa_node = 0; + + pools = &dev->descriptor_pools[numa_node]; + if (pools->small) return pools; /* already initialized */ -- cgit v1.2.3 From f078d1aa52a4481cbf4d12c1543639d65a020d3b Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 29 May 2026 09:52:01 +0000 Subject: nvme-multipath: pass NS head to nvme_mpath_revalidate_paths() In nvme_mpath_revalidate_paths(), we are passed a NS pointer and use that to look up the NS head and then use that same NS pointer as an iter variable. It makes more sense to pass the NS head and use a local variable for the NS iter. 
Reviewed-by: Christoph Hellwig Signed-off-by: John Garry Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/multipath.c | 4 ++-- drivers/nvme/host/nvme.h | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5d8af8aa472e..f69e3115d8cf 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2567,7 +2567,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk)); set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info)); - nvme_mpath_revalidate_paths(ns); + nvme_mpath_revalidate_paths(ns->head); blk_mq_unfreeze_queue(ns->head->disk->queue, memflags); } diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 263161cb8ac0..e00e2842df30 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -254,10 +254,10 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) srcu_read_unlock(&ctrl->srcu, srcu_idx); } -void nvme_mpath_revalidate_paths(struct nvme_ns *ns) +void nvme_mpath_revalidate_paths(struct nvme_ns_head *head) { - struct nvme_ns_head *head = ns->head; sector_t capacity = get_capacity(head->disk); + struct nvme_ns *ns; int node; int srcu_idx; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9ccaed0b9dbf..86b09c06b9e0 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1043,7 +1043,7 @@ void nvme_mpath_update(struct nvme_ctrl *ctrl); void nvme_mpath_uninit(struct nvme_ctrl *ctrl); void nvme_mpath_stop(struct nvme_ctrl *ctrl); bool nvme_mpath_clear_current_path(struct nvme_ns *ns); -void nvme_mpath_revalidate_paths(struct nvme_ns *ns); +void nvme_mpath_revalidate_paths(struct nvme_ns_head *head); void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl); void nvme_mpath_remove_disk(struct nvme_ns_head *head); void nvme_mpath_start_request(struct request *rq); @@ 
-1108,7 +1108,7 @@ static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns) { return false; } -static inline void nvme_mpath_revalidate_paths(struct nvme_ns *ns) +static inline void nvme_mpath_revalidate_paths(struct nvme_ns_head *head) { } static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) -- cgit v1.2.3 From 4cf06977bdb6a037e2717b4117f3fd636f6e9641 Mon Sep 17 00:00:00 2001 From: liyouhong Date: Fri, 29 May 2026 16:51:43 +0800 Subject: nvme-multipath: require exact iopolicy names for module parameter The iopolicy module parameter uses strncmp prefix matching, so values like "numax" are accepted as "numa". The per-subsystem sysfs attribute already requires an exact match via sysfs_streq(). Parse both through a shared helper so invalid values are rejected consistently. Reviewed-by: Christoph Hellwig Signed-off-by: liyouhong Signed-off-by: Keith Busch --- drivers/nvme/host/multipath.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index e00e2842df30..d6c51f59ff25 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -73,19 +73,29 @@ static const char *nvme_iopolicy_names[] = { static int iopolicy = NVME_IOPOLICY_NUMA; +static int nvme_iopolicy_parse(const char *str) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) { + if (sysfs_streq(str, nvme_iopolicy_names[i])) + return i; + } + return -EINVAL; +} + static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp) { + int policy; + if (!val) return -EINVAL; - if (!strncmp(val, "numa", 4)) - iopolicy = NVME_IOPOLICY_NUMA; - else if (!strncmp(val, "round-robin", 11)) - iopolicy = NVME_IOPOLICY_RR; - else if (!strncmp(val, "queue-depth", 11)) - iopolicy = NVME_IOPOLICY_QD; - else - return -EINVAL; + policy = nvme_iopolicy_parse(val); + if (policy < 0) + return policy; + + iopolicy = policy; return 0; } @@ 
-1039,16 +1049,14 @@ static ssize_t nvme_subsys_iopolicy_store(struct device *dev, { struct nvme_subsystem *subsys = container_of(dev, struct nvme_subsystem, dev); - int i; + int policy; - for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) { - if (sysfs_streq(buf, nvme_iopolicy_names[i])) { - nvme_subsys_iopolicy_update(subsys, i); - return count; - } - } + policy = nvme_iopolicy_parse(buf); + if (policy < 0) + return policy; - return -EINVAL; + nvme_subsys_iopolicy_update(subsys, policy); + return count; } SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR, nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store); -- cgit v1.2.3 From 88bac2c1a72b8f4f71e9845699aa872df04e5850 Mon Sep 17 00:00:00 2001 From: "Achkinazi, Igor" Date: Thu, 28 May 2026 15:24:27 +0000 Subject: nvme-multipath: set BIO_REMAPPED on bios remapped to per-path namespace disks When nvme_ns_head_submit_bio() remaps a bio from the multipath head to a per-path namespace, bio_set_dev() clears BIO_REMAPPED. The remapped bio is then resubmitted through submit_bio_noacct() which calls bio_check_eod() because BIO_REMAPPED is not set. This races with nvme_ns_remove() which zeroes the per-path capacity before synchronize_srcu(): CPU 0 (IO submission) --------------------- srcu_read_lock() nvme_find_path() -> ns [NVME_NS_READY is set] CPU 1 (namespace removal) ------------------------- clear_bit(NVME_NS_READY) set_capacity(ns->disk, 0) synchronize_srcu() <- blocks CPU 0 (IO submission) --------------------- bio_set_dev(bio, ns->disk->part0) [clears BIO_REMAPPED] submit_bio_noacct(bio) -> bio_check_eod() sees capacity=0 -> bio fails with IO error The SRCU read lock prevents synchronize_srcu() from completing, but does not prevent set_capacity(0) from executing. The bio fails the EOD check before it reaches the NVMe driver, so nvme_failover_req() never gets a chance to redirect it to another path of multipath. IO errors are reported to the application despite another path being available. 
On older kernels (before commit 0b64682e78f7 "block: skip unnecessary checks for split bio"), the same race was also reachable through split remainders resubmitted via submit_bio_noacct(). Fix this by setting BIO_REMAPPED after bio_set_dev() in nvme_ns_head_submit_bio(). This skips bio_check_eod() on the per-path device; the EOD check already passed on the multipath head. NVMe per-path namespace devices are always whole disks (bd_partno=0), so the blk_partition_remap() skip also gated by BIO_REMAPPED is a no-op. The flag does not persist across failover and cannot go stale if the namespace geometry changes between attempts: nvme_failover_req() calls bio_set_dev() to redirect the bio back to the multipath head, which clears BIO_REMAPPED. When nvme_requeue_work() resubmits through submit_bio_noacct(), bio_check_eod() runs normally against the current capacity. Same approach as commit 3a905c37c351 ("block: skip bio_check_eod for partition-remapped bios"). Fixes: a7c7f7b2b641 ("nvme: use bio_set_dev to assign ->bi_bdev") Cc: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Igor Achkinazi Signed-off-by: Keith Busch --- drivers/nvme/host/multipath.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index d6c51f59ff25..bd9e8d5a2713 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -521,6 +521,12 @@ static void nvme_ns_head_submit_bio(struct bio *bio) ns = nvme_find_path(head); if (likely(ns)) { bio_set_dev(bio, ns->disk->part0); + /* + * Use BIO_REMAPPED to skip bio_check_eod() when this bio + * enters submit_bio_noacct() for the per-path device. The EOD + * check already passed on the multipath head. 
+ */ + bio_set_flag(bio, BIO_REMAPPED); bio->bi_opf |= REQ_NVME_MPATH; trace_block_bio_remap(bio, disk_devt(ns->head->disk), bio->bi_iter.bi_sector); -- cgit v1.2.3 From 53cd102a7a56079b11b897835bd9b94c14e6322c Mon Sep 17 00:00:00 2001 From: Bryam Vargas Date: Wed, 27 May 2026 15:00:00 -0500 Subject: nvmet: fix pre-auth out-of-bounds heap read in Discovery Get Log Page nvmet_execute_disc_get_log_page() validates only the dword alignment of the host-supplied Log Page Offset (lpo). The 64-bit offset is then added to a small kzalloc'd buffer that holds the discovery log page and the result is passed straight to nvmet_copy_to_sgl(), which memcpy()s data_len bytes out to the host with no source-side bound check: u64 offset = nvmet_get_log_page_offset(req->cmd); /* 64-bit host */ size_t data_len = nvmet_get_log_page_len(req->cmd); /* 32-bit host */ ... if (offset & 0x3) { ... } /* only check */ ... alloc_len = sizeof(*hdr) + entry_size * discovery_log_entries(req); buffer = kzalloc(alloc_len, GFP_KERNEL); ... status = nvmet_copy_to_sgl(req, 0, buffer + offset, data_len); The Discovery controller is unauthenticated -- nvmet_host_allowed() returns true unconditionally for the discovery subsystem -- so the call is reachable pre-authentication by any TCP/RDMA/FC peer that can reach the nvmet target. With a discovery log page of ~1 KiB, an attacker requesting up to 4 KiB starting at offset == alloc_len reads the next slab page out and gets its content returned over the fabric (an empirical run on a default nvmet-tcp loopback target leaked 81 canonical kernel pointers in one Get Log Page response). Pointing the offset at unmapped kernel memory faults the in-kernel memcpy and crashes (or panics, on panic_on_oops=1) the target host instead. 
The attacker-controlled source-side offset pattern "nvmet_copy_to_sgl(req, 0, buffer + ATTACKER_OFFSET, ...)" is unique to nvmet_execute_disc_get_log_page in the entire nvmet codebase: every other Get Log Page handler in admin-cmd.c either ignores lpo (and silently starts every response at offset 0) or tracks a local destination offset with a fixed source pointer. Validate the host-supplied offset against the log page size, cap the copy length to what is actually available, and zero-fill any remainder of the host transfer buffer. The zero-fill matches the existing short-response pattern in nvmet_execute_get_log_changed_ns() (admin-cmd.c) and prevents leaking transport SGL contents when the host asks for more bytes than the log page contains. Fixes: a07b4970f464 ("nvmet: add a generic NVMe target") Cc: stable@vger.kernel.org Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Bryam Vargas Signed-off-by: Keith Busch --- drivers/nvme/target/discovery.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index e9b35549e254..114869d16a1f 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -166,6 +166,7 @@ static void nvmet_execute_disc_get_log_page(struct nvmet_req *req) u64 offset = nvmet_get_log_page_offset(req->cmd); size_t data_len = nvmet_get_log_page_len(req->cmd); size_t alloc_len; + size_t copy_len; struct nvmet_subsys_link *p; struct nvmet_port *r; u32 numrec = 0; @@ -242,7 +243,27 @@ static void nvmet_execute_disc_get_log_page(struct nvmet_req *req) up_read(&nvmet_config_sem); - status = nvmet_copy_to_sgl(req, 0, buffer + offset, data_len); + /* + * Validate the host-supplied log page offset before copying out. 
+ * Without this check, the host controls a 64-bit byte offset into + * a small kzalloc'd buffer: a value past the log page lets the + * subsequent memcpy read adjacent kernel heap, and a value aimed + * at unmapped kernel memory faults the in-kernel copy and crashes + * the target host. The Discovery controller is unauthenticated, + * so the bug is reachable from any reachable fabric peer. + */ + if (offset > alloc_len) { + req->error_loc = + offsetof(struct nvme_get_log_page_command, lpo); + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + goto out_free_buffer; + } + + copy_len = min_t(size_t, data_len, alloc_len - offset); + status = nvmet_copy_to_sgl(req, 0, buffer + offset, copy_len); + if (!status && copy_len < data_len) + status = nvmet_zero_sgl(req, copy_len, data_len - copy_len); +out_free_buffer: kfree(buffer); out: nvmet_req_complete(req, status); -- cgit v1.2.3 From 8757fd9500cf2fd9b27451cb6eb7e28003c3d202 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 30 May 2026 08:10:57 +0000 Subject: nvme-tcp: Use WQ_PERCPU explicitly if wq_unbound is false. Since commit 21c05ca88a54 ("workqueue: Add warnings and ensure one among WQ_PERCPU or WQ_UNBOUND is present"), we must explicitly set WQ_PERCPU or WQ_UNBOUND when creating workqueue. nvme_tcp_init_module() sets WQ_UNBOUND when the module param wq_unbound is set, but otherwise, WQ_PERCPU is missing, triggering the warning below: workqueue: nvme_tcp_wq is using neither WQ_PERCPU or WQ_UNBOUND. Setting WQ_PERCPU. WARNING: kernel/workqueue.c:5856 at __alloc_workqueue+0x1d02/0x2070 kernel/workqueue.c:5855, CPU#0: swapper/0/1 Let's set WQ_PERCPU if wq_unbound is false. 
Reported-by: syzbot+d078cba4418e65f61984@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/6a1a9a86.323e8352.141b09.0001.GAE@google.com/ Tested-by: Venkat Rao Bagalkote Reviewed-by: Nilay Shroff Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Kuniyuki Iwashima Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 0552aa8a1150..6241e71130c4 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -3053,6 +3053,8 @@ static int __init nvme_tcp_init_module(void) if (wq_unbound) wq_flags |= WQ_UNBOUND; + else + wq_flags |= WQ_PERCPU; nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", wq_flags, 0); if (!nvme_tcp_wq) -- cgit v1.2.3 From 0967074f6830718fd2597404ef119bddd0dbfd00 Mon Sep 17 00:00:00 2001 From: liuxixin Date: Thu, 28 May 2026 18:00:01 +0800 Subject: nvme: fix FDP fdpcidx bounds check The fdpcidx bounds check sets n = NUMFDPC + 1 but used > instead of >=, incorrectly accepting fdp_idx when it equals n (i.e. NUMFDPC + 1). 
Fixes: 30b5f20bb2dd ("nvme: register fdp parameters with the block layer") Reviewed-by: Nitesh Shetty Reviewed-by: Christoph Hellwig Signed-off-by: liuxixin Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f69e3115d8cf..ea837b94d3e5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2261,7 +2261,7 @@ static int nvme_query_fdp_granularity(struct nvme_ctrl *ctrl, } n = le16_to_cpu(h->numfdpc) + 1; - if (fdp_idx > n) { + if (fdp_idx >= n) { dev_warn(ctrl->device, "FDP index:%d out of range:%d\n", fdp_idx, n); /* Proceed without registering FDP streams */ -- cgit v1.2.3 From 59c0517123f2757c41d7795f841bc4c836577d17 Mon Sep 17 00:00:00 2001 From: Yao Sang Date: Thu, 28 May 2026 15:36:01 +0800 Subject: nvme: refresh multipath head zoned limits from path limits queue_limits_stack_bdev() updates the multipath head limits from the path queue, but it does not propagate max_open_zones or max_active_zones. As a result, a zoned multipath namespace head can keep stale 0/0 values even after a ready path reports finite zoned resource limits. When refreshing the head limits in nvme_update_ns_info(), stack the zoned resource limits directly after stacking the path queue limits. Use min_not_zero() so the block layer's 0 value keeps its "no limit" meaning while finite limits are combined conservatively. This avoids advertising "no limit" on the multipath head while keeping the zoned-limit handling local to the NVMe multipath update path. 
Reviewed-by: Christoph Hellwig Signed-off-by: Yao Sang Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ea837b94d3e5..cad9d9735261 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2491,6 +2491,14 @@ out: return ret; } +static void nvme_stack_zone_resources(struct queue_limits *t, + const struct queue_limits *b) +{ + t->max_open_zones = min_not_zero(t->max_open_zones, b->max_open_zones); + t->max_active_zones = + min_not_zero(t->max_active_zones, b->max_active_zones); +} + static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) { bool unsupported = false; @@ -2557,6 +2565,8 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) lim.io_opt = ns_lim->io_opt; queue_limits_stack_bdev(&lim, ns->disk->part0, 0, ns->head->disk->disk_name); + if (lim.features & BLK_FEAT_ZONED) + nvme_stack_zone_resources(&lim, ns_lim); if (unsupported) ns->head->disk->flags |= GENHD_FL_HIDDEN; else -- cgit v1.2.3 From 3a413ece2504c70aa34a20be4dafec04e8c741f9 Mon Sep 17 00:00:00 2001 From: Tianchu Chen Date: Fri, 29 May 2026 14:18:39 +0000 Subject: nvmet-auth: validate reply message payload bounds against transfer length nvmet_auth_reply() accesses the variable-length rval[] array using attacker-controlled hl (hash length) and dhvlen (DH value length) fields without verifying they fit within the allocated buffer of tl bytes. A malicious NVMe-oF initiator can craft a DHCHAP_REPLY message with a small transfer length but large hl/dhvlen values, causing out-of-bounds heap reads when the target processes the DH public key (rval + 2*hl) or performs the host response memcmp. With DH authentication configured, the OOB pointer is passed directly to sg_init_one() and read by crypto_kpp_compute_shared_secret(), reaching up to 526 bytes past the buffer. This is exploitable pre-authentication. 
Add bounds validation ensuring sizeof(*data) + 2*hl + dhvlen <= tl before any access to the variable-length fields. Discovered by Atuin - Automated Vulnerability Discovery Engine. Fixes: db1312dd9548 ("nvmet: implement basic In-Band Authentication") Cc: stable@vger.kernel.org Reviewed-by: Hannes Reinecke Signed-off-by: Tianchu Chen Signed-off-by: Keith Busch --- drivers/nvme/target/fabrics-cmd-auth.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index f1e613e7c63e..0a85acf1e5c7 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -132,13 +132,22 @@ static u8 nvmet_auth_negotiate(struct nvmet_req *req, void *d) return 0; } -static u8 nvmet_auth_reply(struct nvmet_req *req, void *d) +static u8 nvmet_auth_reply(struct nvmet_req *req, void *d, u32 tl) { struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvmf_auth_dhchap_reply_data *data = d; - u16 dhvlen = le16_to_cpu(data->dhvlen); + u16 dhvlen; u8 *response; + if (tl < sizeof(*data)) + return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; + + dhvlen = le16_to_cpu(data->dhvlen); + + /* Validate that hl and dhvlen fit within the transfer length */ + if (sizeof(*data) + 2 * (size_t)data->hl + dhvlen > tl) + return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; + pr_debug("%s: ctrl %d qid %d: data hl %d cvalid %d dhvlen %u\n", __func__, ctrl->cntlid, req->sq->qid, data->hl, data->cvalid, dhvlen); @@ -338,7 +347,7 @@ void nvmet_execute_auth_send(struct nvmet_req *req) switch (data->auth_id) { case NVME_AUTH_DHCHAP_MESSAGE_REPLY: - dhchap_status = nvmet_auth_reply(req, d); + dhchap_status = nvmet_auth_reply(req, d, tl); if (dhchap_status == 0) req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1; -- cgit v1.2.3 From 0ef4daa6534a510d61ea67c8ad9bb5097b0dd5f8 Mon Sep 17 00:00:00 2001 From: liuxixin Date: Tue, 2 Jun 2026 22:00:01 +0800 Subject: nvme: validate FDP 
configuration descriptor sizes Validate descriptor sizes while walking the FDP configurations log so dsze == 0 or a descriptor past the log end cannot cause unbounded iteration or reads past the buffer. Reviewed-by: Nitesh Shetty Reviewed-by: Christoph Hellwig Signed-off-by: liuxixin Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cad9d9735261..23dfce27ace2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2273,14 +2273,16 @@ static int nvme_query_fdp_granularity(struct nvme_ctrl *ctrl, desc = log; end = log + size - sizeof(*h); for (i = 0; i < fdp_idx; i++) { - log += le16_to_cpu(desc->dsze); - desc = log; - if (log >= end) { + u16 dsze = le16_to_cpu(desc->dsze); + + if (!dsze || log + dsze > end) { dev_warn(ctrl->device, - "FDP invalid config descriptor list\n"); + "FDP invalid config descriptor at index %d\n", i); ret = 0; goto out; } + log += dsze; + desc = log; } if (le32_to_cpu(desc->nrg) > 1) { -- cgit v1.2.3 From 2caaa52c1a440a3951fb098a148d716dada1ecc2 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Sat, 30 May 2026 14:20:44 +0900 Subject: nvme-tcp: move nvme_tcp_reclassify_socket() Move nvme_tcp_reclassify_socket() in tcp.c after the struct nvme_tcp_queue definition. This is preparation for adding a reference to struct nvme_tcp_queue in the function, which would otherwise cause a compile failure due to the struct being defined after the function. Move the entire CONFIG_DEBUG_LOCK_ALLOC block along with the function to maintain the code organization. 
Reviewed-by: Christoph Hellwig Reviewed-by: Nilay Shroff Signed-off-by: Shin'ichiro Kawasaki Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 76 ++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 6241e71130c4..353ac6ce9fbd 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -56,44 +56,6 @@ MODULE_PARM_DESC(tls_handshake_timeout, static atomic_t nvme_tcp_cpu_queues[NR_CPUS]; -#ifdef CONFIG_DEBUG_LOCK_ALLOC -/* lockdep can detect a circular dependency of the form - * sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock - * because dependencies are tracked for both nvme-tcp and user contexts. Using - * a separate class prevents lockdep from conflating nvme-tcp socket use with - * user-space socket API use. - */ -static struct lock_class_key nvme_tcp_sk_key[2]; -static struct lock_class_key nvme_tcp_slock_key[2]; - -static void nvme_tcp_reclassify_socket(struct socket *sock) -{ - struct sock *sk = sock->sk; - - if (WARN_ON_ONCE(!sock_allow_reclassification(sk))) - return; - - switch (sk->sk_family) { - case AF_INET: - sock_lock_init_class_and_name(sk, "slock-AF_INET-NVME", - &nvme_tcp_slock_key[0], - "sk_lock-AF_INET-NVME", - &nvme_tcp_sk_key[0]); - break; - case AF_INET6: - sock_lock_init_class_and_name(sk, "slock-AF_INET6-NVME", - &nvme_tcp_slock_key[1], - "sk_lock-AF_INET6-NVME", - &nvme_tcp_sk_key[1]); - break; - default: - WARN_ON_ONCE(1); - } -} -#else -static void nvme_tcp_reclassify_socket(struct socket *sock) { } -#endif - enum nvme_tcp_send_state { NVME_TCP_SEND_CMD_PDU = 0, NVME_TCP_SEND_H2C_PDU, @@ -207,6 +169,44 @@ static const struct blk_mq_ops nvme_tcp_mq_ops; static const struct blk_mq_ops nvme_tcp_admin_mq_ops; static int nvme_tcp_try_send(struct nvme_tcp_queue *queue); +#ifdef CONFIG_DEBUG_LOCK_ALLOC +/* lockdep can detect a circular dependency of the form + * sk_lock -> mmap_lock (page fault) -> fs locks 
-> sk_lock + * because dependencies are tracked for both nvme-tcp and user contexts. Using + * a separate class prevents lockdep from conflating nvme-tcp socket use with + * user-space socket API use. + */ +static struct lock_class_key nvme_tcp_sk_key[2]; +static struct lock_class_key nvme_tcp_slock_key[2]; + +static void nvme_tcp_reclassify_socket(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (WARN_ON_ONCE(!sock_allow_reclassification(sk))) + return; + + switch (sk->sk_family) { + case AF_INET: + sock_lock_init_class_and_name(sk, "slock-AF_INET-NVME", + &nvme_tcp_slock_key[0], + "sk_lock-AF_INET-NVME", + &nvme_tcp_sk_key[0]); + break; + case AF_INET6: + sock_lock_init_class_and_name(sk, "slock-AF_INET6-NVME", + &nvme_tcp_slock_key[1], + "sk_lock-AF_INET6-NVME", + &nvme_tcp_sk_key[1]); + break; + default: + WARN_ON_ONCE(1); + } +} +#else +static void nvme_tcp_reclassify_socket(struct socket *sock) { } +#endif + static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl) { return container_of(ctrl, struct nvme_tcp_ctrl, ctrl); -- cgit v1.2.3 From 19bdb70c77d3b24239a453291299b64040bdba86 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Thu, 4 Jun 2026 11:32:08 +0900 Subject: nvme-tcp: lockdep: use dynamic lockdep keys per socket instance When NVMe-TCP controller setup and teardown are repeated with lockdep enabled, lockdep reports a false-positive WARN for the following locks: 1) &q->elevator_lock : IO scheduler change context 2) &q->q_usage_counter(io) : SCSI disk probe context 3) fs_reclaim : CPU hotplug bring-up context 4) cpu_hotplug_lock : socket establishment context 5) sk_lock-AF_INET-NVME : MQ sched dispatch context for the socket 6) set->srcu : NVMe controller delete context The lockdep WARN was observed by running blktests test case nvme/005 for tcp transport on v7.1-rc1 kernel with a patch. Refer to the Link tag for the details of the WARN. 
This is a false positive because lockdep confuses lock 4) (socket establishment) with lock 5) (socket in use) for different socket instances. The locks belong to different sockets, but lockdep treats them as the same due to shared static lockdep keys. Fix this by using dynamically allocated lockdep keys per socket instance instead of static keys nvme_tcp_sk_key[] and nvme_tcp_slock_key[]. Add nvme_tcp_sk_key and nvme_tcp_slock_key fields to struct nvme_tcp_queue and pass them to sock_lock_init_class_and_name() for proper lockdep tracking. Change the argument of nvme_tcp_reclassify_socket() from 'struct socket *' to 'struct nvme_tcp_queue *' to pass both the socket and the keys. Add CONFIG_DEBUG_LOCK_ALLOC guards to nvme_tcp_alloc_queue() and nvme_tcp_free_queue() to register and unregister the dynamic keys. Additionally, move nvme_tcp_reclassify_socket() inside these guards since it's only needed when lockdep is enabled. Link: https://lore.kernel.org/linux-nvme/afB5syZbUrppgsDQ@shinmob/ Suggested-by: Nilay Shroff Reviewed-by: Nilay Shroff Signed-off-by: Shin'ichiro Kawasaki Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 353ac6ce9fbd..9d17c88a6200 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -142,6 +142,11 @@ struct nvme_tcp_queue { void (*state_change)(struct sock *); void (*data_ready)(struct sock *); void (*write_space)(struct sock *); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lock_class_key nvme_tcp_sk_key; + struct lock_class_key nvme_tcp_slock_key; +#endif }; struct nvme_tcp_ctrl { @@ -176,12 +181,9 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue); * a separate class prevents lockdep from conflating nvme-tcp socket use with * user-space socket API use. 
*/ -static struct lock_class_key nvme_tcp_sk_key[2]; -static struct lock_class_key nvme_tcp_slock_key[2]; - -static void nvme_tcp_reclassify_socket(struct socket *sock) +static void nvme_tcp_reclassify_socket(struct nvme_tcp_queue *queue) { - struct sock *sk = sock->sk; + struct sock *sk = queue->sock->sk; if (WARN_ON_ONCE(!sock_allow_reclassification(sk))) return; @@ -189,22 +191,20 @@ static void nvme_tcp_reclassify_socket(struct socket *sock) switch (sk->sk_family) { case AF_INET: sock_lock_init_class_and_name(sk, "slock-AF_INET-NVME", - &nvme_tcp_slock_key[0], + &queue->nvme_tcp_slock_key, "sk_lock-AF_INET-NVME", - &nvme_tcp_sk_key[0]); + &queue->nvme_tcp_sk_key); break; case AF_INET6: sock_lock_init_class_and_name(sk, "slock-AF_INET6-NVME", - &nvme_tcp_slock_key[1], + &queue->nvme_tcp_slock_key, "sk_lock-AF_INET6-NVME", - &nvme_tcp_sk_key[1]); + &queue->nvme_tcp_sk_key); break; default: WARN_ON_ONCE(1); } } -#else -static void nvme_tcp_reclassify_socket(struct socket *sock) { } #endif static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl) @@ -1468,6 +1468,11 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) kfree(queue->pdu); mutex_destroy(&queue->send_mutex); mutex_destroy(&queue->queue_lock); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + lockdep_unregister_key(&queue->nvme_tcp_sk_key); + lockdep_unregister_key(&queue->nvme_tcp_slock_key); +#endif } static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) @@ -1813,7 +1818,12 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, } sk_net_refcnt_upgrade(queue->sock->sk); - nvme_tcp_reclassify_socket(queue->sock); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + lockdep_register_key(&queue->nvme_tcp_sk_key); + lockdep_register_key(&queue->nvme_tcp_slock_key); + nvme_tcp_reclassify_socket(queue); +#endif /* Single syn retry */ tcp_sock_set_syncnt(queue->sock->sk, 1); @@ -1918,6 +1928,10 @@ err_sock: /* Use sync variant - see nvme_tcp_free_queue() for explanation */ 
__fput_sync(queue->sock->file); queue->sock = NULL; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + lockdep_unregister_key(&queue->nvme_tcp_sk_key); + lockdep_unregister_key(&queue->nvme_tcp_slock_key); +#endif err_destroy_mutex: mutex_destroy(&queue->send_mutex); mutex_destroy(&queue->queue_lock); -- cgit v1.2.3 From 37afebc79a11bd889fe8e0a98c9ae034c3cff323 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Sun, 17 May 2026 00:06:48 +0530 Subject: nvme: add diag attribute group under sysfs Add a new diag attribute group under: /sys/class/nvme// /sys/block// /sys/block// This new sysfs attribute group will be used to organize NVMe diagnostic and telemetry-related counters under it. Tested-by: Venkat Rao Bagalkote Signed-off-by: Nilay Shroff Signed-off-by: Keith Busch --- drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/pci.c | 1 + drivers/nvme/host/sysfs.c | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 86b09c06b9e0..46cfce4dbbf6 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1014,6 +1014,7 @@ extern const struct attribute_group nvme_ns_mpath_attr_group; extern const struct pr_ops nvme_pr_ops; extern const struct block_device_operations nvme_ns_head_ops; extern const struct attribute_group nvme_dev_attrs_group; +extern const struct attribute_group nvme_dev_diag_attrs_group; extern const struct attribute_group *nvme_subsys_attrs_groups[]; extern const struct attribute_group *nvme_dev_attr_groups[]; extern const struct block_device_operations nvme_bdev_ops; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d20d8722ad96..cf7192239782 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2815,6 +2815,7 @@ static const struct attribute_group nvme_pci_dev_attrs_group = { static const struct attribute_group *nvme_pci_dev_attr_groups[] = { &nvme_dev_attrs_group, &nvme_pci_dev_attrs_group, + &nvme_dev_diag_attrs_group, NULL, }; 
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 1f471f2cfd25..1d507a835783 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -344,11 +344,28 @@ const struct attribute_group nvme_ns_mpath_attr_group = { }; #endif +static struct attribute *nvme_ns_diag_attrs[] = { + NULL, +}; + +static umode_t nvme_ns_diag_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + return a->mode; +} + +const struct attribute_group nvme_ns_diag_attr_group = { + .name = "diag", + .attrs = nvme_ns_diag_attrs, + .is_visible = nvme_ns_diag_attrs_are_visible, +}; + const struct attribute_group *nvme_ns_attr_groups[] = { &nvme_ns_attr_group, #ifdef CONFIG_NVME_MULTIPATH &nvme_ns_mpath_attr_group, #endif + &nvme_ns_diag_attr_group, NULL, }; @@ -1018,11 +1035,29 @@ static const struct attribute_group nvme_tls_attrs_group = { }; #endif +static struct attribute *nvme_dev_diag_attrs[] = { + NULL, +}; + +static umode_t nvme_dev_diag_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + return a->mode; +} + +const struct attribute_group nvme_dev_diag_attrs_group = { + .name = "diag", + .attrs = nvme_dev_diag_attrs, + .is_visible = nvme_dev_diag_attrs_are_visible, +}; +EXPORT_SYMBOL_GPL(nvme_dev_diag_attrs_group); + const struct attribute_group *nvme_dev_attr_groups[] = { &nvme_dev_attrs_group, #ifdef CONFIG_NVME_TCP_TLS &nvme_tls_attrs_group, #endif + &nvme_dev_diag_attrs_group, NULL, }; -- cgit v1.2.3 From ab5af2903baa472930c94a421efdd22a49036213 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Sun, 17 May 2026 00:06:49 +0530 Subject: nvme: export command retry count via sysfs When Advanced Command Retry Enable (ACRE) is configured, a controller may interrupt command execution and return a completion status indicating command interrupted with the DNR bit cleared. In this case, the driver retries the command based on the Command Retry Delay (CRD) value provided in the completion status. 
Currently, these command retries are handled entirely within the NVMe driver and are not visible to userspace. As a result, there is no observability into retry behavior, which can be a useful diagnostic signal. Expose a per-namespace sysfs attribute command_retries_count, under diag attribute group to provide visibility into retry activity. This information can help identify controller-side congestion under load and enables comparison across paths in multipath setups (for example, detecting cases where one path experiences significantly more retries than another under identical workloads). This exported metric is intended for diagnostics and monitoring tools such as nvme-top, and does not change command retry behavior. A new sysfs attribute named "command_retries_count" is added for this purpose. This attribute is both readable as well as writable. So user could reset this counter if needed. Tested-by: Venkat Rao Bagalkote Signed-off-by: Nilay Shroff Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 4 ++++ drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/sysfs.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 23dfce27ace2..cbc2932556c5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -323,6 +323,7 @@ static void nvme_retry_req(struct request *req) { unsigned long delay = 0; u16 crd; + struct nvme_ns *ns = req->q->queuedata; /* The mask and shift result must be <= 3 */ crd = (nvme_req(req)->status & NVME_STATUS_CRD) >> 11; @@ -330,6 +331,9 @@ static void nvme_retry_req(struct request *req) delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100; nvme_req(req)->retries++; + if (ns) + atomic_long_inc(&ns->retries); + blk_mq_requeue_request(req, false); blk_mq_delay_kick_requeue_list(req->q, delay); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 46cfce4dbbf6..3cf95149aa88 100644 --- a/drivers/nvme/host/nvme.h +++ 
b/drivers/nvme/host/nvme.h @@ -592,6 +592,7 @@ struct nvme_ns { enum nvme_ana_state ana_state; u32 ana_grpid; #endif + atomic_long_t retries; struct list_head siblings; struct kref kref; struct nvme_ns_head *head; diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 1d507a835783..9472430934a3 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -344,13 +344,46 @@ const struct attribute_group nvme_ns_mpath_attr_group = { }; #endif +static ssize_t command_retries_count_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + + return sysfs_emit(buf, "%lu\n", atomic_long_read(&ns->retries)); +} + +static ssize_t command_retries_count_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + unsigned long retries; + int err; + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + + err = kstrtoul(buf, 0, &retries); + if (err) + return -EINVAL; + + atomic_long_set(&ns->retries, retries); + + return count; +} +static DEVICE_ATTR_RW(command_retries_count); + static struct attribute *nvme_ns_diag_attrs[] = { + &dev_attr_command_retries_count.attr, NULL, }; static umode_t nvme_ns_diag_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { + struct device *dev = container_of(kobj, struct device, kobj); + + if (a == &dev_attr_command_retries_count.attr) { + if (nvme_disk_is_ns_head(dev_to_disk(dev))) + return 0; + } + return a->mode; } -- cgit v1.2.3 From 66ee95b3d490d78283b6e92cb4230d4a04c99817 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Sun, 17 May 2026 00:06:50 +0530 Subject: nvme: export multipath failover count via sysfs When an NVMe command completes with a path-specific error, the NVMe driver may retry the command on an alternate controller or path if one is available. These failover events indicate that I/O was redirected away from the original path. 
Currently, the number of times requests are failed over to another available path is not visible to userspace. Exposing this information can be useful for diagnosing path health and stability. Export per-path sysfs attribute "multipath_failover_count" under diag attribute group. This attribute is both readable and writable and thus allowing user to reset the counter. This counter can be consumed by monitoring tools such as nvme-top to help identify paths that consistently trigger failovers under load. Tested-by: Venkat Rao Bagalkote Signed-off-by: Nilay Shroff Signed-off-by: Keith Busch --- drivers/nvme/host/multipath.c | 27 +++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 2 ++ drivers/nvme/host/sysfs.c | 10 +++++++++- 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index bd9e8d5a2713..51c8d928fc80 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -152,6 +152,7 @@ void nvme_failover_req(struct request *req) struct bio *bio; nvme_mpath_clear_current_path(ns); + atomic_long_inc(&ns->failover); /* * If we got back an ANA error, we know the controller is alive but not @@ -1165,6 +1166,32 @@ static ssize_t delayed_removal_secs_store(struct device *dev, DEVICE_ATTR_RW(delayed_removal_secs); +static ssize_t multipath_failover_count_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + + return sysfs_emit(buf, "%lu\n", atomic_long_read(&ns->failover)); +} + +static ssize_t multipath_failover_count_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + unsigned long failover; + int ret; + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + + ret = kstrtoul(buf, 0, &failover); + if (ret) + return -EINVAL; + + atomic_long_set(&ns->failover, failover); + + return count; +} + +DEVICE_ATTR_RW(multipath_failover_count); + static int 
nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *desc, void *data) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 3cf95149aa88..73505152fcb1 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -591,6 +591,7 @@ struct nvme_ns { #ifdef CONFIG_NVME_MULTIPATH enum nvme_ana_state ana_state; u32 ana_grpid; + atomic_long_t failover; #endif atomic_long_t retries; struct list_head siblings; @@ -1065,6 +1066,7 @@ extern struct device_attribute dev_attr_ana_state; extern struct device_attribute dev_attr_queue_depth; extern struct device_attribute dev_attr_numa_nodes; extern struct device_attribute dev_attr_delayed_removal_secs; +extern struct device_attribute dev_attr_multipath_failover_count; extern struct device_attribute subsys_attr_iopolicy; static inline bool nvme_disk_is_ns_head(struct gendisk *disk) diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 9472430934a3..0e5033db48a3 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -371,6 +371,9 @@ static DEVICE_ATTR_RW(command_retries_count); static struct attribute *nvme_ns_diag_attrs[] = { &dev_attr_command_retries_count.attr, +#ifdef CONFIG_NVME_MULTIPATH + &dev_attr_multipath_failover_count.attr, +#endif NULL, }; @@ -383,7 +386,12 @@ static umode_t nvme_ns_diag_attrs_are_visible(struct kobject *kobj, if (nvme_disk_is_ns_head(dev_to_disk(dev))) return 0; } - +#ifdef CONFIG_NVME_MULTIPATH + if (a == &dev_attr_multipath_failover_count.attr) { + if (nvme_disk_is_ns_head(dev_to_disk(dev))) + return 0; + } +#endif return a->mode; } -- cgit v1.2.3 From 30ab37a128000600dcaae2b35d4a594e304dfe7e Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Sun, 17 May 2026 00:06:51 +0530 Subject: nvme: export command error counters via sysfs When an NVMe command completes with an error status, the driver logs the error to the kernel log. 
However, these messages may be lost or overwritten over time since dmesg is a circular buffer. Expose per-path and ctrl sysfs attribute command_error_count, under diag attribute group to provide persistent visibility into error occurrences. This allows users to observe the total number of commands that have failed on a given path over time, which can be useful for diagnosing path health and stability. This attribute is both readable and writable thus allowing user to reset these counters. These counters can also be consumed by observability tools such as nvme-top to provide additional insight into NVMe error behavior. Tested-by: Venkat Rao Bagalkote Signed-off-by: Nilay Shroff Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 10 ++++++- drivers/nvme/host/nvme.h | 2 ++ drivers/nvme/host/sysfs.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cbc2932556c5..5f885e0ab930 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -438,11 +438,19 @@ static inline void nvme_end_req_zoned(struct request *req) static inline void __nvme_end_req(struct request *req) { - if (unlikely(nvme_req(req)->status && !(req->rq_flags & RQF_QUIET))) { + struct nvme_ns *ns = req->q->queuedata; + struct nvme_request *nr = nvme_req(req); + + if (unlikely(nr->status && !(req->rq_flags & RQF_QUIET))) { if (blk_rq_is_passthrough(req)) nvme_log_err_passthru(req); else nvme_log_error(req); + + if (ns) + atomic_long_inc(&ns->errors); + else + atomic_long_inc(&nr->ctrl->errors); } nvme_end_req_zoned(req); nvme_trace_bio_complete(req); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 73505152fcb1..f2734f03682f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -415,6 +415,7 @@ struct nvme_ctrl { unsigned long ka_last_check_time; struct work_struct fw_act_work; unsigned long events; + atomic_long_t errors; #ifdef 
CONFIG_NVME_MULTIPATH /* asymmetric namespace access: */ @@ -594,6 +595,7 @@ struct nvme_ns { atomic_long_t failover; #endif atomic_long_t retries; + atomic_long_t errors; struct list_head siblings; struct kref kref; struct nvme_ns_head *head; diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 0e5033db48a3..a03a22c832d8 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -6,6 +6,7 @@ */ #include +#include #include "nvme.h" #include "fabrics.h" @@ -369,8 +370,37 @@ static ssize_t command_retries_count_store(struct device *dev, } static DEVICE_ATTR_RW(command_retries_count); +static ssize_t nvme_io_errors_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + + return sysfs_emit(buf, "%lu\n", atomic_long_read(&ns->errors)); +} + +static ssize_t nvme_io_errors_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + unsigned long errors; + int err; + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + + err = kstrtoul(buf, 0, &errors); + if (err) + return -EINVAL; + + atomic_long_set(&ns->errors, errors); + + return count; +} + +struct device_attribute dev_attr_io_errors = + __ATTR(command_error_count, 0644, + nvme_io_errors_show, nvme_io_errors_store); + static struct attribute *nvme_ns_diag_attrs[] = { &dev_attr_command_retries_count.attr, + &dev_attr_io_errors.attr, #ifdef CONFIG_NVME_MULTIPATH &dev_attr_multipath_failover_count.attr, #endif @@ -386,6 +416,12 @@ static umode_t nvme_ns_diag_attrs_are_visible(struct kobject *kobj, if (nvme_disk_is_ns_head(dev_to_disk(dev))) return 0; } + if (a == &dev_attr_io_errors.attr) { + struct gendisk *disk = dev_to_disk(dev); + + if (nvme_disk_is_ns_head(disk)) + return 0; + } #ifdef CONFIG_NVME_MULTIPATH if (a == &dev_attr_multipath_failover_count.attr) { if (nvme_disk_is_ns_head(dev_to_disk(dev))) @@ -1076,7 +1112,37 @@ static const struct attribute_group nvme_tls_attrs_group = 
{ }; #endif +static ssize_t nvme_adm_errors_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%lu\n", + (unsigned long)atomic_long_read(&ctrl->errors)); +} + +static ssize_t nvme_adm_errors_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + unsigned long errors; + int err; + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + err = kstrtoul(buf, 0, &errors); + if (err) + return -EINVAL; + + atomic_long_set(&ctrl->errors, errors); + + return count; +} + +struct device_attribute dev_attr_adm_errors = + __ATTR(command_error_count, 0644, + nvme_adm_errors_show, nvme_adm_errors_store); + static struct attribute *nvme_dev_diag_attrs[] = { + &dev_attr_adm_errors.attr, NULL, }; -- cgit v1.2.3 From 76b5e1591e8cfa986971d177b5de27ce20ca056a Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Sun, 17 May 2026 00:06:52 +0530 Subject: nvme: export I/O requeue count when no path is usable via sysfs When the NVMe namespace head determines that there is no currently available path to handle I/O (for example, while a controller is resetting/connecting or due to a transient link failure), incoming I/Os are added to the requeue list. Currently, there is no visibility into how many I/Os have been requeued in this situation. Add a new ns-head sysfs counter io_requeue_no_usable_path_count, under diag attribute group to expose the number of I/Os that were requeued due to the absence of an available path. This counter is also writable thus allowing user to reset it, if needed. This statistic can help users understand I/O slowdowns or stalls caused by temporary path unavailability, and can be consumed by monitoring tools such as nvme-top for real-time observability. 
Tested-by: Venkat Rao Bagalkote Signed-off-by: Nilay Shroff Signed-off-by: Keith Busch --- drivers/nvme/host/multipath.c | 30 ++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 2 ++ drivers/nvme/host/sysfs.c | 5 +++++ 3 files changed, 37 insertions(+) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 51c8d928fc80..9021fd44f193 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -538,6 +538,7 @@ static void nvme_ns_head_submit_bio(struct bio *bio) spin_lock_irq(&head->requeue_lock); bio_list_add(&head->requeue_list, bio); spin_unlock_irq(&head->requeue_lock); + atomic_long_inc(&head->io_requeue_no_usable_path_count); } else { dev_warn_ratelimited(dev, "no available path - failing I/O\n"); @@ -1192,6 +1193,35 @@ static ssize_t multipath_failover_count_store(struct device *dev, DEVICE_ATTR_RW(multipath_failover_count); +static ssize_t io_requeue_no_usable_path_count_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + struct nvme_ns_head *head = disk->private_data; + + return sysfs_emit(buf, "%lu\n", + atomic_long_read(&head->io_requeue_no_usable_path_count)); +} + +static ssize_t io_requeue_no_usable_path_count_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long requeue_cnt; + struct gendisk *disk = dev_to_disk(dev); + struct nvme_ns_head *head = disk->private_data; + + err = kstrtoul(buf, 0, &requeue_cnt); + if (err) + return -EINVAL; + + atomic_long_set(&head->io_requeue_no_usable_path_count, requeue_cnt); + + return count; +} + +DEVICE_ATTR_RW(io_requeue_no_usable_path_count); + static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *desc, void *data) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f2734f03682f..bfd427184d69 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -566,6 +566,7 
@@ struct nvme_ns_head { unsigned long flags; struct delayed_work remove_work; unsigned int delayed_removal_secs; + atomic_long_t io_requeue_no_usable_path_count; #define NVME_NSHEAD_DISK_LIVE 0 #define NVME_NSHEAD_QUEUE_IF_NO_PATH 1 struct nvme_ns __rcu *current_path[]; @@ -1069,6 +1070,7 @@ extern struct device_attribute dev_attr_queue_depth; extern struct device_attribute dev_attr_numa_nodes; extern struct device_attribute dev_attr_delayed_removal_secs; extern struct device_attribute dev_attr_multipath_failover_count; +extern struct device_attribute dev_attr_io_requeue_no_usable_path_count; extern struct device_attribute subsys_attr_iopolicy; static inline bool nvme_disk_is_ns_head(struct gendisk *disk) diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index a03a22c832d8..7f0575b7cdd0 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -403,6 +403,7 @@ static struct attribute *nvme_ns_diag_attrs[] = { &dev_attr_io_errors.attr, #ifdef CONFIG_NVME_MULTIPATH &dev_attr_multipath_failover_count.attr, + &dev_attr_io_requeue_no_usable_path_count.attr, #endif NULL, }; @@ -427,6 +428,10 @@ static umode_t nvme_ns_diag_attrs_are_visible(struct kobject *kobj, if (nvme_disk_is_ns_head(dev_to_disk(dev))) return 0; } + if (a == &dev_attr_io_requeue_no_usable_path_count.attr) { + if (!nvme_disk_is_ns_head(dev_to_disk(dev))) + return 0; + } #endif return a->mode; } -- cgit v1.2.3 From a8e434cb033817b29e7ad03e8df43071a1c7e90e Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Sun, 17 May 2026 00:06:53 +0530 Subject: nvme: export I/O failure count when no path is available via sysfs When I/O is submitted to the NVMe namespace head and no available path can handle the request, the driver fails the I/O immediately. Currently, such failures are only reported via kernel log messages, which may be lost over time since dmesg is a circular buffer. 
Add a new ns-head sysfs counter io_fail_no_available_path_count, under diag attribute group to expose the number of I/Os that failed due to the absence of an available path. This provides persistent visibility into path-related I/O failures and can help users diagnose the cause of I/O errors. This counter is also writable and so user may reset its value, if needed. This counter can also be consumed by monitoring tools such as nvme-top. Tested-by: Venkat Rao Bagalkote Signed-off-by: Nilay Shroff Signed-off-by: Keith Busch --- drivers/nvme/host/multipath.c | 30 ++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 2 ++ drivers/nvme/host/sysfs.c | 5 +++++ 3 files changed, 37 insertions(+) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 9021fd44f193..96337ae2b552 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -543,6 +543,7 @@ static void nvme_ns_head_submit_bio(struct bio *bio) dev_warn_ratelimited(dev, "no available path - failing I/O\n"); bio_io_error(bio); + atomic_long_inc(&head->io_fail_no_available_path_count); } srcu_read_unlock(&head->srcu, srcu_idx); @@ -1222,6 +1223,35 @@ static ssize_t io_requeue_no_usable_path_count_store(struct device *dev, DEVICE_ATTR_RW(io_requeue_no_usable_path_count); +static ssize_t io_fail_no_available_path_count_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + struct nvme_ns_head *head = disk->private_data; + + return sysfs_emit(buf, "%lu\n", + atomic_long_read(&head->io_fail_no_available_path_count)); +} + +static ssize_t io_fail_no_available_path_count_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long fail_cnt; + struct gendisk *disk = dev_to_disk(dev); + struct nvme_ns_head *head = disk->private_data; + + err = kstrtoul(buf, 0, &fail_cnt); + if (err) + return -EINVAL; + + 
atomic_long_set(&head->io_fail_no_available_path_count, fail_cnt); + + return count; +} + +DEVICE_ATTR_RW(io_fail_no_available_path_count); + static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *desc, void *data) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index bfd427184d69..249f1f8dde40 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -567,6 +567,7 @@ struct nvme_ns_head { struct delayed_work remove_work; unsigned int delayed_removal_secs; atomic_long_t io_requeue_no_usable_path_count; + atomic_long_t io_fail_no_available_path_count; #define NVME_NSHEAD_DISK_LIVE 0 #define NVME_NSHEAD_QUEUE_IF_NO_PATH 1 struct nvme_ns __rcu *current_path[]; @@ -1071,6 +1072,7 @@ extern struct device_attribute dev_attr_numa_nodes; extern struct device_attribute dev_attr_delayed_removal_secs; extern struct device_attribute dev_attr_multipath_failover_count; extern struct device_attribute dev_attr_io_requeue_no_usable_path_count; +extern struct device_attribute dev_attr_io_fail_no_available_path_count; extern struct device_attribute subsys_attr_iopolicy; static inline bool nvme_disk_is_ns_head(struct gendisk *disk) diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 7f0575b7cdd0..d2c7d943b23f 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -404,6 +404,7 @@ static struct attribute *nvme_ns_diag_attrs[] = { #ifdef CONFIG_NVME_MULTIPATH &dev_attr_multipath_failover_count.attr, &dev_attr_io_requeue_no_usable_path_count.attr, + &dev_attr_io_fail_no_available_path_count.attr, #endif NULL, }; @@ -432,6 +433,10 @@ static umode_t nvme_ns_diag_attrs_are_visible(struct kobject *kobj, if (!nvme_disk_is_ns_head(dev_to_disk(dev))) return 0; } + if (a == &dev_attr_io_fail_no_available_path_count.attr) { + if (!nvme_disk_is_ns_head(dev_to_disk(dev))) + return 0; + } #endif return a->mode; } -- cgit v1.2.3 From 29aafaaf582b342ef3e2182cefd0c2aac6e9f3a8 Mon Sep 17 00:00:00 
2001 From: Nilay Shroff Date: Sun, 17 May 2026 00:06:54 +0530 Subject: nvme: export controller reset event count via sysfs The NVMe controller transitions into the RESETTING state during error recovery, link instability, firmware activation, or when a reset is explicitly triggered by the user. Expose a per-ctrl sysfs attribute reset_count, under diag attribute group to provide visibility into these RESETTING state transitions. Observing the frequency of reset events can help users identify issues such as PCIe errors or unstable fabric links. This counter is also writable thus allowing user to reset its value, if needed. This counter can also be consumed by monitoring tools such as nvme-top to improve controller-level observability. Tested-by: Venkat Rao Bagalkote Signed-off-by: Nilay Shroff Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 1 + drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/sysfs.c | 27 +++++++++++++++++++++++++++ 3 files changed, 29 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5f885e0ab930..efaddab8296e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -596,6 +596,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_NEW: case NVME_CTRL_LIVE: changed = true; + atomic_long_inc(&ctrl->nr_reset); fallthrough; default: break; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 249f1f8dde40..81f297e995e4 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -416,6 +416,7 @@ struct nvme_ctrl { struct work_struct fw_act_work; unsigned long events; atomic_long_t errors; + atomic_long_t nr_reset; #ifdef CONFIG_NVME_MULTIPATH /* asymmetric namespace access: */ diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index d2c7d943b23f..ff603a9d7b8c 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -1151,8 +1151,35 @@ struct device_attribute dev_attr_adm_errors = __ATTR(command_error_count, 0644, 
nvme_adm_errors_show, nvme_adm_errors_store); +static ssize_t reset_count_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%lu\n", atomic_long_read(&ctrl->nr_reset)); +} + +static ssize_t reset_count_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long reset_cnt; + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + err = kstrtoul(buf, 0, &reset_cnt); + if (err) + return -EINVAL; + + atomic_long_set(&ctrl->nr_reset, reset_cnt); + + return count; +} + +static DEVICE_ATTR_RW(reset_count); + static struct attribute *nvme_dev_diag_attrs[] = { &dev_attr_adm_errors.attr, + &dev_attr_reset_count.attr, NULL, }; -- cgit v1.2.3 From 3c8c284dfcdfce81a02fe3c911196d9876468ae4 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Sun, 17 May 2026 00:06:55 +0530 Subject: nvme: export controller reconnect event count via sysfs When an NVMe-oF link goes down, the driver attempts to recover the connection by repeatedly reconnecting to the remote controller at configured intervals. A maximum number of reconnect attempts is also configured, after which recovery stops and the controller is removed if the connection cannot be re-established. The driver maintains a counter, nr_reconnects, which is incremented on each reconnect attempt. However, if the reconnect is successful, this counter is reset to zero. Moreover, currently, this counter is only reported via kernel log messages and is not exposed to userspace. Since dmesg is a circular buffer, this information may be lost over time. So introduce a new accumulator which accumulates nr_reconnects attempts and also expose this accumulator per-fabric ctrl via a new sysfs attribute reconnect_count, under diag attribute group to provide persistent visibility into the number of reconnect attempts made by the host. 
This information can help users diagnose unstable links or connectivity issues. Furthermore, this sysfs attribute is also writable so user may reset it to zero, if needed. The reconnect_count can also be consumed by monitoring tools such as nvme-top to improve controller-level observability. Tested-by: Venkat Rao Bagalkote Signed-off-by: Nilay Shroff Signed-off-by: Keith Busch --- drivers/nvme/host/fc.c | 3 +++ drivers/nvme/host/nvme.h | 2 ++ drivers/nvme/host/rdma.c | 2 ++ drivers/nvme/host/sysfs.c | 35 +++++++++++++++++++++++++++++++++++ drivers/nvme/host/tcp.c | 2 ++ 5 files changed, 44 insertions(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e4f4528fe2a2..f04eb13dd5e9 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3148,6 +3148,8 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) goto out_term_aen_ops; } + /* accumulate reconnect attempts before resetting it to zero */ + atomic_long_add(ctrl->ctrl.nr_reconnects, &ctrl->ctrl.acc_reconnects); ctrl->ctrl.nr_reconnects = 0; nvme_start_ctrl(&ctrl->ctrl); @@ -3470,6 +3472,7 @@ nvme_fc_alloc_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->ctrl.opts = opts; ctrl->ctrl.nr_reconnects = 0; + atomic_long_set(&ctrl->ctrl.acc_reconnects, 0); INIT_LIST_HEAD(&ctrl->ctrl_list); ctrl->lport = lport; ctrl->rport = rport; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 81f297e995e4..b367c67dcb37 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -458,6 +458,8 @@ struct nvme_ctrl { u16 icdoff; u16 maxcmd; int nr_reconnects; + /* accumulate reconnect attempts, as nr_reconnects can reset to zero */ + atomic_long_t acc_reconnects; unsigned long flags; struct nvmf_ctrl_options *opts; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index bf73135c1439..61a91cfb4062 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1110,6 +1110,8 @@ static void nvme_rdma_reconnect_ctrl_work(struct 
work_struct *work) dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n", ctrl->ctrl.nr_reconnects); + /* accumulate reconnect attempts before resetting it to zero */ + atomic_long_add(ctrl->ctrl.nr_reconnects, &ctrl->ctrl.acc_reconnects); ctrl->ctrl.nr_reconnects = 0; return; diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index ff603a9d7b8c..933a5adfb7af 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -1175,17 +1175,52 @@ static ssize_t reset_count_store(struct device *dev, return count; } +static ssize_t reconnect_count_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%lu\n", + atomic_long_read(&ctrl->acc_reconnects) + + ctrl->nr_reconnects); +} + +static ssize_t reconnect_count_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long reconnect_cnt; + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + err = kstrtoul(buf, 0, &reconnect_cnt); + if (err) + return -EINVAL; + + atomic_long_set(&ctrl->acc_reconnects, reconnect_cnt); + + return count; +} + +static DEVICE_ATTR_RW(reconnect_count); + static DEVICE_ATTR_RW(reset_count); static struct attribute *nvme_dev_diag_attrs[] = { &dev_attr_adm_errors.attr, &dev_attr_reset_count.attr, + &dev_attr_reconnect_count.attr, NULL, }; static umode_t nvme_dev_diag_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { + struct device *dev = container_of(kobj, struct device, kobj); + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (a == &dev_attr_reconnect_count.attr && !ctrl->opts) + return 0; + return a->mode; } diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 9d17c88a6200..9b76b77ffdbb 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2489,6 +2489,8 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work) dev_info(ctrl->device, 
"Successfully reconnected (attempt %d/%d)\n", ctrl->nr_reconnects, ctrl->opts->max_reconnects); + /* accumulate reconnect attempts before resetting it to zero */ + atomic_long_add(ctrl->nr_reconnects, &ctrl->acc_reconnects); ctrl->nr_reconnects = 0; return; -- cgit v1.2.3