summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNilay Shroff <nilay@linux.ibm.com>2026-05-16 21:36:51 +0300
committerKeith Busch <kbusch@kernel.org>2026-06-04 11:57:25 +0300
commit30ab37a128000600dcaae2b35d4a594e304dfe7e (patch)
tree820fcd732ff22780136d20516fdf47f12ede866d
parent66ee95b3d490d78283b6e92cb4230d4a04c99817 (diff)
downloadlinux-30ab37a128000600dcaae2b35d4a594e304dfe7e.tar.xz
nvme: export command error counters via sysfs
When an NVMe command completes with an error status, the driver logs the error to the kernel log. However, these messages may be lost or overwritten over time since dmesg is a circular buffer. Expose a per-path and per-controller sysfs attribute, command_error_count, under the diag attribute group to provide persistent visibility into error occurrences. This allows users to observe the total number of commands that have failed on a given path over time, which can be useful for diagnosing path health and stability. The attribute is both readable and writable, allowing users to reset these counters. These counters can also be consumed by observability tools such as nvme-top to provide additional insight into NVMe error behavior. Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com> Signed-off-by: Nilay Shroff <nilay@linux.ibm.com> Signed-off-by: Keith Busch <kbusch@kernel.org>
-rw-r--r--drivers/nvme/host/core.c10
-rw-r--r--drivers/nvme/host/nvme.h2
-rw-r--r--drivers/nvme/host/sysfs.c66
3 files changed, 77 insertions, 1 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index cbc2932556c5..5f885e0ab930 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -438,11 +438,19 @@ static inline void nvme_end_req_zoned(struct request *req)
static inline void __nvme_end_req(struct request *req)
{
- if (unlikely(nvme_req(req)->status && !(req->rq_flags & RQF_QUIET))) {
+ struct nvme_ns *ns = req->q->queuedata;
+ struct nvme_request *nr = nvme_req(req);
+
+ if (unlikely(nr->status && !(req->rq_flags & RQF_QUIET))) {
if (blk_rq_is_passthrough(req))
nvme_log_err_passthru(req);
else
nvme_log_error(req);
+
+ if (ns)
+ atomic_long_inc(&ns->errors);
+ else
+ atomic_long_inc(&nr->ctrl->errors);
}
nvme_end_req_zoned(req);
nvme_trace_bio_complete(req);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 73505152fcb1..f2734f03682f 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -415,6 +415,7 @@ struct nvme_ctrl {
unsigned long ka_last_check_time;
struct work_struct fw_act_work;
unsigned long events;
+ atomic_long_t errors;
#ifdef CONFIG_NVME_MULTIPATH
/* asymmetric namespace access: */
@@ -594,6 +595,7 @@ struct nvme_ns {
atomic_long_t failover;
#endif
atomic_long_t retries;
+ atomic_long_t errors;
struct list_head siblings;
struct kref kref;
struct nvme_ns_head *head;
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index 0e5033db48a3..a03a22c832d8 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -6,6 +6,7 @@
*/
#include <linux/nvme-auth.h>
+#include <linux/blkdev.h>
#include "nvme.h"
#include "fabrics.h"
@@ -369,8 +370,37 @@ static ssize_t command_retries_count_store(struct device *dev,
}
static DEVICE_ATTR_RW(command_retries_count);
+/*
+ * sysfs show handler for the namespace "command_error_count" attribute.
+ *
+ * Emits the current value of ns->errors as an unsigned decimal followed by
+ * a newline. The counter is read as a signed long and converted explicitly
+ * to unsigned long so that the argument type matches the "%lu" format, the
+ * same way the controller-level handler does.
+ */
+static ssize_t nvme_io_errors_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+
+	return sysfs_emit(buf, "%lu\n",
+			(unsigned long)atomic_long_read(&ns->errors));
+}
+
+/*
+ * sysfs store handler for the namespace "command_error_count" attribute.
+ *
+ * Parses @buf with kstrtoul() (base 0, so the base is taken from the
+ * prefix) and overwrites ns->errors with the parsed value. Returns @count
+ * on success, or -EINVAL if the input cannot be parsed.
+ */
+static ssize_t nvme_io_errors_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+	unsigned long new_count;
+
+	if (kstrtoul(buf, 0, &new_count))
+		return -EINVAL;
+
+	atomic_long_set(&ns->errors, new_count);
+	return count;
+}
+
+/*
+ * Namespace-level "command_error_count" attribute (mode 0644). Defined with
+ * __ATTR() under its own variable name because the controller-level
+ * attribute in this file uses the same sysfs name. Only referenced from
+ * this file, so it has internal linkage.
+ */
+static struct device_attribute dev_attr_io_errors =
+	__ATTR(command_error_count, 0644,
+	       nvme_io_errors_show, nvme_io_errors_store);
+
static struct attribute *nvme_ns_diag_attrs[] = {
&dev_attr_command_retries_count.attr,
+ &dev_attr_io_errors.attr,
#ifdef CONFIG_NVME_MULTIPATH
&dev_attr_multipath_failover_count.attr,
#endif
@@ -386,6 +416,12 @@ static umode_t nvme_ns_diag_attrs_are_visible(struct kobject *kobj,
if (nvme_disk_is_ns_head(dev_to_disk(dev)))
return 0;
}
+ if (a == &dev_attr_io_errors.attr) {
+ struct gendisk *disk = dev_to_disk(dev);
+
+ if (nvme_disk_is_ns_head(disk))
+ return 0;
+ }
#ifdef CONFIG_NVME_MULTIPATH
if (a == &dev_attr_multipath_failover_count.attr) {
if (nvme_disk_is_ns_head(dev_to_disk(dev)))
@@ -1076,7 +1112,37 @@ static const struct attribute_group nvme_tls_attrs_group = {
};
#endif
+/*
+ * sysfs show handler for the controller "command_error_count" attribute.
+ *
+ * Reads ctrl->errors from the nvme_ctrl stored as the device's drvdata and
+ * emits it as an unsigned decimal followed by a newline.
+ */
+static ssize_t nvme_adm_errors_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	unsigned long nr_errors = atomic_long_read(&ctrl->errors);
+
+	return sysfs_emit(buf, "%lu\n", nr_errors);
+}
+
+/*
+ * sysfs store handler for the controller "command_error_count" attribute.
+ *
+ * Parses @buf with kstrtoul() (base 0, so the base is taken from the
+ * prefix) and overwrites ctrl->errors with the parsed value. Returns @count
+ * on success, or -EINVAL if the input cannot be parsed.
+ */
+static ssize_t nvme_adm_errors_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	unsigned long new_count;
+
+	if (kstrtoul(buf, 0, &new_count))
+		return -EINVAL;
+
+	atomic_long_set(&ctrl->errors, new_count);
+	return count;
+}
+
+/*
+ * Controller-level "command_error_count" attribute (mode 0644). Defined
+ * with __ATTR() under its own variable name because the namespace-level
+ * attribute in this file uses the same sysfs name. Only referenced from
+ * this file, so it has internal linkage.
+ */
+static struct device_attribute dev_attr_adm_errors =
+	__ATTR(command_error_count, 0644,
+	       nvme_adm_errors_show, nvme_adm_errors_store);
+
+/*
+ * NULL-terminated attribute list for the nvme_dev diag group. Its only
+ * entry here is the command_error_count attribute backed by ctrl->errors.
+ */
static struct attribute *nvme_dev_diag_attrs[] = {
+ &dev_attr_adm_errors.attr,
NULL,
};