summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jens Axboe <axboe@kernel.dk> 2026-06-05 14:18:58 +0300
committer Jens Axboe <axboe@kernel.dk> 2026-06-05 14:18:58 +0300
commit ed60c09f292f1383bbcf79dcf61b6257bbb3a503 (patch)
tree feb17db78c3ee8d8680e75b24eefd56530b0332f
parent 3f1eccd37282de91efd0575ee8e212af4bde39b1 (diff)
parent 3c8c284dfcdfce81a02fe3c911196d9876468ae4 (diff)
download linux-ed60c09f292f1383bbcf79dcf61b6257bbb3a503.tar.xz
Merge tag 'nvme-7.2-2026-06-04' of git://git.infradead.org/nvme into for-7.2/block
Pull NVMe updates from Keith:

"- Per-controller timeouts
 - Multipath telemetry
 - Namespace format validation
 - Various other fixes"

* tag 'nvme-7.2-2026-06-04' of git://git.infradead.org/nvme: (34 commits)
  nvme: export controller reconnect event count via sysfs
  nvme: export controller reset event count via sysfs
  nvme: export I/O failure count when no path is available via sysfs
  nvme: export I/O requeue count when no path is usable via sysfs
  nvme: export command error counters via sysfs
  nvme: export multipath failover count via sysfs
  nvme: export command retry count via sysfs
  nvme: add diag attribute group under sysfs
  nvme-tcp: lockdep: use dynamic lockdep keys per socket instance
  nvme-tcp: move nvme_tcp_reclassify_socket()
  nvme: validate FDP configuration descriptor sizes
  nvmet-auth: validate reply message payload bounds against transfer length
  nvme: refresh multipath head zoned limits from path limits
  nvme: fix FDP fdpcidx bounds check
  nvme-tcp: Use WQ_PERCPU explicitly if wq_unbound is false.
  nvmet: fix pre-auth out-of-bounds heap read in Discovery Get Log Page
  nvme-multipath: set BIO_REMAPPED on bios remapped to per-path namespace disks
  nvme-multipath: require exact iopolicy names for module parameter
  nvme-multipath: pass NS head to nvme_mpath_revalidate_paths()
  nvme-pci: fix out-of-bounds access in nvme_setup_descriptor_pools
  ...
-rw-r--r-- drivers/nvme/host/apple.c 2
-rw-r--r-- drivers/nvme/host/core.c 74
-rw-r--r-- drivers/nvme/host/fc.c 3
-rw-r--r-- drivers/nvme/host/multipath.c 137
-rw-r--r-- drivers/nvme/host/nvme.h 21
-rw-r--r-- drivers/nvme/host/pci.c 14
-rw-r--r-- drivers/nvme/host/rdma.c 4
-rw-r--r-- drivers/nvme/host/sysfs.c 311
-rw-r--r-- drivers/nvme/host/tcp.c 98
-rw-r--r-- drivers/nvme/target/discovery.c 23
-rw-r--r-- drivers/nvme/target/fabrics-cmd-auth.c 15
-rw-r--r-- drivers/nvme/target/loop.c 31
-rw-r--r-- drivers/nvme/target/rdma.c 6
-rw-r--r-- drivers/nvme/target/tcp.c 11
14 files changed, 631 insertions, 119 deletions
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
index 7fc6b9eacf2e..de0d5126458f 100644
--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c
@@ -858,7 +858,7 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown)
* doing a safe shutdown.
*/
if (!dead && shutdown && freeze)
- nvme_wait_freeze_timeout(&anv->ctrl, NVME_IO_TIMEOUT);
+ nvme_wait_freeze_timeout(&anv->ctrl);
nvme_quiesce_io_queues(&anv->ctrl);
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index dc388e24caad..efaddab8296e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -323,6 +323,7 @@ static void nvme_retry_req(struct request *req)
{
unsigned long delay = 0;
u16 crd;
+ struct nvme_ns *ns = req->q->queuedata;
/* The mask and shift result must be <= 3 */
crd = (nvme_req(req)->status & NVME_STATUS_CRD) >> 11;
@@ -330,6 +331,9 @@ static void nvme_retry_req(struct request *req)
delay = nvme_req(req)->ctrl->crdt[crd - 1] * 100;
nvme_req(req)->retries++;
+ if (ns)
+ atomic_long_inc(&ns->retries);
+
blk_mq_requeue_request(req, false);
blk_mq_delay_kick_requeue_list(req->q, delay);
}
@@ -434,11 +438,19 @@ static inline void nvme_end_req_zoned(struct request *req)
static inline void __nvme_end_req(struct request *req)
{
- if (unlikely(nvme_req(req)->status && !(req->rq_flags & RQF_QUIET))) {
+ struct nvme_ns *ns = req->q->queuedata;
+ struct nvme_request *nr = nvme_req(req);
+
+ if (unlikely(nr->status && !(req->rq_flags & RQF_QUIET))) {
if (blk_rq_is_passthrough(req))
nvme_log_err_passthru(req);
else
nvme_log_error(req);
+
+ if (ns)
+ atomic_long_inc(&ns->errors);
+ else
+ atomic_long_inc(&nr->ctrl->errors);
}
nvme_end_req_zoned(req);
nvme_trace_bio_complete(req);
@@ -584,6 +596,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
case NVME_CTRL_NEW:
case NVME_CTRL_LIVE:
changed = true;
+ atomic_long_inc(&ctrl->nr_reset);
fallthrough;
default:
break;
@@ -729,10 +742,8 @@ void nvme_init_request(struct request *req, struct nvme_command *cmd)
struct nvme_ns *ns = req->q->disk->private_data;
logging_enabled = ns->head->passthru_err_log_enabled;
- req->timeout = NVME_IO_TIMEOUT;
} else { /* no queuedata implies admin queue */
logging_enabled = nr->ctrl->passthru_err_log_enabled;
- req->timeout = NVME_ADMIN_TIMEOUT;
}
if (!logging_enabled)
@@ -2263,7 +2274,7 @@ static int nvme_query_fdp_granularity(struct nvme_ctrl *ctrl,
}
n = le16_to_cpu(h->numfdpc) + 1;
- if (fdp_idx > n) {
+ if (fdp_idx >= n) {
dev_warn(ctrl->device, "FDP index:%d out of range:%d\n",
fdp_idx, n);
/* Proceed without registering FDP streams */
@@ -2275,14 +2286,16 @@ static int nvme_query_fdp_granularity(struct nvme_ctrl *ctrl,
desc = log;
end = log + size - sizeof(*h);
for (i = 0; i < fdp_idx; i++) {
- log += le16_to_cpu(desc->dsze);
- desc = log;
- if (log >= end) {
+ u16 dsze = le16_to_cpu(desc->dsze);
+
+ if (!dsze || log + dsze > end) {
dev_warn(ctrl->device,
- "FDP invalid config descriptor list\n");
+ "FDP invalid config descriptor at index %d\n", i);
ret = 0;
goto out;
}
+ log += dsze;
+ desc = log;
}
if (le32_to_cpu(desc->nrg) > 1) {
@@ -2409,12 +2422,22 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
goto out;
}
+ if (id->lbaf[lbaf].ds < SECTOR_SHIFT ||
+ check_shl_overflow(le64_to_cpu(id->nsze),
+ id->lbaf[lbaf].ds - SECTOR_SHIFT,
+ &capacity)) {
+ dev_warn_once(ns->ctrl->device,
+ "invalid LBA data size %u, skipping namespace\n",
+ id->lbaf[lbaf].ds);
+ ret = -ENODEV;
+ goto out;
+ }
+
lim = queue_limits_start_update(ns->disk->queue);
memflags = blk_mq_freeze_queue(ns->disk->queue);
ns->head->lba_shift = id->lbaf[lbaf].ds;
ns->head->nuse = le64_to_cpu(id->nuse);
- capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze));
nvme_set_ctrl_limits(ns->ctrl, &lim, false);
nvme_configure_metadata(ns->ctrl, ns->head, id, nvm, info);
nvme_set_chunk_sectors(ns, id, &lim);
@@ -2483,6 +2506,14 @@ out:
return ret;
}
+static void nvme_stack_zone_resources(struct queue_limits *t,
+ const struct queue_limits *b)
+{
+ t->max_open_zones = min_not_zero(t->max_open_zones, b->max_open_zones);
+ t->max_active_zones =
+ min_not_zero(t->max_active_zones, b->max_active_zones);
+}
+
static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
{
bool unsupported = false;
@@ -2549,6 +2580,8 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
lim.io_opt = ns_lim->io_opt;
queue_limits_stack_bdev(&lim, ns->disk->part0, 0,
ns->head->disk->disk_name);
+ if (lim.features & BLK_FEAT_ZONED)
+ nvme_stack_zone_resources(&lim, ns_lim);
if (unsupported)
ns->head->disk->flags |= GENHD_FL_HIDDEN;
else
@@ -2559,7 +2592,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk));
set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
- nvme_mpath_revalidate_paths(ns);
+ nvme_mpath_revalidate_paths(ns->head);
blk_mq_unfreeze_queue(ns->head->disk->queue, memflags);
}
@@ -3922,7 +3955,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
int ret = -ENOMEM;
#ifdef CONFIG_NVME_MULTIPATH
- size += num_possible_nodes() * sizeof(struct nvme_ns *);
+ size += nr_node_ids * sizeof(struct nvme_ns *);
#endif
head = kzalloc(size, GFP_KERNEL);
@@ -4205,6 +4238,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
mutex_unlock(&ctrl->namespaces_lock);
goto out_unlink_ns;
}
+ blk_queue_rq_timeout(ns->queue, ctrl->io_timeout);
nvme_ns_add_to_ctrl_list(ns);
mutex_unlock(&ctrl->namespaces_lock);
synchronize_srcu(&ctrl->srcu);
@@ -4890,12 +4924,7 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
if (ret)
return ret;
- /*
- * If a previous admin queue exists (e.g., from before a reset),
- * put it now before allocating a new one to avoid orphaning it.
- */
- if (ctrl->admin_q)
- blk_put_queue(ctrl->admin_q);
+ WARN_ON_ONCE(ctrl->admin_q);
ctrl->admin_q = blk_mq_alloc_queue(set, NULL, NULL);
if (IS_ERR(ctrl->admin_q)) {
@@ -4933,10 +4962,8 @@ void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl)
*/
nvme_stop_keep_alive(ctrl);
blk_mq_destroy_queue(ctrl->admin_q);
- if (ctrl->ops->flags & NVME_F_FABRICS) {
+ if (ctrl->fabrics_q)
blk_mq_destroy_queue(ctrl->fabrics_q);
- blk_put_queue(ctrl->fabrics_q);
- }
blk_mq_free_tag_set(ctrl->admin_tagset);
}
EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set);
@@ -5078,6 +5105,8 @@ static void nvme_free_ctrl(struct device *dev)
if (ctrl->admin_q)
blk_put_queue(ctrl->admin_q);
+ if (ctrl->fabrics_q)
+ blk_put_queue(ctrl->fabrics_q);
if (!subsys || ctrl->instance != subsys->instance)
ida_free(&nvme_instance_ida, ctrl->instance);
nvme_free_cels(ctrl);
@@ -5142,6 +5171,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
ctrl->ka_last_check_time = jiffies;
+ ctrl->admin_timeout = NVME_ADMIN_TIMEOUT;
+ ctrl->io_timeout = NVME_IO_TIMEOUT;
BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
PAGE_SIZE);
@@ -5248,8 +5279,9 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_unfreeze);
-int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
+int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl)
{
+ long timeout = ctrl->io_timeout;
struct nvme_ns *ns;
int srcu_idx;
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 1907da499ad2..2c9a6d3c9797 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -3148,6 +3148,8 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
goto out_term_aen_ops;
}
+ /* accumulate reconnect attempts before resetting it to zero */
+ atomic_long_add(ctrl->ctrl.nr_reconnects, &ctrl->ctrl.acc_reconnects);
ctrl->ctrl.nr_reconnects = 0;
nvme_start_ctrl(&ctrl->ctrl);
@@ -3470,6 +3472,7 @@ nvme_fc_alloc_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
ctrl->ctrl.opts = opts;
ctrl->ctrl.nr_reconnects = 0;
+ atomic_long_set(&ctrl->ctrl.acc_reconnects, 0);
INIT_LIST_HEAD(&ctrl->ctrl_list);
ctrl->lport = lport;
ctrl->rport = rport;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index bca8e7c97519..e033ede953cc 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -73,19 +73,29 @@ static const char *nvme_iopolicy_names[] = {
static int iopolicy = NVME_IOPOLICY_NUMA;
+static int nvme_iopolicy_parse(const char *str)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
+ if (sysfs_streq(str, nvme_iopolicy_names[i]))
+ return i;
+ }
+ return -EINVAL;
+}
+
static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
{
+ int policy;
+
if (!val)
return -EINVAL;
- if (!strncmp(val, "numa", 4))
- iopolicy = NVME_IOPOLICY_NUMA;
- else if (!strncmp(val, "round-robin", 11))
- iopolicy = NVME_IOPOLICY_RR;
- else if (!strncmp(val, "queue-depth", 11))
- iopolicy = NVME_IOPOLICY_QD;
- else
- return -EINVAL;
+ policy = nvme_iopolicy_parse(val);
+ if (policy < 0)
+ return policy;
+
+ iopolicy = policy;
return 0;
}
@@ -142,6 +152,7 @@ void nvme_failover_req(struct request *req)
struct bio *bio;
nvme_mpath_clear_current_path(ns);
+ atomic_long_inc(&ns->failover);
/*
* If we got back an ANA error, we know the controller is alive but not
@@ -257,10 +268,10 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
-void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
+void nvme_mpath_revalidate_paths(struct nvme_ns_head *head)
{
- struct nvme_ns_head *head = ns->head;
sector_t capacity = get_capacity(head->disk);
+ struct nvme_ns *ns;
int node;
int srcu_idx;
@@ -514,6 +525,12 @@ static void nvme_ns_head_submit_bio(struct bio *bio)
ns = nvme_find_path(head);
if (likely(ns)) {
bio_set_dev(bio, ns->disk->part0);
+ /*
+ * Use BIO_REMAPPED to skip bio_check_eod() when this bio
+ * enters submit_bio_noacct() for the per-path device. The EOD
+ * check already passed on the multipath head.
+ */
+ bio_set_flag(bio, BIO_REMAPPED);
bio->bi_opf |= REQ_NVME_MPATH;
trace_block_bio_remap(bio, disk_devt(ns->head->disk),
bio->bi_iter.bi_sector);
@@ -524,10 +541,12 @@ static void nvme_ns_head_submit_bio(struct bio *bio)
spin_lock_irq(&head->requeue_lock);
bio_list_add(&head->requeue_list, bio);
spin_unlock_irq(&head->requeue_lock);
+ atomic_long_inc(&head->io_requeue_no_usable_path_count);
} else {
dev_warn_ratelimited(dev, "no available path - failing I/O\n");
bio_io_error(bio);
+ atomic_long_inc(&head->io_fail_no_available_path_count);
}
srcu_read_unlock(&head->srcu, srcu_idx);
@@ -1042,16 +1061,14 @@ static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
{
struct nvme_subsystem *subsys =
container_of(dev, struct nvme_subsystem, dev);
- int i;
+ int policy;
- for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
- if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
- nvme_subsys_iopolicy_update(subsys, i);
- return count;
- }
- }
+ policy = nvme_iopolicy_parse(buf);
+ if (policy < 0)
+ return policy;
- return -EINVAL;
+ nvme_subsys_iopolicy_update(subsys, policy);
+ return count;
}
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
@@ -1154,6 +1171,90 @@ static ssize_t delayed_removal_secs_store(struct device *dev,
DEVICE_ATTR_RW(delayed_removal_secs);
+static ssize_t multipath_failover_count_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+
+ return sysfs_emit(buf, "%lu\n", atomic_long_read(&ns->failover));
+}
+
+static ssize_t multipath_failover_count_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ unsigned long failover;
+ int ret;
+ struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+
+ ret = kstrtoul(buf, 0, &failover);
+ if (ret)
+ return -EINVAL;
+
+ atomic_long_set(&ns->failover, failover);
+
+ return count;
+}
+
+DEVICE_ATTR_RW(multipath_failover_count);
+
+static ssize_t io_requeue_no_usable_path_count_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct nvme_ns_head *head = disk->private_data;
+
+ return sysfs_emit(buf, "%lu\n",
+ atomic_long_read(&head->io_requeue_no_usable_path_count));
+}
+
+static ssize_t io_requeue_no_usable_path_count_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ int err;
+ unsigned long requeue_cnt;
+ struct gendisk *disk = dev_to_disk(dev);
+ struct nvme_ns_head *head = disk->private_data;
+
+ err = kstrtoul(buf, 0, &requeue_cnt);
+ if (err)
+ return -EINVAL;
+
+ atomic_long_set(&head->io_requeue_no_usable_path_count, requeue_cnt);
+
+ return count;
+}
+
+DEVICE_ATTR_RW(io_requeue_no_usable_path_count);
+
+static ssize_t io_fail_no_available_path_count_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct nvme_ns_head *head = disk->private_data;
+
+ return sysfs_emit(buf, "%lu\n",
+ atomic_long_read(&head->io_fail_no_available_path_count));
+}
+
+static ssize_t io_fail_no_available_path_count_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ int err;
+ unsigned long fail_cnt;
+ struct gendisk *disk = dev_to_disk(dev);
+ struct nvme_ns_head *head = disk->private_data;
+
+ err = kstrtoul(buf, 0, &fail_cnt);
+ if (err)
+ return -EINVAL;
+
+ atomic_long_set(&head->io_fail_no_available_path_count, fail_cnt);
+
+ return count;
+}
+
+DEVICE_ATTR_RW(io_fail_no_available_path_count);
+
static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
struct nvme_ana_group_desc *desc, void *data)
{
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ccd5e05dac98..b367c67dcb37 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -370,6 +370,8 @@ struct nvme_ctrl {
u16 mtfa;
u32 ctrl_config;
u32 queue_count;
+ u32 admin_timeout;
+ u32 io_timeout;
u64 cap;
u32 max_hw_sectors;
@@ -413,6 +415,8 @@ struct nvme_ctrl {
unsigned long ka_last_check_time;
struct work_struct fw_act_work;
unsigned long events;
+ atomic_long_t errors;
+ atomic_long_t nr_reset;
#ifdef CONFIG_NVME_MULTIPATH
/* asymmetric namespace access: */
@@ -454,6 +458,8 @@ struct nvme_ctrl {
u16 icdoff;
u16 maxcmd;
int nr_reconnects;
+ /* accumulate reconnect attempts, as nr_reconnects can reset to zero */
+ atomic_long_t acc_reconnects;
unsigned long flags;
struct nvmf_ctrl_options *opts;
@@ -563,6 +569,8 @@ struct nvme_ns_head {
unsigned long flags;
struct delayed_work remove_work;
unsigned int delayed_removal_secs;
+ atomic_long_t io_requeue_no_usable_path_count;
+ atomic_long_t io_fail_no_available_path_count;
#define NVME_NSHEAD_DISK_LIVE 0
#define NVME_NSHEAD_QUEUE_IF_NO_PATH 1
struct nvme_ns __rcu *current_path[];
@@ -589,7 +597,10 @@ struct nvme_ns {
#ifdef CONFIG_NVME_MULTIPATH
enum nvme_ana_state ana_state;
u32 ana_grpid;
+ atomic_long_t failover;
#endif
+ atomic_long_t retries;
+ atomic_long_t errors;
struct list_head siblings;
struct kref kref;
struct nvme_ns_head *head;
@@ -900,7 +911,7 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl);
void nvme_sync_io_queues(struct nvme_ctrl *ctrl);
void nvme_unfreeze(struct nvme_ctrl *ctrl);
void nvme_wait_freeze(struct nvme_ctrl *ctrl);
-int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
+int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl);
void nvme_start_freeze(struct nvme_ctrl *ctrl);
static inline enum req_op nvme_req_op(struct nvme_command *cmd)
@@ -1012,6 +1023,7 @@ extern const struct attribute_group nvme_ns_mpath_attr_group;
extern const struct pr_ops nvme_pr_ops;
extern const struct block_device_operations nvme_ns_head_ops;
extern const struct attribute_group nvme_dev_attrs_group;
+extern const struct attribute_group nvme_dev_diag_attrs_group;
extern const struct attribute_group *nvme_subsys_attrs_groups[];
extern const struct attribute_group *nvme_dev_attr_groups[];
extern const struct block_device_operations nvme_bdev_ops;
@@ -1041,7 +1053,7 @@ void nvme_mpath_update(struct nvme_ctrl *ctrl);
void nvme_mpath_uninit(struct nvme_ctrl *ctrl);
void nvme_mpath_stop(struct nvme_ctrl *ctrl);
bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
-void nvme_mpath_revalidate_paths(struct nvme_ns *ns);
+void nvme_mpath_revalidate_paths(struct nvme_ns_head *head);
void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
void nvme_mpath_remove_disk(struct nvme_ns_head *head);
void nvme_mpath_start_request(struct request *rq);
@@ -1061,6 +1073,9 @@ extern struct device_attribute dev_attr_ana_state;
extern struct device_attribute dev_attr_queue_depth;
extern struct device_attribute dev_attr_numa_nodes;
extern struct device_attribute dev_attr_delayed_removal_secs;
+extern struct device_attribute dev_attr_multipath_failover_count;
+extern struct device_attribute dev_attr_io_requeue_no_usable_path_count;
+extern struct device_attribute dev_attr_io_fail_no_available_path_count;
extern struct device_attribute subsys_attr_iopolicy;
static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
@@ -1106,7 +1121,7 @@ static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
return false;
}
-static inline void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
+static inline void nvme_mpath_revalidate_paths(struct nvme_ns_head *head)
{
}
static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 24911e1252d5..9b9595a5f331 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -587,11 +587,16 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db,
}
static struct nvme_descriptor_pools *
-nvme_setup_descriptor_pools(struct nvme_dev *dev, unsigned numa_node)
+nvme_setup_descriptor_pools(struct nvme_dev *dev, int numa_node)
{
- struct nvme_descriptor_pools *pools = &dev->descriptor_pools[numa_node];
+ struct nvme_descriptor_pools *pools;
size_t small_align = NVME_SMALL_POOL_SIZE;
+ if (numa_node == NUMA_NO_NODE)
+ numa_node = 0;
+
+ pools = &dev->descriptor_pools[numa_node];
+
if (pools->small)
return pools; /* already initialized */
@@ -2810,6 +2815,7 @@ static const struct attribute_group nvme_pci_dev_attrs_group = {
static const struct attribute_group *nvme_pci_dev_attr_groups[] = {
&nvme_dev_attrs_group,
&nvme_pci_dev_attrs_group,
+ &nvme_dev_diag_attrs_group,
NULL,
};
@@ -3094,7 +3100,7 @@ static bool __nvme_delete_io_queues(struct nvme_dev *dev, u8 opcode)
unsigned long timeout;
retry:
- timeout = NVME_ADMIN_TIMEOUT;
+ timeout = dev->ctrl.admin_timeout;
while (nr_queues > 0) {
if (nvme_delete_queue(&dev->queues[nr_queues], opcode))
break;
@@ -3276,7 +3282,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
* if doing a safe shutdown.
*/
if (!dead && shutdown)
- nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
+ nvme_wait_freeze_timeout(&dev->ctrl);
}
nvme_quiesce_io_queues(&dev->ctrl);
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 08459c65c3d5..6909e3542794 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -888,7 +888,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
if (!new) {
nvme_start_freeze(&ctrl->ctrl);
nvme_unquiesce_io_queues(&ctrl->ctrl);
- if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
+ if (!nvme_wait_freeze_timeout(&ctrl->ctrl)) {
/*
* If we timed out waiting for freeze we are likely to
* be stuck. Fail the controller initialization just
@@ -1110,6 +1110,8 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
ctrl->ctrl.nr_reconnects);
+ /* accumulate reconnect attempts before resetting it to zero */
+ atomic_long_add(ctrl->ctrl.nr_reconnects, &ctrl->ctrl.acc_reconnects);
ctrl->ctrl.nr_reconnects = 0;
return;
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index e59758616f27..933a5adfb7af 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -6,6 +6,7 @@
*/
#include <linux/nvme-auth.h>
+#include <linux/blkdev.h>
#include "nvme.h"
#include "fabrics.h"
@@ -335,14 +336,7 @@ static bool multipath_sysfs_group_visible(struct kobject *kobj)
return nvme_disk_is_ns_head(dev_to_disk(dev));
}
-
-static bool multipath_sysfs_attr_visible(struct kobject *kobj,
- struct attribute *attr, int n)
-{
- return false;
-}
-
-DEFINE_SYSFS_GROUP_VISIBLE(multipath_sysfs)
+DEFINE_SIMPLE_SYSFS_GROUP_VISIBLE(multipath_sysfs)
const struct attribute_group nvme_ns_mpath_attr_group = {
.name = "multipath",
@@ -351,11 +345,114 @@ const struct attribute_group nvme_ns_mpath_attr_group = {
};
#endif
+static ssize_t command_retries_count_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+
+ return sysfs_emit(buf, "%lu\n", atomic_long_read(&ns->retries));
+}
+
+static ssize_t command_retries_count_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ unsigned long retries;
+ int err;
+ struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+
+ err = kstrtoul(buf, 0, &retries);
+ if (err)
+ return -EINVAL;
+
+ atomic_long_set(&ns->retries, retries);
+
+ return count;
+}
+static DEVICE_ATTR_RW(command_retries_count);
+
+static ssize_t nvme_io_errors_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+
+ return sysfs_emit(buf, "%lu\n", atomic_long_read(&ns->errors));
+}
+
+static ssize_t nvme_io_errors_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ unsigned long errors;
+ int err;
+ struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+
+ err = kstrtoul(buf, 0, &errors);
+ if (err)
+ return -EINVAL;
+
+ atomic_long_set(&ns->errors, errors);
+
+ return count;
+}
+
+struct device_attribute dev_attr_io_errors =
+ __ATTR(command_error_count, 0644,
+ nvme_io_errors_show, nvme_io_errors_store);
+
+static struct attribute *nvme_ns_diag_attrs[] = {
+ &dev_attr_command_retries_count.attr,
+ &dev_attr_io_errors.attr,
+#ifdef CONFIG_NVME_MULTIPATH
+ &dev_attr_multipath_failover_count.attr,
+ &dev_attr_io_requeue_no_usable_path_count.attr,
+ &dev_attr_io_fail_no_available_path_count.attr,
+#endif
+ NULL,
+};
+
+static umode_t nvme_ns_diag_attrs_are_visible(struct kobject *kobj,
+ struct attribute *a, int n)
+{
+ struct device *dev = container_of(kobj, struct device, kobj);
+
+ if (a == &dev_attr_command_retries_count.attr) {
+ if (nvme_disk_is_ns_head(dev_to_disk(dev)))
+ return 0;
+ }
+ if (a == &dev_attr_io_errors.attr) {
+ struct gendisk *disk = dev_to_disk(dev);
+
+ if (nvme_disk_is_ns_head(disk))
+ return 0;
+ }
+#ifdef CONFIG_NVME_MULTIPATH
+ if (a == &dev_attr_multipath_failover_count.attr) {
+ if (nvme_disk_is_ns_head(dev_to_disk(dev)))
+ return 0;
+ }
+ if (a == &dev_attr_io_requeue_no_usable_path_count.attr) {
+ if (!nvme_disk_is_ns_head(dev_to_disk(dev)))
+ return 0;
+ }
+ if (a == &dev_attr_io_fail_no_available_path_count.attr) {
+ if (!nvme_disk_is_ns_head(dev_to_disk(dev)))
+ return 0;
+ }
+#endif
+ return a->mode;
+}
+
+const struct attribute_group nvme_ns_diag_attr_group = {
+ .name = "diag",
+ .attrs = nvme_ns_diag_attrs,
+ .is_visible = nvme_ns_diag_attrs_are_visible,
+};
+
const struct attribute_group *nvme_ns_attr_groups[] = {
&nvme_ns_attr_group,
#ifdef CONFIG_NVME_MULTIPATH
&nvme_ns_mpath_attr_group,
#endif
+ &nvme_ns_diag_attr_group,
NULL,
};
@@ -623,6 +720,92 @@ static ssize_t quirks_show(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR_RO(quirks);
+static ssize_t nvme_admin_timeout_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+ return sysfs_emit(buf, "%u\n",
+ jiffies_to_msecs(ctrl->admin_timeout));
+}
+
+static ssize_t nvme_admin_timeout_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ u32 timeout;
+ int err;
+
+ /*
+ * Wait until the controller reaches the LIVE state to be sure that
+ * admin_q and fabrics_q are properly initialized.
+ */
+ if (!test_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags))
+ return -EBUSY;
+
+ err = kstrtou32(buf, 10, &timeout);
+ if (err || !timeout)
+ return -EINVAL;
+
+ ctrl->admin_timeout = msecs_to_jiffies(timeout);
+
+ blk_queue_rq_timeout(ctrl->admin_q, ctrl->admin_timeout);
+ if (ctrl->fabrics_q)
+ blk_queue_rq_timeout(ctrl->fabrics_q, ctrl->admin_timeout);
+
+ return count;
+}
+
+static DEVICE_ATTR(admin_timeout, S_IRUGO | S_IWUSR,
+ nvme_admin_timeout_show, nvme_admin_timeout_store);
+
+static ssize_t nvme_io_timeout_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+ return sysfs_emit(buf, "%u\n", jiffies_to_msecs(ctrl->io_timeout));
+}
+
+static ssize_t nvme_io_timeout_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ struct nvme_ns *ns;
+ u32 timeout;
+ int err;
+
+ /*
+ * Wait until the controller reaches the LIVE state to be sure that
+ * connect_q is properly initialized.
+ */
+ if (!test_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags))
+ return -EBUSY;
+
+ err = kstrtou32(buf, 10, &timeout);
+ if (err || !timeout)
+ return -EINVAL;
+
+ /* Take the namespaces_lock to avoid racing against nvme_alloc_ns() */
+ mutex_lock(&ctrl->namespaces_lock);
+
+ ctrl->io_timeout = msecs_to_jiffies(timeout);
+ list_for_each_entry(ns, &ctrl->namespaces, list)
+ blk_queue_rq_timeout(ns->queue, ctrl->io_timeout);
+
+ mutex_unlock(&ctrl->namespaces_lock);
+
+ if (ctrl->connect_q)
+ blk_queue_rq_timeout(ctrl->connect_q, ctrl->io_timeout);
+
+ return count;
+}
+
+static DEVICE_ATTR(io_timeout, S_IRUGO | S_IWUSR,
+ nvme_io_timeout_show, nvme_io_timeout_store);
+
#ifdef CONFIG_NVME_HOST_AUTH
static ssize_t nvme_ctrl_dhchap_secret_show(struct device *dev,
struct device_attribute *attr, char *buf)
@@ -765,6 +948,8 @@ static struct attribute *nvme_dev_attrs[] = {
&dev_attr_cntrltype.attr,
&dev_attr_dctype.attr,
&dev_attr_quirks.attr,
+ &dev_attr_admin_timeout.attr,
+ &dev_attr_io_timeout.attr,
#ifdef CONFIG_NVME_HOST_AUTH
&dev_attr_dhchap_secret.attr,
&dev_attr_dhchap_ctrl_secret.attr,
@@ -937,11 +1122,121 @@ static const struct attribute_group nvme_tls_attrs_group = {
};
#endif
+static ssize_t nvme_adm_errors_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+ return sysfs_emit(buf, "%lu\n",
+ (unsigned long)atomic_long_read(&ctrl->errors));
+}
+
+static ssize_t nvme_adm_errors_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ unsigned long errors;
+ int err;
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+ err = kstrtoul(buf, 0, &errors);
+ if (err)
+ return -EINVAL;
+
+ atomic_long_set(&ctrl->errors, errors);
+
+ return count;
+}
+
+struct device_attribute dev_attr_adm_errors =
+ __ATTR(command_error_count, 0644,
+ nvme_adm_errors_show, nvme_adm_errors_store);
+
+static ssize_t reset_count_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+ return sysfs_emit(buf, "%lu\n", atomic_long_read(&ctrl->nr_reset));
+}
+
+static ssize_t reset_count_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ int err;
+ unsigned long reset_cnt;
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+ err = kstrtoul(buf, 0, &reset_cnt);
+ if (err)
+ return -EINVAL;
+
+ atomic_long_set(&ctrl->nr_reset, reset_cnt);
+
+ return count;
+}
+
+static ssize_t reconnect_count_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+ return sysfs_emit(buf, "%lu\n",
+ atomic_long_read(&ctrl->acc_reconnects) +
+ ctrl->nr_reconnects);
+}
+
+static ssize_t reconnect_count_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ int err;
+ unsigned long reconnect_cnt;
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+ err = kstrtoul(buf, 0, &reconnect_cnt);
+ if (err)
+ return -EINVAL;
+
+ atomic_long_set(&ctrl->acc_reconnects, reconnect_cnt);
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(reconnect_count);
+
+static DEVICE_ATTR_RW(reset_count);
+
+static struct attribute *nvme_dev_diag_attrs[] = {
+ &dev_attr_adm_errors.attr,
+ &dev_attr_reset_count.attr,
+ &dev_attr_reconnect_count.attr,
+ NULL,
+};
+
+static umode_t nvme_dev_diag_attrs_are_visible(struct kobject *kobj,
+ struct attribute *a, int n)
+{
+ struct device *dev = container_of(kobj, struct device, kobj);
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+ if (a == &dev_attr_reconnect_count.attr && !ctrl->opts)
+ return 0;
+
+ return a->mode;
+}
+
+const struct attribute_group nvme_dev_diag_attrs_group = {
+ .name = "diag",
+ .attrs = nvme_dev_diag_attrs,
+ .is_visible = nvme_dev_diag_attrs_are_visible,
+};
+EXPORT_SYMBOL_GPL(nvme_dev_diag_attrs_group);
+
const struct attribute_group *nvme_dev_attr_groups[] = {
&nvme_dev_attrs_group,
#ifdef CONFIG_NVME_TCP_TLS
&nvme_tls_attrs_group,
#endif
+ &nvme_dev_diag_attrs_group,
NULL,
};
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 9313ab211c67..4b81521723f6 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -56,44 +56,6 @@ MODULE_PARM_DESC(tls_handshake_timeout,
static atomic_t nvme_tcp_cpu_queues[NR_CPUS];
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-/* lockdep can detect a circular dependency of the form
- * sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
- * because dependencies are tracked for both nvme-tcp and user contexts. Using
- * a separate class prevents lockdep from conflating nvme-tcp socket use with
- * user-space socket API use.
- */
-static struct lock_class_key nvme_tcp_sk_key[2];
-static struct lock_class_key nvme_tcp_slock_key[2];
-
-static void nvme_tcp_reclassify_socket(struct socket *sock)
-{
- struct sock *sk = sock->sk;
-
- if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
- return;
-
- switch (sk->sk_family) {
- case AF_INET:
- sock_lock_init_class_and_name(sk, "slock-AF_INET-NVME",
- &nvme_tcp_slock_key[0],
- "sk_lock-AF_INET-NVME",
- &nvme_tcp_sk_key[0]);
- break;
- case AF_INET6:
- sock_lock_init_class_and_name(sk, "slock-AF_INET6-NVME",
- &nvme_tcp_slock_key[1],
- "sk_lock-AF_INET6-NVME",
- &nvme_tcp_sk_key[1]);
- break;
- default:
- WARN_ON_ONCE(1);
- }
-}
-#else
-static void nvme_tcp_reclassify_socket(struct socket *sock) { }
-#endif
-
enum nvme_tcp_send_state {
NVME_TCP_SEND_CMD_PDU = 0,
NVME_TCP_SEND_H2C_PDU,
@@ -180,6 +142,11 @@ struct nvme_tcp_queue {
void (*state_change)(struct sock *);
void (*data_ready)(struct sock *);
void (*write_space)(struct sock *);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lock_class_key nvme_tcp_sk_key;
+ struct lock_class_key nvme_tcp_slock_key;
+#endif
};
struct nvme_tcp_ctrl {
@@ -207,6 +174,39 @@ static const struct blk_mq_ops nvme_tcp_mq_ops;
static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/* lockdep can detect a circular dependency of the form
+ * sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
+ * because dependencies are tracked for both nvme-tcp and user contexts. Using
+ * a separate class prevents lockdep from conflating nvme-tcp socket use with
+ * user-space socket API use.
+ *
+ * The lock class keys are members of the queue rather than file-scope
+ * statics, so each queue's socket is assigned its own pair of keys. AF_INET
+ * and AF_INET6 use the same per-queue key pair and differ only in the class
+ * names. The keys are expected to have been registered with
+ * lockdep_register_key() by the caller before this function runs.
+ */
+static void nvme_tcp_reclassify_socket(struct nvme_tcp_queue *queue)
+{
+ struct sock *sk = queue->sock->sk;
+
+ /* Warn once and leave the socket untouched if it may not be reclassified. */
+ if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
+ return;
+
+ switch (sk->sk_family) {
+ case AF_INET:
+ sock_lock_init_class_and_name(sk, "slock-AF_INET-NVME",
+ &queue->nvme_tcp_slock_key,
+ "sk_lock-AF_INET-NVME",
+ &queue->nvme_tcp_sk_key);
+ break;
+ case AF_INET6:
+ sock_lock_init_class_and_name(sk, "slock-AF_INET6-NVME",
+ &queue->nvme_tcp_slock_key,
+ "sk_lock-AF_INET6-NVME",
+ &queue->nvme_tcp_sk_key);
+ break;
+ default:
+ /* No other address family is handled here; warn once. */
+ WARN_ON_ONCE(1);
+ }
+}
+#endif
+
static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
{
return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
@@ -1461,6 +1461,11 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
kfree(queue->pdu);
mutex_destroy(&queue->send_mutex);
mutex_destroy(&queue->queue_lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ lockdep_unregister_key(&queue->nvme_tcp_sk_key);
+ lockdep_unregister_key(&queue->nvme_tcp_slock_key);
+#endif
}
static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
@@ -1806,7 +1811,12 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
}
sk_net_refcnt_upgrade(queue->sock->sk);
- nvme_tcp_reclassify_socket(queue->sock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ lockdep_register_key(&queue->nvme_tcp_sk_key);
+ lockdep_register_key(&queue->nvme_tcp_slock_key);
+ nvme_tcp_reclassify_socket(queue);
+#endif
/* Single syn retry */
tcp_sock_set_syncnt(queue->sock->sk, 1);
@@ -1911,6 +1921,10 @@ err_sock:
/* Use sync variant - see nvme_tcp_free_queue() for explanation */
__fput_sync(queue->sock->file);
queue->sock = NULL;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ lockdep_unregister_key(&queue->nvme_tcp_sk_key);
+ lockdep_unregister_key(&queue->nvme_tcp_slock_key);
+#endif
err_destroy_mutex:
mutex_destroy(&queue->send_mutex);
mutex_destroy(&queue->queue_lock);
@@ -2201,7 +2215,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
if (!new) {
nvme_start_freeze(ctrl);
nvme_unquiesce_io_queues(ctrl);
- if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
+ if (!nvme_wait_freeze_timeout(ctrl)) {
/*
* If we timed out waiting for freeze we are likely to
* be stuck. Fail the controller initialization just
@@ -2468,6 +2482,8 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
dev_info(ctrl->device, "Successfully reconnected (attempt %d/%d)\n",
ctrl->nr_reconnects, ctrl->opts->max_reconnects);
+ /* accumulate the reconnect attempts before resetting the counter to zero */
+ atomic_long_add(ctrl->nr_reconnects, &ctrl->acc_reconnects);
ctrl->nr_reconnects = 0;
return;
@@ -3046,6 +3062,8 @@ static int __init nvme_tcp_init_module(void)
if (wq_unbound)
wq_flags |= WQ_UNBOUND;
+ else
+ wq_flags |= WQ_PERCPU;
nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", wq_flags, 0);
if (!nvme_tcp_wq)
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index e9b35549e254..114869d16a1f 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -166,6 +166,7 @@ static void nvmet_execute_disc_get_log_page(struct nvmet_req *req)
u64 offset = nvmet_get_log_page_offset(req->cmd);
size_t data_len = nvmet_get_log_page_len(req->cmd);
size_t alloc_len;
+ size_t copy_len;
struct nvmet_subsys_link *p;
struct nvmet_port *r;
u32 numrec = 0;
@@ -242,7 +243,27 @@ static void nvmet_execute_disc_get_log_page(struct nvmet_req *req)
up_read(&nvmet_config_sem);
- status = nvmet_copy_to_sgl(req, 0, buffer + offset, data_len);
+ /*
+ * Validate the host-supplied log page offset before copying out.
+ * Without this check, the host controls a 64-bit byte offset into
+ * a small kzalloc'd buffer: a value past the log page lets the
+ * subsequent memcpy read adjacent kernel heap, and a value aimed
+ * at unmapped kernel memory faults the in-kernel copy and crashes
+ * the target host. The Discovery controller is unauthenticated,
+ * so the bug is reachable from any reachable fabric peer.
+ */
+ if (offset > alloc_len) {
+ req->error_loc =
+ offsetof(struct nvme_get_log_page_command, lpo);
+ status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+ goto out_free_buffer;
+ }
+
+ copy_len = min_t(size_t, data_len, alloc_len - offset);
+ status = nvmet_copy_to_sgl(req, 0, buffer + offset, copy_len);
+ if (!status && copy_len < data_len)
+ status = nvmet_zero_sgl(req, copy_len, data_len - copy_len);
+out_free_buffer:
kfree(buffer);
out:
nvmet_req_complete(req, status);
diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c
index f1e613e7c63e..0a85acf1e5c7 100644
--- a/drivers/nvme/target/fabrics-cmd-auth.c
+++ b/drivers/nvme/target/fabrics-cmd-auth.c
@@ -132,13 +132,22 @@ static u8 nvmet_auth_negotiate(struct nvmet_req *req, void *d)
return 0;
}
-static u8 nvmet_auth_reply(struct nvmet_req *req, void *d)
+static u8 nvmet_auth_reply(struct nvmet_req *req, void *d, u32 tl)
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
struct nvmf_auth_dhchap_reply_data *data = d;
- u16 dhvlen = le16_to_cpu(data->dhvlen);
+ u16 dhvlen;
u8 *response;
+ if (tl < sizeof(*data))
+ return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+
+ dhvlen = le16_to_cpu(data->dhvlen);
+
+ /* Validate that hl and dhvlen fit within the transfer length */
+ if (sizeof(*data) + 2 * (size_t)data->hl + dhvlen > tl)
+ return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
+
pr_debug("%s: ctrl %d qid %d: data hl %d cvalid %d dhvlen %u\n",
__func__, ctrl->cntlid, req->sq->qid,
data->hl, data->cvalid, dhvlen);
@@ -338,7 +347,7 @@ void nvmet_execute_auth_send(struct nvmet_req *req)
switch (data->auth_id) {
case NVME_AUTH_DHCHAP_MESSAGE_REPLY:
- dhchap_status = nvmet_auth_reply(req, d);
+ dhchap_status = nvmet_auth_reply(req, d, tl);
if (dhchap_status == 0)
req->sq->dhchap_step =
NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1;
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index ae00bcef2251..fcb1f8186fdd 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -274,7 +274,6 @@ static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl)
nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
nvmet_cq_put(&ctrl->queues[0].nvme_cq);
- nvme_remove_admin_tag_set(&ctrl->ctrl);
}
static void nvme_loop_free_ctrl(struct nvme_ctrl *nctrl)
@@ -375,25 +374,18 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
}
ctrl->ctrl.queue_count = 1;
- error = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set,
- &nvme_loop_admin_mq_ops,
- sizeof(struct nvme_loop_iod) +
- NVME_INLINE_SG_CNT * sizeof(struct scatterlist));
- if (error)
- goto out_free_sq;
-
/* reset stopped state for the fresh admin queue */
clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->ctrl.flags);
error = nvmf_connect_admin_queue(&ctrl->ctrl);
if (error)
- goto out_cleanup_tagset;
+ goto out_free_sq;
set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags);
error = nvme_enable_ctrl(&ctrl->ctrl);
if (error)
- goto out_cleanup_tagset;
+ goto out_free_sq;
ctrl->ctrl.max_hw_sectors =
(NVME_LOOP_MAX_SEGMENTS - 1) << PAGE_SECTORS_SHIFT;
@@ -402,14 +394,12 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
error = nvme_init_ctrl_finish(&ctrl->ctrl, false);
if (error)
- goto out_cleanup_tagset;
+ goto out_free_sq;
return 0;
-out_cleanup_tagset:
- clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags);
- nvme_remove_admin_tag_set(&ctrl->ctrl);
out_free_sq:
+ clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags);
nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
nvmet_cq_put(&ctrl->queues[0].nvme_cq);
return error;
@@ -432,6 +422,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
static void nvme_loop_delete_ctrl_host(struct nvme_ctrl *ctrl)
{
nvme_loop_shutdown_ctrl(to_loop_ctrl(ctrl));
+ nvme_remove_admin_tag_set(ctrl);
}
static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl)
@@ -494,6 +485,7 @@ out_destroy_admin:
nvme_cancel_admin_tagset(&ctrl->ctrl);
nvme_loop_destroy_admin_queue(ctrl);
out_disable:
+ nvme_remove_admin_tag_set(&ctrl->ctrl);
dev_warn(ctrl->ctrl.device, "Removing after reset failure\n");
nvme_uninit_ctrl(&ctrl->ctrl);
}
@@ -594,10 +586,17 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
if (!ctrl->queues)
goto out_uninit_ctrl;
- ret = nvme_loop_configure_admin_queue(ctrl);
+ ret = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set,
+ &nvme_loop_admin_mq_ops,
+ sizeof(struct nvme_loop_iod) +
+ NVME_INLINE_SG_CNT * sizeof(struct scatterlist));
if (ret)
goto out_free_queues;
+ ret = nvme_loop_configure_admin_queue(ctrl);
+ if (ret)
+ goto out_remove_admin_tagset;
+
if (opts->queue_size > ctrl->ctrl.maxcmd) {
/* warn if maxcmd is lower than queue_size */
dev_warn(ctrl->ctrl.device,
@@ -633,6 +632,8 @@ out_remove_admin_queue:
nvme_quiesce_admin_queue(&ctrl->ctrl);
nvme_cancel_admin_tagset(&ctrl->ctrl);
nvme_loop_destroy_admin_queue(ctrl);
+out_remove_admin_tagset:
+ nvme_remove_admin_tag_set(&ctrl->ctrl);
out_free_queues:
kfree(ctrl->queues);
out_uninit_ctrl:
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index e6e2c3f9afdf..ac26f4f774c4 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -1598,8 +1598,10 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
pending++;
}
mutex_unlock(&nvmet_rdma_queue_mutex);
- if (pending > NVMET_RDMA_BACKLOG)
- return NVME_SC_CONNECT_CTRL_BUSY;
+ if (pending > NVMET_RDMA_BACKLOG) {
+ ret = NVME_SC_CONNECT_CTRL_BUSY;
+ goto put_device;
+ }
}
ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 164a564ba3b4..3568fa9a0905 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1842,10 +1842,11 @@ static void nvmet_tcp_tls_handshake_done(void *data, int status,
if (!status)
status = nvmet_tcp_tls_key_lookup(queue, peerid);
+ if (!status)
+ status = nvmet_tcp_set_queue_sock(queue);
+
if (status)
nvmet_tcp_schedule_release_queue(queue);
- else
- nvmet_tcp_set_queue_sock(queue);
kref_put(&queue->kref, nvmet_tcp_release_queue);
}
@@ -1997,6 +1998,12 @@ out_free_connect:
nvmet_tcp_free_cmd(&queue->connect);
out_ida_remove:
ida_free(&nvmet_tcp_queue_ida, queue->idx);
+ /*
+ * Drain the page fragment cache if any allocations were done.
+ * The first allocation using pf_cache is nvmet_tcp_alloc_cmd()
+ * for queue->connect after ida_alloc().
+ */
+ page_frag_cache_drain(&queue->pf_cache);
out_sock:
fput(queue->sock->file);
out_free_queue: