From 1900e1a4495b7376cb9b4e58f1d846660f4c9c4b Mon Sep 17 00:00:00 2001
From: Guixin Liu
Date: Wed, 6 Nov 2024 15:34:45 +0800
Subject: nvme: add reservation command's defines

This is a preparation patch for the NVMeOF target reservation command
implementation. Add the definitions for the reservation commands, such as
the reservation log page and the sub-operations.

Signed-off-by: Guixin Liu
Tested-by: Chaitanya Kulkarni
Reviewed-by: Chaitanya Kulkarni
Reviewed-by: Christoph Hellwig
Signed-off-by: Keith Busch
---
 include/linux/nvme.h | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
(limited to 'include/linux')

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index b58d9405d65e..44d048d68503 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -2037,4 +2037,72 @@ struct nvme_completion {
 #define NVME_MINOR(ver)		(((ver) >> 8) & 0xff)
 #define NVME_TERTIARY(ver)	((ver) & 0xff)

+enum {
+	NVME_AEN_RESV_LOG_PAGE_AVALIABLE	= 0x00,
+};
+
+enum {
+	NVME_PR_LOG_EMPTY_LOG_PAGE		= 0x00,
+	NVME_PR_LOG_REGISTRATION_PREEMPTED	= 0x01,
+	NVME_PR_LOG_RESERVATION_RELEASED	= 0x02,
+	NVME_PR_LOG_RESERVATOPM_PREEMPTED	= 0x03,
+};
+
+enum {
+	NVME_PR_NOTIFY_BIT_REG_PREEMPTED	= 1,
+	NVME_PR_NOTIFY_BIT_RESV_RELEASED	= 2,
+	NVME_PR_NOTIFY_BIT_RESV_PREEMPTED	= 3,
+};
+
+struct nvme_pr_log {
+	__le64	count;
+	__u8	type;
+	__u8	nr_pages;
+	__u8	rsvd1[2];
+	__le32	nsid;
+	__u8	rsvd2[48];
+};
+
+struct nvmet_pr_register_data {
+	__le64	crkey;
+	__le64	nrkey;
+};
+
+struct nvmet_pr_acquire_data {
+	__le64	crkey;
+	__le64	prkey;
+};
+
+struct nvmet_pr_release_data {
+	__le64	crkey;
+};
+
+enum nvme_pr_capabilities {
+	NVME_PR_SUPPORT_PTPL				= 1,
+	NVME_PR_SUPPORT_WRITE_EXCLUSIVE			= 1 << 1,
+	NVME_PR_SUPPORT_EXCLUSIVE_ACCESS		= 1 << 2,
+	NVME_PR_SUPPORT_WRITE_EXCLUSIVE_REG_ONLY	= 1 << 3,
+	NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_REG_ONLY	= 1 << 4,
+	NVME_PR_SUPPORT_WRITE_EXCLUSIVE_ALL_REGS	= 1 << 5,
+	NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_ALL_REGS	= 1 << 6,
+	NVME_PR_SUPPORT_IEKEY_VER_1_3_DEF		= 1 << 7,
+};
+
+enum nvme_pr_register_action {
+	NVME_PR_REGISTER_ACT_REG		= 0,
+	NVME_PR_REGISTER_ACT_UNREG		= 1,
+	NVME_PR_REGISTER_ACT_REPLACE		= 1 << 1,
+};
+
+enum nvme_pr_acquire_action {
+	NVME_PR_ACQUIRE_ACT_ACQUIRE		= 0,
+	NVME_PR_ACQUIRE_ACT_PREEMPT		= 1,
+	NVME_PR_ACQUIRE_ACT_PREEMPT_AND_ABORT	= 1 << 1,
+};
+
+enum nvme_pr_release_action {
+	NVME_PR_RELEASE_ACT_RELEASE	= 0,
+	NVME_PR_RELEASE_ACT_CLEAR	= 1,
+};
+
 #endif /* _LINUX_NVME_H */
-- cgit v1.2.3

From 5a47c2080a7316f184107464e4f76737c0c05186 Mon Sep 17 00:00:00 2001
From: Guixin Liu
Date: Wed, 6 Nov 2024 15:34:46 +0800
Subject: nvmet: support reservation feature

This patch implements the reservation feature, including:
  1. reservation register (register, unregister and replace).
  2. reservation acquire (acquire, preempt, preempt and abort).
  3. reservation release (release and clear).
  4. reservation report.
  5. set feature and get feature of the reservation notification mask.
  6. get log page of reservation events.

Not supported:
  1. persistent reservation through power loss.

Test cases, using nvme-cli and fio to exercise all implemented sub-features:
  1. use nvme resv-register to register the host as a registrant, to
     unregister it, and to replace its key.
  2. use nvme resv-acquire to make the host the reservation holder, and use
     fio to send read and write I/O under every reservation type; also test
     preempt and "preempt and abort".
  3. use nvme resv-report to show all registrants and the reservation status.
  4. use nvme resv-release to release all registrants.
  5.
use nvme get-log to get events generated by the preceding operations. In addition, make reservation configurable, one can set ns to support reservation before enable ns. The default of resv_enable is false. Signed-off-by: Guixin Liu Reviewed-by: Dmitry Bogdanov Reviewed-by: Christoph Hellwig Tested-by: Chaitanya Kulkarni Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/target/Makefile | 2 +- drivers/nvme/target/admin-cmd.c | 24 +- drivers/nvme/target/configfs.c | 27 + drivers/nvme/target/core.c | 62 +- drivers/nvme/target/fabrics-cmd.c | 4 +- drivers/nvme/target/nvmet.h | 65 ++- drivers/nvme/target/pr.c | 1156 +++++++++++++++++++++++++++++++++++++ include/linux/nvme.h | 2 +- 8 files changed, 1329 insertions(+), 13 deletions(-) create mode 100644 drivers/nvme/target/pr.c (limited to 'include/linux') diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile index c402c44350b2..f2b025bbe10c 100644 --- a/drivers/nvme/target/Makefile +++ b/drivers/nvme/target/Makefile @@ -10,7 +10,7 @@ obj-$(CONFIG_NVME_TARGET_FCLOOP) += nvme-fcloop.o obj-$(CONFIG_NVME_TARGET_TCP) += nvmet-tcp.o nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \ - discovery.o io-cmd-file.o io-cmd-bdev.o + discovery.o io-cmd-file.o io-cmd-bdev.o pr.o nvmet-$(CONFIG_NVME_TARGET_DEBUGFS) += debugfs.o nvmet-$(CONFIG_NVME_TARGET_PASSTHRU) += passthru.o nvmet-$(CONFIG_BLK_DEV_ZONED) += zns.o diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 081f0473cd9e..19428745c795 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -176,6 +176,10 @@ static void nvmet_get_cmd_effects_nvm(struct nvme_effects_log *log) log->iocs[nvme_cmd_read] = log->iocs[nvme_cmd_flush] = log->iocs[nvme_cmd_dsm] = + log->iocs[nvme_cmd_resv_acquire] = + log->iocs[nvme_cmd_resv_register] = + log->iocs[nvme_cmd_resv_release] = + log->iocs[nvme_cmd_resv_report] = cpu_to_le32(NVME_CMD_EFFECTS_CSUPP); log->iocs[nvme_cmd_write] = log->iocs[nvme_cmd_write_zeroes] = @@ -340,6 +344,8 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) return nvmet_execute_get_log_cmd_effects_ns(req); case NVME_LOG_ANA: return nvmet_execute_get_log_page_ana(req); + case NVME_LOG_RESERVATION: + return nvmet_execute_get_log_page_resv(req); } pr_debug("unhandled lid %d on qid %d\n", req->cmd->get_log_page.lid, req->sq->qid); @@ -433,7 +439,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->nn = cpu_to_le32(NVMET_MAX_NAMESPACES); id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES); id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM | - NVME_CTRL_ONCS_WRITE_ZEROES); + NVME_CTRL_ONCS_WRITE_ZEROES | + NVME_CTRL_ONCS_RESERVATIONS); /* XXX: don't report vwc if the underlying device is write through */ id->vwc = NVME_CTRL_VWC_PRESENT; @@ -551,6 +558,15 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) id->nmic = NVME_NS_NMIC_SHARED; id->anagrpid = cpu_to_le32(req->ns->anagrpid); + if (req->ns->pr.enable) + id->rescap = NVME_PR_SUPPORT_WRITE_EXCLUSIVE | + NVME_PR_SUPPORT_EXCLUSIVE_ACCESS | + NVME_PR_SUPPORT_WRITE_EXCLUSIVE_REG_ONLY | + NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_REG_ONLY | + NVME_PR_SUPPORT_WRITE_EXCLUSIVE_ALL_REGS | + NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_ALL_REGS | + NVME_PR_SUPPORT_IEKEY_VER_1_3_DEF; + memcpy(&id->nguid, &req->ns->nguid, sizeof(id->nguid)); id->lbaf[0].ds = req->ns->blksize_shift; @@ -861,6 +877,9 @@ void nvmet_execute_set_features(struct nvmet_req *req) case NVME_FEAT_WRITE_PROTECT: status = nvmet_set_feat_write_protect(req); break; + 
case NVME_FEAT_RESV_MASK: + status = nvmet_set_feat_resv_notif_mask(req, cdw11); + break; default: req->error_loc = offsetof(struct nvme_common_command, cdw10); status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; @@ -959,6 +978,9 @@ void nvmet_execute_get_features(struct nvmet_req *req) case NVME_FEAT_WRITE_PROTECT: status = nvmet_get_feat_write_protect(req); break; + case NVME_FEAT_RESV_MASK: + status = nvmet_get_feat_resv_notif_mask(req); + break; default: req->error_loc = offsetof(struct nvme_common_command, cdw10); diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 685e89b35d33..eeee9e9b854c 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -769,6 +769,32 @@ static ssize_t nvmet_ns_revalidate_size_store(struct config_item *item, CONFIGFS_ATTR_WO(nvmet_ns_, revalidate_size); +static ssize_t nvmet_ns_resv_enable_show(struct config_item *item, char *page) +{ + return sysfs_emit(page, "%d\n", to_nvmet_ns(item)->pr.enable); +} + +static ssize_t nvmet_ns_resv_enable_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + bool val; + + if (kstrtobool(page, &val)) + return -EINVAL; + + mutex_lock(&ns->subsys->lock); + if (ns->enabled) { + pr_err("the ns:%d is already enabled.\n", ns->nsid); + mutex_unlock(&ns->subsys->lock); + return -EINVAL; + } + ns->pr.enable = val; + mutex_unlock(&ns->subsys->lock); + return count; +} +CONFIGFS_ATTR(nvmet_ns_, resv_enable); + static struct configfs_attribute *nvmet_ns_attrs[] = { &nvmet_ns_attr_device_path, &nvmet_ns_attr_device_nguid, @@ -777,6 +803,7 @@ static struct configfs_attribute *nvmet_ns_attrs[] = { &nvmet_ns_attr_enable, &nvmet_ns_attr_buffered_io, &nvmet_ns_attr_revalidate_size, + &nvmet_ns_attr_resv_enable, #ifdef CONFIG_PCI_P2PDMA &nvmet_ns_attr_p2pmem, #endif diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 15b25f464e77..1f4e9989663b 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -611,6 +611,12 @@ int nvmet_ns_enable(struct nvmet_ns *ns) if (ret) goto out_restore_subsys_maxnsid; + if (ns->pr.enable) { + ret = nvmet_pr_init_ns(ns); + if (ret) + goto out_remove_from_subsys; + } + subsys->nr_namespaces++; nvmet_ns_changed(subsys, ns->nsid); @@ -620,6 +626,8 @@ out_unlock: mutex_unlock(&subsys->lock); return ret; +out_remove_from_subsys: + xa_erase(&subsys->namespaces, ns->nsid); out_restore_subsys_maxnsid: subsys->max_nsid = nvmet_max_nsid(subsys); percpu_ref_exit(&ns->ref); @@ -663,6 +671,9 @@ void nvmet_ns_disable(struct nvmet_ns *ns) wait_for_completion(&ns->disable_done); percpu_ref_exit(&ns->ref); + if (ns->pr.enable) + nvmet_pr_exit_ns(ns); + mutex_lock(&subsys->lock); subsys->nr_namespaces--; @@ -754,6 +765,7 @@ static void nvmet_set_error(struct nvmet_req *req, u16 status) static void __nvmet_req_complete(struct nvmet_req *req, u16 status) { struct nvmet_ns *ns = req->ns; + struct nvmet_pr_per_ctrl_ref *pc_ref = req->pc_ref; if (!req->sq->sqhd_disabled) nvmet_update_sq_head(req); @@ -766,6 +778,9 @@ static void __nvmet_req_complete(struct nvmet_req *req, u16 status) trace_nvmet_req_complete(req); req->ops->queue_response(req); + + if (pc_ref) + nvmet_pr_put_ns_pc_ref(pc_ref); if (ns) nvmet_put_namespace(ns); } @@ -929,18 +944,39 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req) return ret; } + if (req->ns->pr.enable) { + ret = nvmet_parse_pr_cmd(req); + if (!ret) + return ret; + } + switch (req->ns->csi) { case NVME_CSI_NVM: if (req->ns->file) - return 
nvmet_file_parse_io_cmd(req); - return nvmet_bdev_parse_io_cmd(req); + ret = nvmet_file_parse_io_cmd(req); + else + ret = nvmet_bdev_parse_io_cmd(req); + break; case NVME_CSI_ZNS: if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) - return nvmet_bdev_zns_parse_io_cmd(req); - return NVME_SC_INVALID_IO_CMD_SET; + ret = nvmet_bdev_zns_parse_io_cmd(req); + else + ret = NVME_SC_INVALID_IO_CMD_SET; + break; default: - return NVME_SC_INVALID_IO_CMD_SET; + ret = NVME_SC_INVALID_IO_CMD_SET; } + if (ret) + return ret; + + if (req->ns->pr.enable) { + ret = nvmet_pr_check_cmd_access(req); + if (ret) + return ret; + + ret = nvmet_pr_get_ns_pc_ref(req); + } + return ret; } bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, @@ -964,6 +1000,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, req->ns = NULL; req->error_loc = NVMET_NO_ERROR_LOC; req->error_slba = 0; + req->pc_ref = NULL; /* no support for fused commands yet */ if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { @@ -1015,6 +1052,8 @@ EXPORT_SYMBOL_GPL(nvmet_req_init); void nvmet_req_uninit(struct nvmet_req *req) { percpu_ref_put(&req->sq->ref); + if (req->pc_ref) + nvmet_pr_put_ns_pc_ref(req->pc_ref); if (req->ns) nvmet_put_namespace(req->ns); } @@ -1383,7 +1422,8 @@ static void nvmet_fatal_error_handler(struct work_struct *work) } u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, - struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp) + struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp, + uuid_t *hostid) { struct nvmet_subsys *subsys; struct nvmet_ctrl *ctrl; @@ -1462,6 +1502,8 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, } ctrl->cntlid = ret; + uuid_copy(&ctrl->hostid, hostid); + /* * Discovery controllers may use some arbitrary high value * in order to cleanup stale discovery sessions @@ -1478,6 +1520,9 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, nvmet_start_keep_alive_timer(ctrl); mutex_lock(&subsys->lock); + ret = nvmet_ctrl_init_pr(ctrl); + if (ret) + goto init_pr_fail; list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); nvmet_setup_p2p_ns_map(ctrl, req); nvmet_debugfs_ctrl_setup(ctrl); @@ -1486,6 +1531,10 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, *ctrlp = ctrl; return 0; +init_pr_fail: + mutex_unlock(&subsys->lock); + nvmet_stop_keep_alive_timer(ctrl); + ida_free(&cntlid_ida, ctrl->cntlid); out_free_sqs: kfree(ctrl->sqs); out_free_changed_ns_list: @@ -1504,6 +1553,7 @@ static void nvmet_ctrl_free(struct kref *ref) struct nvmet_subsys *subsys = ctrl->subsys; mutex_lock(&subsys->lock); + nvmet_ctrl_destroy_pr(ctrl); nvmet_release_p2p_ns_map(ctrl); list_del(&ctrl->subsys_entry); mutex_unlock(&subsys->lock); diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index c4b2eddd5666..28a84af1b4c0 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -245,12 +245,10 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) d->subsysnqn[NVMF_NQN_FIELD_LEN - 1] = '\0'; d->hostnqn[NVMF_NQN_FIELD_LEN - 1] = '\0'; status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req, - le32_to_cpu(c->kato), &ctrl); + le32_to_cpu(c->kato), &ctrl, &d->hostid); if (status) goto out; - uuid_copy(&ctrl->hostid, &d->hostid); - dhchap_status = nvmet_setup_auth(ctrl); if (dhchap_status) { pr_err("Failed to setup authentication, dhchap status %u\n", diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 190f55e6d753..ec379814b16c 100644 --- 
a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -20,6 +20,7 @@ #include #include #include +#include #define NVMET_DEFAULT_VS NVME_VS(1, 3, 0) @@ -30,6 +31,7 @@ #define NVMET_MN_MAX_SIZE 40 #define NVMET_SN_MAX_SIZE 20 #define NVMET_FR_MAX_SIZE 8 +#define NVMET_PR_LOG_QUEUE_SIZE 64 /* * Supported optional AENs: @@ -56,6 +58,38 @@ #define IPO_IATTR_CONNECT_SQE(x) \ (cpu_to_le32(offsetof(struct nvmf_connect_command, x))) +struct nvmet_pr_registrant { + u64 rkey; + uuid_t hostid; + enum nvme_pr_type rtype; + struct list_head entry; + struct rcu_head rcu; +}; + +struct nvmet_pr { + bool enable; + unsigned long notify_mask; + atomic_t generation; + struct nvmet_pr_registrant __rcu *holder; + /* + * During the execution of the reservation command, mutual + * exclusion is required throughout the process. However, + * while waiting asynchronously for the 'per controller + * percpu_ref' to complete before the 'preempt and abort' + * command finishes, a semaphore is needed to ensure mutual + * exclusion instead of a mutex. + */ + struct semaphore pr_sem; + struct list_head registrant_list; +}; + +struct nvmet_pr_per_ctrl_ref { + struct percpu_ref ref; + struct completion free_done; + struct completion confirm_done; + uuid_t hostid; +}; + struct nvmet_ns { struct percpu_ref ref; struct file *bdev_file; @@ -85,6 +119,8 @@ struct nvmet_ns { int pi_type; int metadata_size; u8 csi; + struct nvmet_pr pr; + struct xarray pr_per_ctrl_refs; }; static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item) @@ -191,6 +227,13 @@ static inline bool nvmet_port_secure_channel_required(struct nvmet_port *port) return nvmet_port_disc_addr_treq_secure_channel(port) == NVMF_TREQ_REQUIRED; } +struct nvmet_pr_log_mgr { + struct mutex lock; + u64 lost_count; + u64 counter; + DECLARE_KFIFO(log_queue, struct nvme_pr_log, NVMET_PR_LOG_QUEUE_SIZE); +}; + struct nvmet_ctrl { struct nvmet_subsys *subsys; struct nvmet_sq **sqs; @@ -246,6 +289,7 @@ struct nvmet_ctrl { u8 *dh_key; size_t dh_keysize; #endif + struct nvmet_pr_log_mgr pr_log_mgr; }; struct nvmet_subsys { @@ -396,6 +440,9 @@ struct nvmet_req { struct work_struct zmgmt_work; } z; #endif /* CONFIG_BLK_DEV_ZONED */ + struct { + struct work_struct abort_work; + } r; }; int sg_cnt; int metadata_sg_cnt; @@ -412,6 +459,7 @@ struct nvmet_req { struct device *p2p_client; u16 error_loc; u64 error_slba; + struct nvmet_pr_per_ctrl_ref *pc_ref; }; #define NVMET_MAX_MPOOL_BVEC 16 @@ -498,7 +546,8 @@ void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl); void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new); u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, - struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp); + struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp, + uuid_t *hostid); struct nvmet_ctrl *nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid, struct nvmet_req *req); @@ -761,4 +810,18 @@ static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl) static inline const char *nvmet_dhchap_dhgroup_name(u8 dhgid) { return NULL; } #endif +int nvmet_pr_init_ns(struct nvmet_ns *ns); +u16 nvmet_parse_pr_cmd(struct nvmet_req *req); +u16 nvmet_pr_check_cmd_access(struct nvmet_req *req); +int nvmet_ctrl_init_pr(struct nvmet_ctrl *ctrl); +void nvmet_ctrl_destroy_pr(struct nvmet_ctrl *ctrl); +void nvmet_pr_exit_ns(struct nvmet_ns *ns); +void nvmet_execute_get_log_page_resv(struct nvmet_req *req); +u16 nvmet_set_feat_resv_notif_mask(struct nvmet_req *req, u32 mask); +u16 nvmet_get_feat_resv_notif_mask(struct 
nvmet_req *req); +u16 nvmet_pr_get_ns_pc_ref(struct nvmet_req *req); +static inline void nvmet_pr_put_ns_pc_ref(struct nvmet_pr_per_ctrl_ref *pc_ref) +{ + percpu_ref_put(&pc_ref->ref); +} #endif /* _NVMET_H */ diff --git a/drivers/nvme/target/pr.c b/drivers/nvme/target/pr.c new file mode 100644 index 000000000000..25a02b50d9f3 --- /dev/null +++ b/drivers/nvme/target/pr.c @@ -0,0 +1,1156 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe over Fabrics Persist Reservation. + * Copyright (c) 2024 Guixin Liu, Alibaba Group. + * All rights reserved. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include "nvmet.h" + +#define NVMET_PR_NOTIFI_MASK_ALL \ + (1 << NVME_PR_NOTIFY_BIT_REG_PREEMPTED | \ + 1 << NVME_PR_NOTIFY_BIT_RESV_RELEASED | \ + 1 << NVME_PR_NOTIFY_BIT_RESV_PREEMPTED) + +static inline bool nvmet_pr_parse_ignore_key(u32 cdw10) +{ + /* Ignore existing key, bit 03. */ + return (cdw10 >> 3) & 1; +} + +static inline struct nvmet_ns *nvmet_pr_to_ns(struct nvmet_pr *pr) +{ + return container_of(pr, struct nvmet_ns, pr); +} + +static struct nvmet_pr_registrant * +nvmet_pr_find_registrant(struct nvmet_pr *pr, uuid_t *hostid) +{ + struct nvmet_pr_registrant *reg; + + list_for_each_entry_rcu(reg, &pr->registrant_list, entry) { + if (uuid_equal(®->hostid, hostid)) + return reg; + } + return NULL; +} + +u16 nvmet_set_feat_resv_notif_mask(struct nvmet_req *req, u32 mask) +{ + u32 nsid = le32_to_cpu(req->cmd->common.nsid); + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_ns *ns; + unsigned long idx; + u16 status; + + if (mask & ~(NVMET_PR_NOTIFI_MASK_ALL)) { + req->error_loc = offsetof(struct nvme_common_command, cdw11); + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + } + + if (nsid != U32_MAX) { + status = nvmet_req_find_ns(req); + if (status) + return status; + if (!req->ns->pr.enable) + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + + WRITE_ONCE(req->ns->pr.notify_mask, mask); + goto success; + } + + xa_for_each(&ctrl->subsys->namespaces, idx, ns) { + if (ns->pr.enable) + WRITE_ONCE(ns->pr.notify_mask, mask); + } + +success: + nvmet_set_result(req, mask); + return NVME_SC_SUCCESS; +} + +u16 nvmet_get_feat_resv_notif_mask(struct nvmet_req *req) +{ + u16 status; + + status = nvmet_req_find_ns(req); + if (status) + return status; + + if (!req->ns->pr.enable) + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + + nvmet_set_result(req, READ_ONCE(req->ns->pr.notify_mask)); + return status; +} + +void nvmet_execute_get_log_page_resv(struct nvmet_req *req) +{ + struct nvmet_pr_log_mgr *log_mgr = &req->sq->ctrl->pr_log_mgr; + struct nvme_pr_log next_log = {0}; + struct nvme_pr_log log = {0}; + u16 status = NVME_SC_SUCCESS; + u64 lost_count; + u64 cur_count; + u64 next_count; + + mutex_lock(&log_mgr->lock); + if (!kfifo_get(&log_mgr->log_queue, &log)) + goto out; + + /* + * We can't get the last in kfifo. + * Utilize the current count and the count from the next log to + * calculate the number of lost logs, while also addressing cases + * of overflow. If there is no subsequent log, the number of lost + * logs is equal to the lost_count within the nvmet_pr_log_mgr. + */ + cur_count = le64_to_cpu(log.count); + if (kfifo_peek(&log_mgr->log_queue, &next_log)) { + next_count = le64_to_cpu(next_log.count); + if (next_count > cur_count) + lost_count = next_count - cur_count - 1; + else + lost_count = U64_MAX - cur_count + next_count - 1; + } else { + lost_count = log_mgr->lost_count; + } + + log.count = cpu_to_le64((cur_count + lost_count) == 0 ? 
+ 1 : (cur_count + lost_count)); + log_mgr->lost_count -= lost_count; + + log.nr_pages = kfifo_len(&log_mgr->log_queue); + +out: + status = nvmet_copy_to_sgl(req, 0, &log, sizeof(log)); + mutex_unlock(&log_mgr->lock); + nvmet_req_complete(req, status); +} + +static void nvmet_pr_add_resv_log(struct nvmet_ctrl *ctrl, u8 log_type, + u32 nsid) +{ + struct nvmet_pr_log_mgr *log_mgr = &ctrl->pr_log_mgr; + struct nvme_pr_log log = {0}; + + mutex_lock(&log_mgr->lock); + log_mgr->counter++; + if (log_mgr->counter == 0) + log_mgr->counter = 1; + + log.count = cpu_to_le64(log_mgr->counter); + log.type = log_type; + log.nsid = cpu_to_le32(nsid); + + if (!kfifo_put(&log_mgr->log_queue, log)) { + pr_info("a reservation log lost, cntlid:%d, log_type:%d, nsid:%d\n", + ctrl->cntlid, log_type, nsid); + log_mgr->lost_count++; + } + + mutex_unlock(&log_mgr->lock); +} + +static void nvmet_pr_resv_released(struct nvmet_pr *pr, uuid_t *hostid) +{ + struct nvmet_ns *ns = nvmet_pr_to_ns(pr); + struct nvmet_subsys *subsys = ns->subsys; + struct nvmet_ctrl *ctrl; + + if (test_bit(NVME_PR_NOTIFY_BIT_RESV_RELEASED, &pr->notify_mask)) + return; + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + if (!uuid_equal(&ctrl->hostid, hostid) && + nvmet_pr_find_registrant(pr, &ctrl->hostid)) { + nvmet_pr_add_resv_log(ctrl, + NVME_PR_LOG_RESERVATION_RELEASED, ns->nsid); + nvmet_add_async_event(ctrl, NVME_AER_CSS, + NVME_AEN_RESV_LOG_PAGE_AVALIABLE, + NVME_LOG_RESERVATION); + } + } + mutex_unlock(&subsys->lock); +} + +static void nvmet_pr_send_event_to_host(struct nvmet_pr *pr, uuid_t *hostid, + u8 log_type) +{ + struct nvmet_ns *ns = nvmet_pr_to_ns(pr); + struct nvmet_subsys *subsys = ns->subsys; + struct nvmet_ctrl *ctrl; + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + if (uuid_equal(hostid, &ctrl->hostid)) { + nvmet_pr_add_resv_log(ctrl, log_type, ns->nsid); + nvmet_add_async_event(ctrl, NVME_AER_CSS, + NVME_AEN_RESV_LOG_PAGE_AVALIABLE, + NVME_LOG_RESERVATION); + } + } + mutex_unlock(&subsys->lock); +} + +static void nvmet_pr_resv_preempted(struct nvmet_pr *pr, uuid_t *hostid) +{ + if (test_bit(NVME_PR_NOTIFY_BIT_RESV_PREEMPTED, &pr->notify_mask)) + return; + + nvmet_pr_send_event_to_host(pr, hostid, + NVME_PR_LOG_RESERVATOIN_PREEMPTED); +} + +static void nvmet_pr_registration_preempted(struct nvmet_pr *pr, + uuid_t *hostid) +{ + if (test_bit(NVME_PR_NOTIFY_BIT_REG_PREEMPTED, &pr->notify_mask)) + return; + + nvmet_pr_send_event_to_host(pr, hostid, + NVME_PR_LOG_REGISTRATION_PREEMPTED); +} + +static inline void nvmet_pr_set_new_holder(struct nvmet_pr *pr, u8 new_rtype, + struct nvmet_pr_registrant *reg) +{ + reg->rtype = new_rtype; + rcu_assign_pointer(pr->holder, reg); +} + +static u16 nvmet_pr_register(struct nvmet_req *req, + struct nvmet_pr_register_data *d) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_pr_registrant *new, *reg; + struct nvmet_pr *pr = &req->ns->pr; + u16 status = NVME_SC_SUCCESS; + u64 nrkey = le64_to_cpu(d->nrkey); + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NVME_SC_INTERNAL; + + down(&pr->pr_sem); + reg = nvmet_pr_find_registrant(pr, &ctrl->hostid); + if (reg) { + if (reg->rkey != nrkey) + status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR; + kfree(new); + goto out; + } + + memset(new, 0, sizeof(*new)); + INIT_LIST_HEAD(&new->entry); + new->rkey = nrkey; + uuid_copy(&new->hostid, &ctrl->hostid); + list_add_tail_rcu(&new->entry, &pr->registrant_list); + +out: + up(&pr->pr_sem); 
+ return status; +} + +static void nvmet_pr_unregister_one(struct nvmet_pr *pr, + struct nvmet_pr_registrant *reg) +{ + struct nvmet_pr_registrant *first_reg; + struct nvmet_pr_registrant *holder; + u8 original_rtype; + + list_del_rcu(®->entry); + + holder = rcu_dereference_protected(pr->holder, 1); + if (reg != holder) + goto out; + + original_rtype = holder->rtype; + if (original_rtype == NVME_PR_WRITE_EXCLUSIVE_ALL_REGS || + original_rtype == NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS) { + first_reg = list_first_or_null_rcu(&pr->registrant_list, + struct nvmet_pr_registrant, entry); + if (first_reg) + first_reg->rtype = original_rtype; + rcu_assign_pointer(pr->holder, first_reg); + } else { + rcu_assign_pointer(pr->holder, NULL); + + if (original_rtype == NVME_PR_WRITE_EXCLUSIVE_REG_ONLY || + original_rtype == NVME_PR_EXCLUSIVE_ACCESS_REG_ONLY) + nvmet_pr_resv_released(pr, ®->hostid); + } +out: + kfree_rcu(reg, rcu); +} + +static u16 nvmet_pr_unregister(struct nvmet_req *req, + struct nvmet_pr_register_data *d, + bool ignore_key) +{ + u16 status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR; + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_pr *pr = &req->ns->pr; + struct nvmet_pr_registrant *reg; + + down(&pr->pr_sem); + list_for_each_entry_rcu(reg, &pr->registrant_list, entry) { + if (uuid_equal(®->hostid, &ctrl->hostid)) { + if (ignore_key || reg->rkey == le64_to_cpu(d->crkey)) { + status = NVME_SC_SUCCESS; + nvmet_pr_unregister_one(pr, reg); + } + break; + } + } + up(&pr->pr_sem); + + return status; +} + +static void nvmet_pr_update_reg_rkey(struct nvmet_pr_registrant *reg, + void *attr) +{ + reg->rkey = *(u64 *)attr; +} + +static u16 nvmet_pr_update_reg_attr(struct nvmet_pr *pr, + struct nvmet_pr_registrant *reg, + void (*change_attr)(struct nvmet_pr_registrant *reg, + void *attr), + void *attr) +{ + struct nvmet_pr_registrant *holder; + struct nvmet_pr_registrant *new; + + holder = rcu_dereference_protected(pr->holder, 1); + if (reg != holder) { + change_attr(reg, attr); + return NVME_SC_SUCCESS; + } + + new = kmalloc(sizeof(*new), GFP_ATOMIC); + if (!new) + return NVME_SC_INTERNAL; + + new->rkey = holder->rkey; + new->rtype = holder->rtype; + uuid_copy(&new->hostid, &holder->hostid); + INIT_LIST_HEAD(&new->entry); + + change_attr(new, attr); + list_replace_rcu(&holder->entry, &new->entry); + rcu_assign_pointer(pr->holder, new); + kfree_rcu(holder, rcu); + + return NVME_SC_SUCCESS; +} + +static u16 nvmet_pr_replace(struct nvmet_req *req, + struct nvmet_pr_register_data *d, + bool ignore_key) +{ + u16 status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR; + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_pr *pr = &req->ns->pr; + struct nvmet_pr_registrant *reg; + u64 nrkey = le64_to_cpu(d->nrkey); + + down(&pr->pr_sem); + list_for_each_entry_rcu(reg, &pr->registrant_list, entry) { + if (uuid_equal(®->hostid, &ctrl->hostid)) { + if (ignore_key || reg->rkey == le64_to_cpu(d->crkey)) + status = nvmet_pr_update_reg_attr(pr, reg, + nvmet_pr_update_reg_rkey, + &nrkey); + break; + } + } + up(&pr->pr_sem); + return status; +} + +static void nvmet_execute_pr_register(struct nvmet_req *req) +{ + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); + bool ignore_key = nvmet_pr_parse_ignore_key(cdw10); + struct nvmet_pr_register_data *d; + u8 reg_act = cdw10 & 0x07; /* Reservation Register Action, bit 02:00 */ + u16 status; + + d = kmalloc(sizeof(*d), GFP_KERNEL); + if (!d) { + status = NVME_SC_INTERNAL; + goto out; + } + + status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d)); + if 
(status) + goto free_data; + + switch (reg_act) { + case NVME_PR_REGISTER_ACT_REG: + status = nvmet_pr_register(req, d); + break; + case NVME_PR_REGISTER_ACT_UNREG: + status = nvmet_pr_unregister(req, d, ignore_key); + break; + case NVME_PR_REGISTER_ACT_REPLACE: + status = nvmet_pr_replace(req, d, ignore_key); + break; + default: + req->error_loc = offsetof(struct nvme_common_command, cdw10); + status = NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR; + break; + } +free_data: + kfree(d); +out: + if (!status) + atomic_inc(&req->ns->pr.generation); + nvmet_req_complete(req, status); +} + +static u16 nvmet_pr_acquire(struct nvmet_req *req, + struct nvmet_pr_registrant *reg, + u8 rtype) +{ + struct nvmet_pr *pr = &req->ns->pr; + struct nvmet_pr_registrant *holder; + + holder = rcu_dereference_protected(pr->holder, 1); + if (holder && reg != holder) + return NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR; + if (holder && reg == holder) { + if (holder->rtype == rtype) + return NVME_SC_SUCCESS; + return NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR; + } + + nvmet_pr_set_new_holder(pr, rtype, reg); + return NVME_SC_SUCCESS; +} + +static void nvmet_pr_confirm_ns_pc_ref(struct percpu_ref *ref) +{ + struct nvmet_pr_per_ctrl_ref *pc_ref = + container_of(ref, struct nvmet_pr_per_ctrl_ref, ref); + + complete(&pc_ref->confirm_done); +} + +static void nvmet_pr_set_ctrl_to_abort(struct nvmet_req *req, uuid_t *hostid) +{ + struct nvmet_pr_per_ctrl_ref *pc_ref; + struct nvmet_ns *ns = req->ns; + unsigned long idx; + + xa_for_each(&ns->pr_per_ctrl_refs, idx, pc_ref) { + if (uuid_equal(&pc_ref->hostid, hostid)) { + percpu_ref_kill_and_confirm(&pc_ref->ref, + nvmet_pr_confirm_ns_pc_ref); + wait_for_completion(&pc_ref->confirm_done); + } + } +} + +static u16 nvmet_pr_unreg_all_host_by_prkey(struct nvmet_req *req, u64 prkey, + uuid_t *send_hostid, + bool abort) +{ + u16 status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR; + struct nvmet_pr_registrant *reg, *tmp; + struct nvmet_pr *pr = &req->ns->pr; + uuid_t hostid; + + list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) { + if (reg->rkey == prkey) { + status = NVME_SC_SUCCESS; + uuid_copy(&hostid, ®->hostid); + if (abort) + nvmet_pr_set_ctrl_to_abort(req, &hostid); + nvmet_pr_unregister_one(pr, reg); + if (!uuid_equal(&hostid, send_hostid)) + nvmet_pr_registration_preempted(pr, &hostid); + } + } + return status; +} + +static void nvmet_pr_unreg_all_others_by_prkey(struct nvmet_req *req, + u64 prkey, + uuid_t *send_hostid, + bool abort) +{ + struct nvmet_pr_registrant *reg, *tmp; + struct nvmet_pr *pr = &req->ns->pr; + uuid_t hostid; + + list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) { + if (reg->rkey == prkey && + !uuid_equal(®->hostid, send_hostid)) { + uuid_copy(&hostid, ®->hostid); + if (abort) + nvmet_pr_set_ctrl_to_abort(req, &hostid); + nvmet_pr_unregister_one(pr, reg); + nvmet_pr_registration_preempted(pr, &hostid); + } + } +} + +static void nvmet_pr_unreg_all_others(struct nvmet_req *req, + uuid_t *send_hostid, + bool abort) +{ + struct nvmet_pr_registrant *reg, *tmp; + struct nvmet_pr *pr = &req->ns->pr; + uuid_t hostid; + + list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) { + if (!uuid_equal(®->hostid, send_hostid)) { + uuid_copy(&hostid, ®->hostid); + if (abort) + nvmet_pr_set_ctrl_to_abort(req, &hostid); + nvmet_pr_unregister_one(pr, reg); + nvmet_pr_registration_preempted(pr, &hostid); + } + } +} + +static void nvmet_pr_update_holder_rtype(struct nvmet_pr_registrant *reg, + void *attr) +{ + u8 new_rtype = 
*(u8 *)attr; + + reg->rtype = new_rtype; +} + +static u16 nvmet_pr_preempt(struct nvmet_req *req, + struct nvmet_pr_registrant *reg, + u8 rtype, + struct nvmet_pr_acquire_data *d, + bool abort) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_pr *pr = &req->ns->pr; + struct nvmet_pr_registrant *holder; + enum nvme_pr_type original_rtype; + u64 prkey = le64_to_cpu(d->prkey); + u16 status; + + holder = rcu_dereference_protected(pr->holder, 1); + if (!holder) + return nvmet_pr_unreg_all_host_by_prkey(req, prkey, + &ctrl->hostid, abort); + + original_rtype = holder->rtype; + if (original_rtype == NVME_PR_WRITE_EXCLUSIVE_ALL_REGS || + original_rtype == NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS) { + if (!prkey) { + /* + * To prevent possible access from other hosts, and + * avoid terminate the holder, set the new holder + * first before unregistering. + */ + nvmet_pr_set_new_holder(pr, rtype, reg); + nvmet_pr_unreg_all_others(req, &ctrl->hostid, abort); + return NVME_SC_SUCCESS; + } + return nvmet_pr_unreg_all_host_by_prkey(req, prkey, + &ctrl->hostid, abort); + } + + if (holder == reg) { + status = nvmet_pr_update_reg_attr(pr, holder, + nvmet_pr_update_holder_rtype, &rtype); + if (!status && original_rtype != rtype) + nvmet_pr_resv_released(pr, ®->hostid); + return status; + } + + if (prkey == holder->rkey) { + /* + * Same as before, set the new holder first. + */ + nvmet_pr_set_new_holder(pr, rtype, reg); + nvmet_pr_unreg_all_others_by_prkey(req, prkey, &ctrl->hostid, + abort); + if (original_rtype != rtype) + nvmet_pr_resv_released(pr, ®->hostid); + return NVME_SC_SUCCESS; + } + + if (prkey) + return nvmet_pr_unreg_all_host_by_prkey(req, prkey, + &ctrl->hostid, abort); + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; +} + +static void nvmet_pr_do_abort(struct work_struct *w) +{ + struct nvmet_req *req = container_of(w, struct nvmet_req, r.abort_work); + struct nvmet_pr_per_ctrl_ref *pc_ref; + struct nvmet_ns *ns = req->ns; + unsigned long idx; + + /* + * The target does not support abort, just wait per-controller ref to 0. 
+ */ + xa_for_each(&ns->pr_per_ctrl_refs, idx, pc_ref) { + if (percpu_ref_is_dying(&pc_ref->ref)) { + wait_for_completion(&pc_ref->free_done); + reinit_completion(&pc_ref->confirm_done); + reinit_completion(&pc_ref->free_done); + percpu_ref_resurrect(&pc_ref->ref); + } + } + + up(&ns->pr.pr_sem); + nvmet_req_complete(req, NVME_SC_SUCCESS); +} + +static u16 __nvmet_execute_pr_acquire(struct nvmet_req *req, + struct nvmet_pr_registrant *reg, + u8 acquire_act, + u8 rtype, + struct nvmet_pr_acquire_data *d) +{ + u16 status; + + switch (acquire_act) { + case NVME_PR_ACQUIRE_ACT_ACQUIRE: + status = nvmet_pr_acquire(req, reg, rtype); + goto out; + case NVME_PR_ACQUIRE_ACT_PREEMPT: + status = nvmet_pr_preempt(req, reg, rtype, d, false); + goto inc_gen; + case NVME_PR_ACQUIRE_ACT_PREEMPT_AND_ABORT: + status = nvmet_pr_preempt(req, reg, rtype, d, true); + goto inc_gen; + default: + req->error_loc = offsetof(struct nvme_common_command, cdw10); + status = NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR; + goto out; + } +inc_gen: + if (!status) + atomic_inc(&req->ns->pr.generation); +out: + return status; +} + +static void nvmet_execute_pr_acquire(struct nvmet_req *req) +{ + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); + bool ignore_key = nvmet_pr_parse_ignore_key(cdw10); + /* Reservation type, bit 15:08 */ + u8 rtype = (u8)((cdw10 >> 8) & 0xff); + /* Reservation acquire action, bit 02:00 */ + u8 acquire_act = cdw10 & 0x07; + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_pr_acquire_data *d = NULL; + struct nvmet_pr *pr = &req->ns->pr; + struct nvmet_pr_registrant *reg; + u16 status = NVME_SC_SUCCESS; + + if (ignore_key || + rtype < NVME_PR_WRITE_EXCLUSIVE || + rtype > NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS) { + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + goto out; + } + + d = kmalloc(sizeof(*d), GFP_KERNEL); + if (!d) { + status = NVME_SC_INTERNAL; + goto out; + } + + status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d)); + if (status) + goto free_data; + + status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR; + down(&pr->pr_sem); + list_for_each_entry_rcu(reg, &pr->registrant_list, entry) { + if (uuid_equal(®->hostid, &ctrl->hostid) && + reg->rkey == le64_to_cpu(d->crkey)) { + status = __nvmet_execute_pr_acquire(req, reg, + acquire_act, rtype, d); + break; + } + } + + if (!status && acquire_act == NVME_PR_ACQUIRE_ACT_PREEMPT_AND_ABORT) { + kfree(d); + INIT_WORK(&req->r.abort_work, nvmet_pr_do_abort); + queue_work(nvmet_wq, &req->r.abort_work); + return; + } + + up(&pr->pr_sem); + +free_data: + kfree(d); +out: + nvmet_req_complete(req, status); +} + +static u16 nvmet_pr_release(struct nvmet_req *req, + struct nvmet_pr_registrant *reg, + u8 rtype) +{ + struct nvmet_pr *pr = &req->ns->pr; + struct nvmet_pr_registrant *holder; + u8 original_rtype; + + holder = rcu_dereference_protected(pr->holder, 1); + if (!holder || reg != holder) + return NVME_SC_SUCCESS; + + original_rtype = holder->rtype; + if (original_rtype != rtype) + return NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR; + + rcu_assign_pointer(pr->holder, NULL); + + if (original_rtype != NVME_PR_WRITE_EXCLUSIVE && + original_rtype != NVME_PR_EXCLUSIVE_ACCESS) + nvmet_pr_resv_released(pr, ®->hostid); + + return NVME_SC_SUCCESS; +} + +static void nvmet_pr_clear(struct nvmet_req *req) +{ + struct nvmet_pr_registrant *reg, *tmp; + struct nvmet_pr *pr = &req->ns->pr; + + rcu_assign_pointer(pr->holder, NULL); + + list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) { + list_del_rcu(®->entry); + if 
(!uuid_equal(&req->sq->ctrl->hostid, ®->hostid)) + nvmet_pr_resv_preempted(pr, ®->hostid); + kfree_rcu(reg, rcu); + } + + atomic_inc(&pr->generation); +} + +static u16 __nvmet_execute_pr_release(struct nvmet_req *req, + struct nvmet_pr_registrant *reg, + u8 release_act, u8 rtype) +{ + switch (release_act) { + case NVME_PR_RELEASE_ACT_RELEASE: + return nvmet_pr_release(req, reg, rtype); + case NVME_PR_RELEASE_ACT_CLEAR: + nvmet_pr_clear(req); + return NVME_SC_SUCCESS; + default: + req->error_loc = offsetof(struct nvme_common_command, cdw10); + return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR; + } +} + +static void nvmet_execute_pr_release(struct nvmet_req *req) +{ + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); + bool ignore_key = nvmet_pr_parse_ignore_key(cdw10); + u8 rtype = (u8)((cdw10 >> 8) & 0xff); /* Reservation type, bit 15:08 */ + u8 release_act = cdw10 & 0x07; /* Reservation release action, bit 02:00 */ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_pr *pr = &req->ns->pr; + struct nvmet_pr_release_data *d; + struct nvmet_pr_registrant *reg; + u16 status; + + if (ignore_key) { + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + goto out; + } + + d = kmalloc(sizeof(*d), GFP_KERNEL); + if (!d) { + status = NVME_SC_INTERNAL; + goto out; + } + + status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d)); + if (status) + goto free_data; + + status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR; + down(&pr->pr_sem); + list_for_each_entry_rcu(reg, &pr->registrant_list, entry) { + if (uuid_equal(®->hostid, &ctrl->hostid) && + reg->rkey == le64_to_cpu(d->crkey)) { + status = __nvmet_execute_pr_release(req, reg, + release_act, rtype); + break; + } + } + up(&pr->pr_sem); +free_data: + kfree(d); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_pr_report(struct nvmet_req *req) +{ + u32 cdw11 = le32_to_cpu(req->cmd->common.cdw11); + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); + u32 num_bytes = 4 * (cdw10 + 1); /* cdw10 is number of dwords */ + u8 eds = cdw11 & 1; /* Extended data structure, bit 00 */ + struct nvme_registered_ctrl_ext *ctrl_eds; + struct nvme_reservation_status_ext *data; + struct nvmet_pr *pr = &req->ns->pr; + struct nvmet_pr_registrant *holder; + struct nvmet_pr_registrant *reg; + u16 num_ctrls = 0; + u16 status; + u8 rtype; + + /* nvmet hostid(uuid_t) is 128 bit. */ + if (!eds) { + req->error_loc = offsetof(struct nvme_common_command, cdw11); + status = NVME_SC_HOST_ID_INCONSIST | NVME_STATUS_DNR; + goto out; + } + + if (num_bytes < sizeof(struct nvme_reservation_status_ext)) { + req->error_loc = offsetof(struct nvme_common_command, cdw10); + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + goto out; + } + + data = kmalloc(num_bytes, GFP_KERNEL); + if (!data) { + status = NVME_SC_INTERNAL; + goto out; + } + memset(data, 0, num_bytes); + data->gen = cpu_to_le32(atomic_read(&pr->generation)); + data->ptpls = 0; + ctrl_eds = data->regctl_eds; + + rcu_read_lock(); + holder = rcu_dereference(pr->holder); + rtype = holder ? holder->rtype : 0; + data->rtype = rtype; + + list_for_each_entry_rcu(reg, &pr->registrant_list, entry) { + num_ctrls++; + /* + * continue to get the number of all registrans. + */ + if (((void *)ctrl_eds + sizeof(*ctrl_eds)) > + ((void *)data + num_bytes)) + continue; + /* + * Dynamic controller, set cntlid to 0xffff. 
+ */ + ctrl_eds->cntlid = cpu_to_le16(NVME_CNTLID_DYNAMIC); + if (rtype == NVME_PR_WRITE_EXCLUSIVE_ALL_REGS || + rtype == NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS) + ctrl_eds->rcsts = 1; + if (reg == holder) + ctrl_eds->rcsts = 1; + uuid_copy((uuid_t *)&ctrl_eds->hostid, ®->hostid); + ctrl_eds->rkey = cpu_to_le64(reg->rkey); + ctrl_eds++; + } + rcu_read_unlock(); + + put_unaligned_le16(num_ctrls, data->regctl); + status = nvmet_copy_to_sgl(req, 0, data, num_bytes); + kfree(data); +out: + nvmet_req_complete(req, status); +} + +u16 nvmet_parse_pr_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + switch (cmd->common.opcode) { + case nvme_cmd_resv_register: + req->execute = nvmet_execute_pr_register; + break; + case nvme_cmd_resv_acquire: + req->execute = nvmet_execute_pr_acquire; + break; + case nvme_cmd_resv_release: + req->execute = nvmet_execute_pr_release; + break; + case nvme_cmd_resv_report: + req->execute = nvmet_execute_pr_report; + break; + default: + return 1; + } + return NVME_SC_SUCCESS; +} + +static bool nvmet_is_req_write_cmd_group(struct nvmet_req *req) +{ + u8 opcode = req->cmd->common.opcode; + + if (req->sq->qid) { + switch (opcode) { + case nvme_cmd_flush: + case nvme_cmd_write: + case nvme_cmd_write_zeroes: + case nvme_cmd_dsm: + case nvme_cmd_zone_append: + case nvme_cmd_zone_mgmt_send: + return true; + default: + return false; + } + } + return false; +} + +static bool nvmet_is_req_read_cmd_group(struct nvmet_req *req) +{ + u8 opcode = req->cmd->common.opcode; + + if (req->sq->qid) { + switch (opcode) { + case nvme_cmd_read: + case nvme_cmd_zone_mgmt_recv: + return true; + default: + return false; + } + } + return false; +} + +u16 nvmet_pr_check_cmd_access(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_pr_registrant *holder; + struct nvmet_ns *ns = req->ns; + struct nvmet_pr *pr = &ns->pr; + u16 status = NVME_SC_SUCCESS; + + rcu_read_lock(); + holder = rcu_dereference(pr->holder); + if (!holder) + goto unlock; + if (uuid_equal(&ctrl->hostid, &holder->hostid)) + goto unlock; + + /* + * The Reservation command group is checked in executing, + * allow it here. 
+ */ + switch (holder->rtype) { + case NVME_PR_WRITE_EXCLUSIVE: + if (nvmet_is_req_write_cmd_group(req)) + status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR; + break; + case NVME_PR_EXCLUSIVE_ACCESS: + if (nvmet_is_req_read_cmd_group(req) || + nvmet_is_req_write_cmd_group(req)) + status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR; + break; + case NVME_PR_WRITE_EXCLUSIVE_REG_ONLY: + case NVME_PR_WRITE_EXCLUSIVE_ALL_REGS: + if ((nvmet_is_req_write_cmd_group(req)) && + !nvmet_pr_find_registrant(pr, &ctrl->hostid)) + status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR; + break; + case NVME_PR_EXCLUSIVE_ACCESS_REG_ONLY: + case NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS: + if ((nvmet_is_req_read_cmd_group(req) || + nvmet_is_req_write_cmd_group(req)) && + !nvmet_pr_find_registrant(pr, &ctrl->hostid)) + status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR; + break; + default: + pr_warn("the reservation type is set wrong, type:%d\n", + holder->rtype); + break; + } + +unlock: + rcu_read_unlock(); + if (status) + req->error_loc = offsetof(struct nvme_common_command, opcode); + return status; +} + +u16 nvmet_pr_get_ns_pc_ref(struct nvmet_req *req) +{ + struct nvmet_pr_per_ctrl_ref *pc_ref; + + pc_ref = xa_load(&req->ns->pr_per_ctrl_refs, + req->sq->ctrl->cntlid); + if (unlikely(!percpu_ref_tryget_live(&pc_ref->ref))) + return NVME_SC_INTERNAL; + req->pc_ref = pc_ref; + return NVME_SC_SUCCESS; +} + +static void nvmet_pr_ctrl_ns_all_cmds_done(struct percpu_ref *ref) +{ + struct nvmet_pr_per_ctrl_ref *pc_ref = + container_of(ref, struct nvmet_pr_per_ctrl_ref, ref); + + complete(&pc_ref->free_done); +} + +static int nvmet_pr_alloc_and_insert_pc_ref(struct nvmet_ns *ns, + unsigned long idx, + uuid_t *hostid) +{ + struct nvmet_pr_per_ctrl_ref *pc_ref; + int ret; + + pc_ref = kmalloc(sizeof(*pc_ref), GFP_ATOMIC); + if (!pc_ref) + return -ENOMEM; + + ret = percpu_ref_init(&pc_ref->ref, nvmet_pr_ctrl_ns_all_cmds_done, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); + if (ret) + goto free; + + init_completion(&pc_ref->free_done); + init_completion(&pc_ref->confirm_done); + uuid_copy(&pc_ref->hostid, hostid); + + ret = xa_insert(&ns->pr_per_ctrl_refs, idx, pc_ref, GFP_KERNEL); + if (ret) + goto exit; + return ret; +exit: + percpu_ref_exit(&pc_ref->ref); +free: + kfree(pc_ref); + return ret; +} + +int nvmet_ctrl_init_pr(struct nvmet_ctrl *ctrl) +{ + struct nvmet_subsys *subsys = ctrl->subsys; + struct nvmet_pr_per_ctrl_ref *pc_ref; + struct nvmet_ns *ns = NULL; + unsigned long idx; + int ret; + + ctrl->pr_log_mgr.counter = 0; + ctrl->pr_log_mgr.lost_count = 0; + mutex_init(&ctrl->pr_log_mgr.lock); + INIT_KFIFO(ctrl->pr_log_mgr.log_queue); + + /* + * Here we are under subsys lock, if an ns not in subsys->namespaces, + * we can make sure that ns is not enabled, and not call + * nvmet_pr_init_ns(), see more details in nvmet_ns_enable(). + * So just check ns->pr.enable. 
+ */ + xa_for_each(&subsys->namespaces, idx, ns) { + if (ns->pr.enable) { + ret = nvmet_pr_alloc_and_insert_pc_ref(ns, ctrl->cntlid, + &ctrl->hostid); + if (ret) + goto free_per_ctrl_refs; + } + } + return 0; + +free_per_ctrl_refs: + xa_for_each(&subsys->namespaces, idx, ns) { + if (ns->pr.enable) { + pc_ref = xa_erase(&ns->pr_per_ctrl_refs, ctrl->cntlid); + if (pc_ref) + percpu_ref_exit(&pc_ref->ref); + kfree(pc_ref); + } + } + return ret; +} + +void nvmet_ctrl_destroy_pr(struct nvmet_ctrl *ctrl) +{ + struct nvmet_pr_per_ctrl_ref *pc_ref; + struct nvmet_ns *ns; + unsigned long idx; + + kfifo_free(&ctrl->pr_log_mgr.log_queue); + mutex_destroy(&ctrl->pr_log_mgr.lock); + + xa_for_each(&ctrl->subsys->namespaces, idx, ns) { + if (ns->pr.enable) { + pc_ref = xa_erase(&ns->pr_per_ctrl_refs, ctrl->cntlid); + if (pc_ref) + percpu_ref_exit(&pc_ref->ref); + kfree(pc_ref); + } + } +} + +int nvmet_pr_init_ns(struct nvmet_ns *ns) +{ + struct nvmet_subsys *subsys = ns->subsys; + struct nvmet_pr_per_ctrl_ref *pc_ref; + struct nvmet_ctrl *ctrl = NULL; + unsigned long idx; + int ret; + + ns->pr.holder = NULL; + atomic_set(&ns->pr.generation, 0); + sema_init(&ns->pr.pr_sem, 1); + INIT_LIST_HEAD(&ns->pr.registrant_list); + ns->pr.notify_mask = 0; + + xa_init(&ns->pr_per_ctrl_refs); + + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + ret = nvmet_pr_alloc_and_insert_pc_ref(ns, ctrl->cntlid, + &ctrl->hostid); + if (ret) + goto free_per_ctrl_refs; + } + return 0; + +free_per_ctrl_refs: + xa_for_each(&ns->pr_per_ctrl_refs, idx, pc_ref) { + xa_erase(&ns->pr_per_ctrl_refs, idx); + percpu_ref_exit(&pc_ref->ref); + kfree(pc_ref); + } + return ret; +} + +void nvmet_pr_exit_ns(struct nvmet_ns *ns) +{ + struct nvmet_pr_registrant *reg, *tmp; + struct nvmet_pr_per_ctrl_ref *pc_ref; + struct nvmet_pr *pr = &ns->pr; + unsigned long idx; + + list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) { + list_del(®->entry); + kfree(reg); + } + + xa_for_each(&ns->pr_per_ctrl_refs, idx, pc_ref) { + /* + * No command on ns here, we can safely free pc_ref. + */ + pc_ref = xa_erase(&ns->pr_per_ctrl_refs, idx); + percpu_ref_exit(&pc_ref->ref); + kfree(pc_ref); + } + + xa_destroy(&ns->pr_per_ctrl_refs); +} diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 44d048d68503..0179bb6d502d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -2045,7 +2045,7 @@ enum { NVME_PR_LOG_EMPTY_LOG_PAGE = 0x00, NVME_PR_LOG_REGISTRATION_PREEMPTED = 0x01, NVME_PR_LOG_RESERVATION_RELEASED = 0x02, - NVME_PR_LOG_RESERVATOPM_PREEMPTED = 0x03, + NVME_PR_LOG_RESERVATOIN_PREEMPTED = 0x03, }; enum { -- cgit v1.2.3 From 61c9967cd63448292a64f9ee9aeb6e2053e3a624 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 4 Nov 2024 13:24:36 -0800 Subject: nvmet: implement active command set ns list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is required for nvme 2.1 for targets that support multiple command sets. We support NVM and ZNS, so are required to support this identification. 
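For illustration only, not part of the patch below: a minimal host-side sketch of how the new CNS 07h (I/O-command-set-specific Active Namespace ID list) identification could be exercised through the Linux admin passthrough ioctl. The device path, the choice of CSI 00h (NVM), and the minimal error handling are assumptions; per the spec, Identify CDW10 carries the CNS value and CDW11 carries the CSI in bits 31:24.

#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/nvme_ioctl.h>

int main(void)
{
	uint32_t list[1024] = { 0 };	/* 4KB identify payload: up to 1024 little-endian NSIDs */
	struct nvme_admin_cmd cmd = {
		.opcode   = 0x06,		/* Identify */
		.nsid     = 0,			/* return active NSIDs greater than this value */
		.addr     = (uintptr_t)list,
		.data_len = sizeof(list),
		.cdw10    = 0x07,		/* CNS 07h: Active Namespace ID list, CS specific */
		.cdw11    = 0x00u << 24,	/* CSI 00h (NVM) in bits 31:24; assumption */
	};
	int fd = open("/dev/nvme0", O_RDONLY);	/* assumed controller character device */

	if (fd < 0 || ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd) != 0) {
		perror("identify cns 07h");
		return 1;
	}
	for (int i = 0; i < 1024 && list[i]; i++)
		printf("active nsid: %u\n", list[i]);	/* assumes a little-endian host */
	return 0;
}

With CNS 02h the same call returns the command-set-independent active list, which the existing nvmet_execute_identify_nslist() path continues to serve.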
Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Reviewed-by: Matias Bjørling Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 9 +++++++-- include/linux/nvme.h | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 3bf2e06baad7..c59e21434833 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -592,7 +592,7 @@ out: nvmet_req_complete(req, status); } -static void nvmet_execute_identify_nslist(struct nvmet_req *req) +static void nvmet_execute_identify_nslist(struct nvmet_req *req, bool match_css) { static const int buf_size = NVME_IDENTIFY_DATA_SIZE; struct nvmet_ctrl *ctrl = req->sq->ctrl; @@ -622,6 +622,8 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req) xa_for_each(&ctrl->subsys->namespaces, idx, ns) { if (ns->nsid <= min_nsid) continue; + if (match_css && req->ns->csi != req->cmd->identify.csi) + continue; list[i++] = cpu_to_le32(ns->nsid); if (i == buf_size / sizeof(__le32)) break; @@ -728,7 +730,7 @@ static void nvmet_execute_identify(struct nvmet_req *req) nvmet_execute_identify_ctrl(req); return; case NVME_ID_CNS_NS_ACTIVE_LIST: - nvmet_execute_identify_nslist(req); + nvmet_execute_identify_nslist(req, false); return; case NVME_ID_CNS_NS_DESC_LIST: nvmet_execute_identify_desclist(req); @@ -759,6 +761,9 @@ static void nvmet_execute_identify(struct nvmet_req *req) break; } break; + case NVME_ID_CNS_NS_ACTIVE_LIST_CS: + nvmet_execute_identify_nslist(req, true); + return; } pr_debug("unhandled identify cns %d on qid %d\n", diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 0179bb6d502d..26de7c5c12be 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -522,6 +522,7 @@ enum { NVME_ID_CNS_NS_DESC_LIST = 0x03, NVME_ID_CNS_CS_NS = 0x05, NVME_ID_CNS_CS_CTRL = 0x06, + NVME_ID_CNS_NS_ACTIVE_LIST_CS = 0x07, NVME_ID_CNS_NS_CS_INDEP = 0x08, NVME_ID_CNS_NS_PRESENT_LIST = 0x10, NVME_ID_CNS_NS_PRESENT = 0x11, -- cgit v1.2.3 From 83acb24e6de7bbb5cb0df1ba0f47a92da9112061 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 4 Nov 2024 14:00:14 -0800 Subject: nvmet: implement supported log pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This log is required for nvme 2.1. 
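For illustration only, not part of the patch below: a small sketch of how a consumer could test the Supported Log Pages log (LID 00h) that this patch starts reporting. It relies only on the nvme_supported_log structure and NVME_LIDS_LSUPP bit added to include/linux/nvme.h below; the helper name is made up.

#include <linux/nvme.h>

/* Returns true when the 1024-byte Supported Log Pages log marks the
 * given log identifier as supported (LSUPP, bit 0 of its entry).
 */
static bool nvme_log_page_supported(const struct nvme_supported_log *logs, u8 lid)
{
	return le32_to_cpu(logs->lids[lid]) & NVME_LIDS_LSUPP;
}

/* e.g. nvme_log_page_supported(logs, NVME_LOG_RESERVATION) tells a host
 * whether the reservation log added earlier in this series is available.
 */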
Reviewed-by: Matias Bjørling Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 28 ++++++++++++++++++++++++++++ include/linux/nvme.h | 9 +++++++++ 2 files changed, 37 insertions(+) (limited to 'include/linux') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index c59e21434833..cbbf911c5cc0 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -71,6 +71,32 @@ static void nvmet_execute_get_log_page_error(struct nvmet_req *req) nvmet_req_complete(req, 0); } +static void nvmet_execute_get_supported_log_pages(struct nvmet_req *req) +{ + struct nvme_supported_log *logs; + u16 status; + + logs = kzalloc(sizeof(*logs), GFP_KERNEL); + if (!logs) { + status = NVME_SC_INTERNAL; + goto out; + } + + logs->lids[NVME_LOG_SUPPORTED] = cpu_to_le32(NVME_LIDS_LSUPP); + logs->lids[NVME_LOG_ERROR] = cpu_to_le32(NVME_LIDS_LSUPP); + logs->lids[NVME_LOG_SMART] = cpu_to_le32(NVME_LIDS_LSUPP); + logs->lids[NVME_LOG_FW_SLOT] = cpu_to_le32(NVME_LIDS_LSUPP); + logs->lids[NVME_LOG_CHANGED_NS] = cpu_to_le32(NVME_LIDS_LSUPP); + logs->lids[NVME_LOG_CMD_EFFECTS] = cpu_to_le32(NVME_LIDS_LSUPP); + logs->lids[NVME_LOG_ANA] = cpu_to_le32(NVME_LIDS_LSUPP); + logs->lids[NVME_LOG_RESERVATION] = cpu_to_le32(NVME_LIDS_LSUPP); + + status = nvmet_copy_to_sgl(req, 0, logs, sizeof(*logs)); + kfree(logs); +out: + nvmet_req_complete(req, status); +} + static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, struct nvme_smart_log *slog) { @@ -327,6 +353,8 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) return; switch (req->cmd->get_log_page.lid) { + case NVME_LOG_SUPPORTED: + return nvmet_execute_get_supported_log_pages(req); case NVME_LOG_ERROR: return nvmet_execute_get_log_page_error(req); case NVME_LOG_SMART: diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 26de7c5c12be..e9e508bca60f 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1245,6 +1245,7 @@ enum { NVME_FEAT_WRITE_PROTECT = 0x84, NVME_FEAT_VENDOR_START = 0xC0, NVME_FEAT_VENDOR_END = 0xFF, + NVME_LOG_SUPPORTED = 0x00, NVME_LOG_ERROR = 0x01, NVME_LOG_SMART = 0x02, NVME_LOG_FW_SLOT = 0x03, @@ -1262,6 +1263,14 @@ enum { NVME_FWACT_ACTV = (2 << 3), }; +struct nvme_supported_log { + __le32 lids[256]; +}; + +enum { + NVME_LIDS_LSUPP = 1 << 0, +}; + /* NVMe Namespace Write Protect State */ enum { NVME_NS_NO_WRITE_PROTECT = 0, -- cgit v1.2.3 From e973c91727d49bb128c95210b3aa1960b9421d18 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 4 Nov 2024 14:07:42 -0800 Subject: nvmet: implement supported features log MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This log is required for nvme 2.1. 
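For illustration only, not part of the patch below: a sketch of how the FID Supported and Effects log (LID 12h) added here could be interpreted, using only the nvme_supported_features_log structure and NVME_FIS_* bits introduced in include/linux/nvme.h below. The helper names are made up.

#include <linux/nvme.h>

/* Feature is implemented at all. */
static bool nvme_feat_supported(const struct nvme_supported_features_log *log, u8 fid)
{
	return le32_to_cpu(log->fis[fid]) & NVME_FIS_FSUPP;
}

/* Feature is namespace-scoped (e.g. write protect, reservation
 * notification mask) rather than controller-scoped (e.g. keep-alive).
 */
static bool nvme_feat_ns_scoped(const struct nvme_supported_features_log *log, u8 fid)
{
	return le32_to_cpu(log->fis[fid]) & NVME_FIS_NSCPE;
}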
Reviewed-by: Matias Bjørling Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 33 +++++++++++++++++++++++++++++++++ include/linux/nvme.h | 11 +++++++++++ 2 files changed, 44 insertions(+) (limited to 'include/linux') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index cbbf911c5cc0..f832661a4913 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -89,6 +89,7 @@ static void nvmet_execute_get_supported_log_pages(struct nvmet_req *req) logs->lids[NVME_LOG_CHANGED_NS] = cpu_to_le32(NVME_LIDS_LSUPP); logs->lids[NVME_LOG_CMD_EFFECTS] = cpu_to_le32(NVME_LIDS_LSUPP); logs->lids[NVME_LOG_ANA] = cpu_to_le32(NVME_LIDS_LSUPP); + logs->lids[NVME_LOG_FEATURES] = cpu_to_le32(NVME_LIDS_LSUPP); logs->lids[NVME_LOG_RESERVATION] = cpu_to_le32(NVME_LIDS_LSUPP); status = nvmet_copy_to_sgl(req, 0, logs, sizeof(*logs)); @@ -347,6 +348,36 @@ out: nvmet_req_complete(req, status); } +static void nvmet_execute_get_log_page_features(struct nvmet_req *req) +{ + struct nvme_supported_features_log *features; + u16 status; + + features = kzalloc(sizeof(*features), GFP_KERNEL); + if (!features) { + status = NVME_SC_INTERNAL; + goto out; + } + + features->fis[NVME_FEAT_NUM_QUEUES] = + cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_CSCPE); + features->fis[NVME_FEAT_KATO] = + cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_CSCPE); + features->fis[NVME_FEAT_ASYNC_EVENT] = + cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_CSCPE); + features->fis[NVME_FEAT_HOST_ID] = + cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_CSCPE); + features->fis[NVME_FEAT_WRITE_PROTECT] = + cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_NSCPE); + features->fis[NVME_FEAT_RESV_MASK] = + cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_NSCPE); + + status = nvmet_copy_to_sgl(req, 0, features, sizeof(*features)); + kfree(features); +out: + nvmet_req_complete(req, status); +} + static void nvmet_execute_get_log_page(struct nvmet_req *req) { if (!nvmet_check_transfer_len(req, nvmet_get_log_page_len(req->cmd))) @@ -372,6 +403,8 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) return nvmet_execute_get_log_cmd_effects_ns(req); case NVME_LOG_ANA: return nvmet_execute_get_log_page_ana(req); + case NVME_LOG_FEATURES: + return nvmet_execute_get_log_page_features(req); case NVME_LOG_RESERVATION: return nvmet_execute_get_log_page_resv(req); } diff --git a/include/linux/nvme.h b/include/linux/nvme.h index e9e508bca60f..31d7ec6d8b93 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1256,6 +1256,7 @@ enum { NVME_LOG_TELEMETRY_CTRL = 0x08, NVME_LOG_ENDURANCE_GROUP = 0x09, NVME_LOG_ANA = 0x0c, + NVME_LOG_FEATURES = 0x12, NVME_LOG_DISC = 0x70, NVME_LOG_RESERVATION = 0x80, NVME_FWACT_REPL = (0 << 3), @@ -1271,6 +1272,16 @@ enum { NVME_LIDS_LSUPP = 1 << 0, }; +struct nvme_supported_features_log { + __le32 fis[256]; +}; + +enum { + NVME_FIS_FSUPP = 1 << 0, + NVME_FIS_NSCPE = 1 << 20, + NVME_FIS_CSCPE = 1 << 21, +}; + /* NVMe Namespace Write Protect State */ enum { NVME_NS_NO_WRITE_PROTECT = 0, -- cgit v1.2.3 From 266b652c65b44fb2ccfa17cdb54ce2ef723deb0a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 1 Nov 2024 14:46:01 -0700 Subject: nvmet: implement endurance groups Most of the returned information is just stubbed data. The target must support these in order to report rotational media. Since this driver doesn't know any better, each namespace is its own endurance group with the engid value matching the nsid. 
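For illustration only, not part of the patch below: a rough host-side sketch of fetching the Endurance Group Information log (LID 09h) that the target now emulates per namespace. Per the spec, Get Log Page CDW10 carries the LID and the low half of the zero-based dword count, and CDW11 carries the Log Specific Identifier, which this target equates with the NSID, in bits 31:16. The device path and group ID are assumptions.

#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/nvme_ioctl.h>

int main(void)
{
	uint8_t log[512] = { 0 };		/* Endurance Group Information log is 512 bytes */
	uint32_t numd = sizeof(log) / 4 - 1;	/* zero-based dword count */
	uint16_t endgid = 1;			/* assumed group: matches NSID 1 on this target */
	struct nvme_admin_cmd cmd = {
		.opcode   = 0x02,				/* Get Log Page */
		.nsid     = 0xffffffff,
		.addr     = (uintptr_t)log,
		.data_len = sizeof(log),
		.cdw10    = 0x09 | ((numd & 0xffff) << 16),	/* LID 09h, NUMDL */
		.cdw11    = (uint32_t)endgid << 16,		/* NUMDU = 0, LSI = group id */
	};
	int fd = open("/dev/nvme0", O_RDONLY);	/* assumed controller character device */

	if (fd < 0 || ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd) != 0) {
		perror("endurance group log");
		return 1;
	}
	/* log[] now holds struct nvme_endurance_group_log; the hrc/hwc/dur/duw
	 * fields are the ones the target fills from block-device statistics below.
	 */
	return 0;
}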
Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 1 + drivers/nvme/target/admin-cmd.c | 95 +++++++++++++++++++++++++++++++++++++++++ include/linux/nvme.h | 29 ++++++++++++- 3 files changed, 123 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e9aac07f4c26..426d4b90ecd7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -5001,6 +5001,7 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); + BUILD_BUG_ON(sizeof(struct nvme_endurance_group_log) != 512); BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64); BUILD_BUG_ON(sizeof(struct nvme_feat_host_behavior) != 512); diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index f832661a4913..366582f52200 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -88,6 +88,7 @@ static void nvmet_execute_get_supported_log_pages(struct nvmet_req *req) logs->lids[NVME_LOG_FW_SLOT] = cpu_to_le32(NVME_LIDS_LSUPP); logs->lids[NVME_LOG_CHANGED_NS] = cpu_to_le32(NVME_LIDS_LSUPP); logs->lids[NVME_LOG_CMD_EFFECTS] = cpu_to_le32(NVME_LIDS_LSUPP); + logs->lids[NVME_LOG_ENDURANCE_GROUP] = cpu_to_le32(NVME_LIDS_LSUPP); logs->lids[NVME_LOG_ANA] = cpu_to_le32(NVME_LIDS_LSUPP); logs->lids[NVME_LOG_FEATURES] = cpu_to_le32(NVME_LIDS_LSUPP); logs->lids[NVME_LOG_RESERVATION] = cpu_to_le32(NVME_LIDS_LSUPP); @@ -303,6 +304,49 @@ static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid, return struct_size(desc, nsids, count); } +static void nvmet_execute_get_log_page_endgrp(struct nvmet_req *req) +{ + u64 host_reads, host_writes, data_units_read, data_units_written; + struct nvme_endurance_group_log *log; + u16 status; + + /* + * The target driver emulates each endurance group as its own + * namespace, reusing the nsid as the endurance group identifier. 
+ */ + req->cmd->common.nsid = cpu_to_le32(le16_to_cpu( + req->cmd->get_log_page.lsi)); + status = nvmet_req_find_ns(req); + if (status) + goto out; + + log = kzalloc(sizeof(*log), GFP_KERNEL); + if (!log) { + status = NVME_SC_INTERNAL; + goto out; + } + + if (!req->ns->bdev) + goto copy; + + host_reads = part_stat_read(req->ns->bdev, ios[READ]); + data_units_read = + DIV_ROUND_UP(part_stat_read(req->ns->bdev, sectors[READ]), 1000); + host_writes = part_stat_read(req->ns->bdev, ios[WRITE]); + data_units_written = + DIV_ROUND_UP(part_stat_read(req->ns->bdev, sectors[WRITE]), 1000); + + put_unaligned_le64(host_reads, &log->hrc[0]); + put_unaligned_le64(data_units_read, &log->dur[0]); + put_unaligned_le64(host_writes, &log->hwc[0]); + put_unaligned_le64(data_units_written, &log->duw[0]); +copy: + status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log)); + kfree(log); +out: + nvmet_req_complete(req, status); +} + static void nvmet_execute_get_log_page_ana(struct nvmet_req *req) { struct nvme_ana_rsp_hdr hdr = { 0, }; @@ -401,6 +445,8 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) return nvmet_execute_get_log_changed_ns(req); case NVME_LOG_CMD_EFFECTS: return nvmet_execute_get_log_cmd_effects_ns(req); + case NVME_LOG_ENDURANCE_GROUP: + return nvmet_execute_get_log_page_endgrp(req); case NVME_LOG_ANA: return nvmet_execute_get_log_page_ana(req); case NVME_LOG_FEATURES: @@ -535,6 +581,13 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->msdbd = ctrl->ops->msdbd; + /* + * Endurance group identifier is 16 bits, so we can't let namespaces + * overflow that since we reuse the nsid + */ + BUILD_BUG_ON(NVMET_MAX_NAMESPACES > USHRT_MAX); + id->endgidmax = cpu_to_le16(NVMET_MAX_NAMESPACES); + id->anacap = (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3) | (1 << 4); id->anatt = 10; /* random value */ id->anagrpmax = cpu_to_le32(NVMET_MAX_ANAGRPS); @@ -628,6 +681,12 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_ALL_REGS | NVME_PR_SUPPORT_IEKEY_VER_1_3_DEF; + /* + * Since we don't know any better, every namespace is its own endurance + * group. 
+ */ + id->endgid = cpu_to_le16(req->ns->nsid); + memcpy(&id->nguid, &req->ns->nguid, sizeof(id->nguid)); id->lbaf[0].ds = req->ns->blksize_shift; @@ -653,6 +712,39 @@ out: nvmet_req_complete(req, status); } +static void nvmet_execute_identify_endgrp_list(struct nvmet_req *req) +{ + u16 min_endgid = le16_to_cpu(req->cmd->identify.cnssid); + static const int buf_size = NVME_IDENTIFY_DATA_SIZE; + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_ns *ns; + unsigned long idx; + __le16 *list; + u16 status; + int i = 1; + + list = kzalloc(buf_size, GFP_KERNEL); + if (!list) { + status = NVME_SC_INTERNAL; + goto out; + } + + xa_for_each(&ctrl->subsys->namespaces, idx, ns) { + if (ns->nsid <= min_endgid) + continue; + + list[i++] = cpu_to_le16(ns->nsid); + if (i == buf_size / sizeof(__le16)) + break; + } + + list[0] = cpu_to_le16(i - 1); + status = nvmet_copy_to_sgl(req, 0, list, buf_size); + kfree(list); +out: + nvmet_req_complete(req, status); +} + static void nvmet_execute_identify_nslist(struct nvmet_req *req, bool match_css) { static const int buf_size = NVME_IDENTIFY_DATA_SIZE; @@ -825,6 +917,9 @@ static void nvmet_execute_identify(struct nvmet_req *req) case NVME_ID_CNS_NS_ACTIVE_LIST_CS: nvmet_execute_identify_nslist(req, true); return; + case NVME_ID_CNS_ENDGRP_LIST: + nvmet_execute_identify_endgrp_list(req); + return; } pr_debug("unhandled identify cns %d on qid %d\n", diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 31d7ec6d8b93..6d5b4299a1b2 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -327,7 +327,8 @@ struct nvme_id_ctrl { __le32 sanicap; __le32 hmminds; __le16 hmmaxd; - __u8 rsvd338[4]; + __le16 nvmsetidmax; + __le16 endgidmax; __u8 anatt; __u8 anacap; __le32 anagrpmax; @@ -531,6 +532,7 @@ enum { NVME_ID_CNS_SCNDRY_CTRL_LIST = 0x15, NVME_ID_CNS_NS_GRANULARITY = 0x16, NVME_ID_CNS_UUID_LIST = 0x17, + NVME_ID_CNS_ENDGRP_LIST = 0x19, }; enum { @@ -618,6 +620,28 @@ enum { NVME_NIDT_CSI = 0x04, }; +struct nvme_endurance_group_log { + __u8 egcw; + __u8 egfeat; + __u8 rsvd2; + __u8 avsp; + __u8 avspt; + __u8 pused; + __le16 did; + __u8 rsvd8[24]; + __u8 ee[16]; + __u8 dur[16]; + __u8 duw[16]; + __u8 muw[16]; + __u8 hrc[16]; + __u8 hwc[16]; + __u8 mdie[16]; + __u8 neile[16]; + __u8 tegcap[16]; + __u8 uegcap[16]; + __u8 rsvd192[320]; +}; + struct nvme_smart_log { __u8 critical_warning; __u8 temperature[2]; @@ -1302,7 +1326,8 @@ struct nvme_identify { __u8 cns; __u8 rsvd3; __le16 ctrlid; - __u8 rsvd11[3]; + __le16 cnssid; + __u8 rsvd11; __u8 csi; __u32 rsvd12[4]; }; -- cgit v1.2.3 From 5fd075cdaf3649000677d960fd9e45c08081b7e0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 1 Nov 2024 13:48:47 -0700 Subject: nvmet: implement rotational media information log Most of the information is stubbed. Supporting these commands is a requirement for supporting rotational media. 
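The endurance group log above stores each counter as a 16-byte little-endian value, and the target only ever fills the low eight bytes via put_unaligned_le64(). A host-side sketch of reading such a counter back, assuming a little-endian host; the sample value is arbitrary.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Low 64 bits of a 16-byte little-endian counter (hrc, hwc, dur, duw). */
static uint64_t le128_low64(const uint8_t c[16])
{
	uint64_t v;

	memcpy(&v, c, sizeof(v));	/* assumes a little-endian host */
	return v;
}

int main(void)
{
	uint8_t dur[16] = { 0 };
	uint64_t sample = 1234;		/* arbitrary "data units read" value */

	memcpy(dur, &sample, sizeof(sample));
	printf("data units read: %llu\n", (unsigned long long)le128_low64(dur));
	return 0;
}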
Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 1 + drivers/nvme/target/admin-cmd.c | 42 +++++++++++++++++++++++++++++++++++++++++ include/linux/nvme.h | 15 ++++++++++++++- 3 files changed, 57 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 426d4b90ecd7..279b0f445904 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -5002,6 +5002,7 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); BUILD_BUG_ON(sizeof(struct nvme_endurance_group_log) != 512); + BUILD_BUG_ON(sizeof(struct nvme_rotational_media_log) != 512); BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64); BUILD_BUG_ON(sizeof(struct nvme_feat_host_behavior) != 512); diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 366582f52200..33b7ecfee3fe 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -91,6 +91,7 @@ static void nvmet_execute_get_supported_log_pages(struct nvmet_req *req) logs->lids[NVME_LOG_ENDURANCE_GROUP] = cpu_to_le32(NVME_LIDS_LSUPP); logs->lids[NVME_LOG_ANA] = cpu_to_le32(NVME_LIDS_LSUPP); logs->lids[NVME_LOG_FEATURES] = cpu_to_le32(NVME_LIDS_LSUPP); + logs->lids[NVME_LOG_RMI] = cpu_to_le32(NVME_LIDS_LSUPP); logs->lids[NVME_LOG_RESERVATION] = cpu_to_le32(NVME_LIDS_LSUPP); status = nvmet_copy_to_sgl(req, 0, logs, sizeof(*logs)); @@ -158,6 +159,45 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req, return NVME_SC_SUCCESS; } +static void nvmet_execute_get_log_page_rmi(struct nvmet_req *req) +{ + struct nvme_rotational_media_log *log; + struct gendisk *disk; + u16 status; + + req->cmd->common.nsid = cpu_to_le32(le16_to_cpu( + req->cmd->get_log_page.lsi)); + status = nvmet_req_find_ns(req); + if (status) + goto out; + + if (!req->ns->bdev || bdev_nonrot(req->ns->bdev)) { + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + goto out; + } + + if (req->transfer_len != sizeof(*log)) { + status = NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR; + goto out; + } + + log = kzalloc(sizeof(*log), GFP_KERNEL); + if (!log) + goto out; + + log->endgid = req->cmd->get_log_page.lsi; + disk = req->ns->bdev->bd_disk; + if (disk && disk->ia_ranges) + log->numa = cpu_to_le16(disk->ia_ranges->nr_ia_ranges); + else + log->numa = cpu_to_le16(1); + + status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log)); + kfree(log); +out: + nvmet_req_complete(req, status); +} + static void nvmet_execute_get_log_page_smart(struct nvmet_req *req) { struct nvme_smart_log *log; @@ -451,6 +491,8 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) return nvmet_execute_get_log_page_ana(req); case NVME_LOG_FEATURES: return nvmet_execute_get_log_page_features(req); + case NVME_LOG_RMI: + return nvmet_execute_get_log_page_rmi(req); case NVME_LOG_RESERVATION: return nvmet_execute_get_log_page_resv(req); } diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 6d5b4299a1b2..99cf0ee73714 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -642,6 +642,18 @@ struct nvme_endurance_group_log { __u8 rsvd192[320]; }; +struct nvme_rotational_media_log { + __le16 endgid; + __le16 numa; + __le16 nrs; + __u8 rsvd6[2]; + __le32 spinc; + __le32 fspinc; + __le32 ldc; + __le32 fldc; + __u8 rsvd24[488]; +}; + struct nvme_smart_log { __u8 critical_warning; __u8 temperature[2]; @@ -1281,6 +1293,7 @@ 
enum { NVME_LOG_ENDURANCE_GROUP = 0x09, NVME_LOG_ANA = 0x0c, NVME_LOG_FEATURES = 0x12, + NVME_LOG_RMI = 0x16, NVME_LOG_DISC = 0x70, NVME_LOG_RESERVATION = 0x80, NVME_FWACT_REPL = (0 << 3), @@ -1435,7 +1448,7 @@ struct nvme_get_log_page_command { __u8 lsp; /* upper 4 bits reserved */ __le16 numdl; __le16 numdu; - __u16 rsvd11; + __le16 lsi; union { struct { __le32 lpol; -- cgit v1.2.3 From e2758c76a0ab2032a0d11abc1c2ff08661fdf428 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 1 Nov 2024 12:29:40 -0700 Subject: nvmet: support for csi identify ns Implements reporting the I/O Command Set Independent Identify Namespace command. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 32 ++++++++++++++++++++++++++++++++ include/linux/nvme.h | 1 + 2 files changed, 33 insertions(+) (limited to 'include/linux') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 33b7ecfee3fe..0a9fdc533186 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -912,6 +912,35 @@ out: nvmet_req_complete(req, status); } +static void nvmet_execute_id_cs_indep(struct nvmet_req *req) +{ + struct nvme_id_ns_cs_indep *id; + u16 status; + + status = nvmet_req_find_ns(req); + if (status) + goto out; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) { + status = NVME_SC_INTERNAL; + goto out; + } + + id->nstat = NVME_NSTAT_NRDY; + id->anagrpid = cpu_to_le32(req->ns->anagrpid); + id->nmic = NVME_NS_NMIC_SHARED; + if (req->ns->readonly) + id->nsattr |= NVME_NS_ATTR_RO; + if (req->ns->bdev && !bdev_nonrot(req->ns->bdev)) + id->nsfeat |= NVME_NS_ROTATIONAL; + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + kfree(id); +out: + nvmet_req_complete(req, status); +} + static void nvmet_execute_identify(struct nvmet_req *req) { if (!nvmet_check_transfer_len(req, NVME_IDENTIFY_DATA_SIZE)) @@ -959,6 +988,9 @@ static void nvmet_execute_identify(struct nvmet_req *req) case NVME_ID_CNS_NS_ACTIVE_LIST_CS: nvmet_execute_identify_nslist(req, true); return; + case NVME_ID_CNS_NS_CS_INDEP: + nvmet_execute_id_cs_indep(req); + return; case NVME_ID_CNS_ENDGRP_LIST: nvmet_execute_identify_endgrp_list(req); return; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 99cf0ee73714..c136d64c7d73 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -563,6 +563,7 @@ enum { NVME_NS_FLBAS_LBA_SHIFT = 1, NVME_NS_FLBAS_META_EXT = 0x10, NVME_NS_NMIC_SHARED = 1 << 0, + NVME_NS_ROTATIONAL = 1 << 4, NVME_LBAF_RP_BEST = 0, NVME_LBAF_RP_BETTER = 1, NVME_LBAF_RP_GOOD = 2, -- cgit v1.2.3 From 8a825d22a70915bd80c811fa93538cf2540af29d Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Mon, 4 Nov 2024 16:55:00 +0800 Subject: nvme: check ns's volatile write cache not present When the VWC of a namespace does not exist, the BLK_FEAT_WRITE_CACHE flag should not be set when registering the block device, regardless of whether the controller supports VWC. 
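Before the diff below, a host-side sketch of the behaviour this patch describes: the controller-level VWC bit is honoured only when the namespace's nsfeat field does not flag the write cache as absent. The local macros mirror the new kernel defines, and the values are examples only.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NS_ROTATIONAL		(1 << 4)	/* nsfeat bit 4 */
#define NS_VWC_NOT_PRESENT	(1 << 5)	/* nsfeat bit 5 */

/* Mirrors the nvme_update_ns_info_block() change: advertise a write cache
 * only if the controller reports one and the namespace does not opt out. */
static bool ns_has_write_cache(bool ctrl_vwc_present, uint8_t nsfeat)
{
	return ctrl_vwc_present && !(nsfeat & NS_VWC_NOT_PRESENT);
}

int main(void)
{
	uint8_t nsfeat = NS_ROTATIONAL | NS_VWC_NOT_PRESENT;	/* example value */

	printf("rotational: %d, write cache: %d\n",
	       !!(nsfeat & NS_ROTATIONAL),
	       ns_has_write_cache(true, nsfeat));
	return 0;
}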
Signed-off-by: Guixin Liu Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 4 +++- include/linux/nvme.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6f51dde7de6c..e119ba0f8ab8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -43,6 +43,7 @@ struct nvme_ns_info { bool is_ready; bool is_removed; bool is_rotational; + bool no_vwc; }; unsigned int admin_timeout = 60; @@ -1617,6 +1618,7 @@ static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl, info->is_readonly = id->nsattr & NVME_NS_ATTR_RO; info->is_ready = id->nstat & NVME_NSTAT_NRDY; info->is_rotational = id->nsfeat & NVME_NS_ROTATIONAL; + info->no_vwc = id->nsfeat & NVME_NS_VWC_NOT_PRESENT; } kfree(id); return ret; @@ -2159,7 +2161,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, ns->head->ids.csi == NVME_CSI_ZNS) nvme_update_zone_info(ns, &lim, &zi); - if (ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT) + if ((ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT) && !info->no_vwc) lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; else lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index c136d64c7d73..0a6e22038ce3 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -564,6 +564,7 @@ enum { NVME_NS_FLBAS_META_EXT = 0x10, NVME_NS_NMIC_SHARED = 1 << 0, NVME_NS_ROTATIONAL = 1 << 4, + NVME_NS_VWC_NOT_PRESENT = 1 << 5, NVME_LBAF_RP_BEST = 0, NVME_LBAF_RP_BETTER = 1, NVME_LBAF_RP_GOOD = 2, -- cgit v1.2.3
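Closing illustration for the rotational media information log added earlier in the series: a user-space sketch that fetches LID 0x16 for one endurance group by placing the group identifier in the new lsi field (CDW11 bits 31:16). The device path and group id are assumptions; the group id equals the nsid only because that is how this target maps endurance groups.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/nvme_ioctl.h>

int main(void)
{
	uint8_t log[512] = { 0 };		/* struct nvme_rotational_media_log */
	struct nvme_admin_cmd cmd = { 0 };
	uint32_t numd = sizeof(log) / 4 - 1;	/* 0's based dword count */
	uint16_t endgid = 1;			/* assumed group; equals the nsid here */
	int fd = open("/dev/nvme0", O_RDONLY);	/* example device path */

	if (fd < 0)
		return 1;

	cmd.opcode = 0x02;				/* Get Log Page */
	cmd.addr = (uint64_t)(uintptr_t)log;
	cmd.data_len = sizeof(log);
	cmd.cdw10 = 0x16 | ((numd & 0xffff) << 16);	/* LID 0x16 (RMI), NUMDL */
	cmd.cdw11 = (numd >> 16) | ((uint32_t)endgid << 16);	/* NUMDU, LSI */

	if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd) == 0)
		printf("independent access ranges: %u\n",
		       (unsigned int)(log[2] | (log[3] << 8)));	/* numa field */
	return 0;
}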