Diffstat (limited to 'drivers/nvme/host/core.c')
-rw-r--r--   drivers/nvme/host/core.c | 1119
1 file changed, 430 insertions, 689 deletions
| diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a5653892d773..b6f7815fa239 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -89,6 +89,10 @@ static dev_t nvme_ctrl_base_chr_devt;  static struct class *nvme_class;  static struct class *nvme_subsys_class; +static DEFINE_IDA(nvme_ns_chr_minor_ida); +static dev_t nvme_ns_chr_devt; +static struct class *nvme_ns_chr_class; +  static void nvme_put_subsystem(struct nvme_subsystem *subsys);  static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,  					   unsigned nsid); @@ -112,7 +116,7 @@ static void nvme_set_queue_dying(struct nvme_ns *ns)  	set_capacity_and_notify(ns->disk, 0);  } -static void nvme_queue_scan(struct nvme_ctrl *ctrl) +void nvme_queue_scan(struct nvme_ctrl *ctrl)  {  	/*  	 * Only new queue scan work when admin and IO queues are both alive @@ -179,7 +183,7 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl)  }  EXPORT_SYMBOL_GPL(nvme_reset_ctrl); -static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) +int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)  {  	int ret; @@ -549,7 +553,12 @@ static void nvme_free_ns_head(struct kref *ref)  	kfree(head);  } -static void nvme_put_ns_head(struct nvme_ns_head *head) +bool nvme_tryget_ns_head(struct nvme_ns_head *head) +{ +	return kref_get_unless_zero(&head->ref); +} + +void nvme_put_ns_head(struct nvme_ns_head *head)  {  	kref_put(&head->ref, nvme_free_ns_head);  } @@ -575,11 +584,12 @@ EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);  static inline void nvme_clear_nvme_request(struct request *req)  { -	if (!(req->rq_flags & RQF_DONTPREP)) { -		nvme_req(req)->retries = 0; -		nvme_req(req)->flags = 0; -		req->rq_flags |= RQF_DONTPREP; -	} +	struct nvme_command *cmd = nvme_req(req)->cmd; + +	memset(cmd, 0, sizeof(*cmd)); +	nvme_req(req)->retries = 0; +	nvme_req(req)->flags = 0; +	req->rq_flags |= RQF_DONTPREP;  }  static inline unsigned int nvme_req_op(struct nvme_command *cmd) @@ -595,9 +605,12 @@ static inline void nvme_init_request(struct request *req,  	else /* no queuedata implies admin queue */  		req->timeout = NVME_ADMIN_TIMEOUT; +	/* passthru commands should let the driver set the SGL flags */ +	cmd->common.flags &= ~NVME_CMD_SGL_ALL; +  	req->cmd_flags |= REQ_FAILFAST_DRIVER;  	nvme_clear_nvme_request(req); -	nvme_req(req)->cmd = cmd; +	memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd));  }  struct request *nvme_alloc_request(struct request_queue *q, @@ -726,14 +739,6 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,  		req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;  } -static void nvme_setup_passthrough(struct request *req, -		struct nvme_command *cmd) -{ -	memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd)); -	/* passthru commands should let the driver set the SGL flags */ -	cmd->common.flags &= ~NVME_CMD_SGL_ALL; -} -  static inline void nvme_setup_flush(struct nvme_ns *ns,  		struct nvme_command *cmnd)  { @@ -888,18 +893,18 @@ void nvme_cleanup_cmd(struct request *req)  }  EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); -blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, -		struct nvme_command *cmd) +blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req)  { +	struct nvme_command *cmd = nvme_req(req)->cmd;  	blk_status_t ret = BLK_STS_OK; -	nvme_clear_nvme_request(req); +	if (!(req->rq_flags & RQF_DONTPREP)) +		nvme_clear_nvme_request(req); -	memset(cmd, 0, sizeof(*cmd));  	switch (req_op(req)) {  	case REQ_OP_DRV_IN:  	case REQ_OP_DRV_OUT: -		nvme_setup_passthrough(req, cmd); +		/* 
these are setup prior to execution in nvme_init_request() */  		break;  	case REQ_OP_FLUSH:  		nvme_setup_flush(ns, cmd); @@ -1020,40 +1025,6 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,  }  EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); -static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf, -		unsigned len, u32 seed, bool write) -{ -	struct bio_integrity_payload *bip; -	int ret = -ENOMEM; -	void *buf; - -	buf = kmalloc(len, GFP_KERNEL); -	if (!buf) -		goto out; - -	ret = -EFAULT; -	if (write && copy_from_user(buf, ubuf, len)) -		goto out_free_meta; - -	bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); -	if (IS_ERR(bip)) { -		ret = PTR_ERR(bip); -		goto out_free_meta; -	} - -	bip->bip_iter.bi_size = len; -	bip->bip_iter.bi_sector = seed; -	ret = bio_integrity_add_page(bio, virt_to_page(buf), len, -			offset_in_page(buf)); -	if (ret == len) -		return buf; -	ret = -ENOMEM; -out_free_meta: -	kfree(buf); -out: -	return ERR_PTR(ret); -} -  static u32 nvme_known_admin_effects(u8 opcode)  {  	switch (opcode) { @@ -1076,9 +1047,9 @@ u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)  		if (ns->head->effects)  			effects = le32_to_cpu(ns->head->effects->iocs[opcode]);  		if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) -			dev_warn(ctrl->device, -				 "IO command:%02x has unhandled effects:%08x\n", -				 opcode, effects); +			dev_warn_once(ctrl->device, +				"IO command:%02x has unhandled effects:%08x\n", +				opcode, effects);  		return 0;  	} @@ -1120,7 +1091,7 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)  		mutex_unlock(&ctrl->scan_lock);  	}  	if (effects & NVME_CMD_EFFECTS_CCC) -		nvme_init_identify(ctrl); +		nvme_init_ctrl_finish(ctrl);  	if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {  		nvme_queue_scan(ctrl);  		flush_work(&ctrl->scan_work); @@ -1137,68 +1108,20 @@ void nvme_execute_passthru_rq(struct request *rq)  	effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);  	blk_execute_rq(disk, rq, 0); -	nvme_passthru_end(ctrl, effects); +	if (effects) /* nothing to be done for zero cmd effects */ +		nvme_passthru_end(ctrl, effects);  }  EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); -static int nvme_submit_user_cmd(struct request_queue *q, -		struct nvme_command *cmd, void __user *ubuffer, -		unsigned bufflen, void __user *meta_buffer, unsigned meta_len, -		u32 meta_seed, u64 *result, unsigned timeout) +/* + * Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1: + *  + *   The host should send Keep Alive commands at half of the Keep Alive Timeout + *   accounting for transport roundtrip times [..]. + */ +static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)  { -	bool write = nvme_is_write(cmd); -	struct nvme_ns *ns = q->queuedata; -	struct block_device *bdev = ns ? 
ns->disk->part0 : NULL; -	struct request *req; -	struct bio *bio = NULL; -	void *meta = NULL; -	int ret; - -	req = nvme_alloc_request(q, cmd, 0); -	if (IS_ERR(req)) -		return PTR_ERR(req); - -	if (timeout) -		req->timeout = timeout; -	nvme_req(req)->flags |= NVME_REQ_USERCMD; - -	if (ubuffer && bufflen) { -		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, -				GFP_KERNEL); -		if (ret) -			goto out; -		bio = req->bio; -		if (bdev) -			bio_set_dev(bio, bdev); -		if (bdev && meta_buffer && meta_len) { -			meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, -					meta_seed, write); -			if (IS_ERR(meta)) { -				ret = PTR_ERR(meta); -				goto out_unmap; -			} -			req->cmd_flags |= REQ_INTEGRITY; -		} -	} - -	nvme_execute_passthru_rq(req); -	if (nvme_req(req)->flags & NVME_REQ_CANCELLED) -		ret = -EINTR; -	else -		ret = nvme_req(req)->status; -	if (result) -		*result = le64_to_cpu(nvme_req(req)->result.u64); -	if (meta && !ret && !write) { -		if (copy_to_user(meta_buffer, meta, meta_len)) -			ret = -EFAULT; -	} -	kfree(meta); - out_unmap: -	if (bio) -		blk_rq_unmap_user(bio); - out: -	blk_mq_free_request(req); -	return ret; +	queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ / 2);  }  static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) @@ -1223,24 +1146,7 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)  		startka = true;  	spin_unlock_irqrestore(&ctrl->lock, flags);  	if (startka) -		queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); -} - -static int nvme_keep_alive(struct nvme_ctrl *ctrl) -{ -	struct request *rq; - -	rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, -			BLK_MQ_REQ_RESERVED); -	if (IS_ERR(rq)) -		return PTR_ERR(rq); - -	rq->timeout = ctrl->kato * HZ; -	rq->end_io_data = ctrl; - -	blk_execute_rq_nowait(NULL, rq, 0, nvme_keep_alive_end_io); - -	return 0; +		nvme_queue_keep_alive_work(ctrl);  }  static void nvme_keep_alive_work(struct work_struct *work) @@ -1248,21 +1154,28 @@ static void nvme_keep_alive_work(struct work_struct *work)  	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),  			struct nvme_ctrl, ka_work);  	bool comp_seen = ctrl->comp_seen; +	struct request *rq;  	if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {  		dev_dbg(ctrl->device,  			"reschedule traffic based keep-alive timer\n");  		ctrl->comp_seen = false; -		queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); +		nvme_queue_keep_alive_work(ctrl);  		return;  	} -	if (nvme_keep_alive(ctrl)) { +	rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, +				BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); +	if (IS_ERR(rq)) {  		/* allocation failure, reset the controller */ -		dev_err(ctrl->device, "keep-alive failed\n"); +		dev_err(ctrl->device, "keep-alive failed: %ld\n", PTR_ERR(rq));  		nvme_reset_ctrl(ctrl);  		return;  	} + +	rq->timeout = ctrl->kato * HZ; +	rq->end_io_data = ctrl; +	blk_execute_rq_nowait(NULL, rq, 0, nvme_keep_alive_end_io);  }  static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) @@ -1270,7 +1183,7 @@ static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)  	if (unlikely(ctrl->kato == 0))  		return; -	queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); +	nvme_queue_keep_alive_work(ctrl);  }  void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) @@ -1546,170 +1459,6 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl)  }  /* - * Convert integer values from ioctl structures to user pointers, silently - * ignoring the upper bits in the compat case to match behaviour of 
32-bit - * kernels. - */ -static void __user *nvme_to_user_ptr(uintptr_t ptrval) -{ -	if (in_compat_syscall()) -		ptrval = (compat_uptr_t)ptrval; -	return (void __user *)ptrval; -} - -static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) -{ -	struct nvme_user_io io; -	struct nvme_command c; -	unsigned length, meta_len; -	void __user *metadata; - -	if (copy_from_user(&io, uio, sizeof(io))) -		return -EFAULT; -	if (io.flags) -		return -EINVAL; - -	switch (io.opcode) { -	case nvme_cmd_write: -	case nvme_cmd_read: -	case nvme_cmd_compare: -		break; -	default: -		return -EINVAL; -	} - -	length = (io.nblocks + 1) << ns->lba_shift; - -	if ((io.control & NVME_RW_PRINFO_PRACT) && -	    ns->ms == sizeof(struct t10_pi_tuple)) { -		/* -		 * Protection information is stripped/inserted by the -		 * controller. -		 */ -		if (nvme_to_user_ptr(io.metadata)) -			return -EINVAL; -		meta_len = 0; -		metadata = NULL; -	} else { -		meta_len = (io.nblocks + 1) * ns->ms; -		metadata = nvme_to_user_ptr(io.metadata); -	} - -	if (ns->features & NVME_NS_EXT_LBAS) { -		length += meta_len; -		meta_len = 0; -	} else if (meta_len) { -		if ((io.metadata & 3) || !io.metadata) -			return -EINVAL; -	} - -	memset(&c, 0, sizeof(c)); -	c.rw.opcode = io.opcode; -	c.rw.flags = io.flags; -	c.rw.nsid = cpu_to_le32(ns->head->ns_id); -	c.rw.slba = cpu_to_le64(io.slba); -	c.rw.length = cpu_to_le16(io.nblocks); -	c.rw.control = cpu_to_le16(io.control); -	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); -	c.rw.reftag = cpu_to_le32(io.reftag); -	c.rw.apptag = cpu_to_le16(io.apptag); -	c.rw.appmask = cpu_to_le16(io.appmask); - -	return nvme_submit_user_cmd(ns->queue, &c, -			nvme_to_user_ptr(io.addr), length, -			metadata, meta_len, lower_32_bits(io.slba), NULL, 0); -} - -static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, -			struct nvme_passthru_cmd __user *ucmd) -{ -	struct nvme_passthru_cmd cmd; -	struct nvme_command c; -	unsigned timeout = 0; -	u64 result; -	int status; - -	if (!capable(CAP_SYS_ADMIN)) -		return -EACCES; -	if (copy_from_user(&cmd, ucmd, sizeof(cmd))) -		return -EFAULT; -	if (cmd.flags) -		return -EINVAL; - -	memset(&c, 0, sizeof(c)); -	c.common.opcode = cmd.opcode; -	c.common.flags = cmd.flags; -	c.common.nsid = cpu_to_le32(cmd.nsid); -	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); -	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); -	c.common.cdw10 = cpu_to_le32(cmd.cdw10); -	c.common.cdw11 = cpu_to_le32(cmd.cdw11); -	c.common.cdw12 = cpu_to_le32(cmd.cdw12); -	c.common.cdw13 = cpu_to_le32(cmd.cdw13); -	c.common.cdw14 = cpu_to_le32(cmd.cdw14); -	c.common.cdw15 = cpu_to_le32(cmd.cdw15); - -	if (cmd.timeout_ms) -		timeout = msecs_to_jiffies(cmd.timeout_ms); - -	status = nvme_submit_user_cmd(ns ? 
ns->queue : ctrl->admin_q, &c, -			nvme_to_user_ptr(cmd.addr), cmd.data_len, -			nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, -			0, &result, timeout); - -	if (status >= 0) { -		if (put_user(result, &ucmd->result)) -			return -EFAULT; -	} - -	return status; -} - -static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, -			struct nvme_passthru_cmd64 __user *ucmd) -{ -	struct nvme_passthru_cmd64 cmd; -	struct nvme_command c; -	unsigned timeout = 0; -	int status; - -	if (!capable(CAP_SYS_ADMIN)) -		return -EACCES; -	if (copy_from_user(&cmd, ucmd, sizeof(cmd))) -		return -EFAULT; -	if (cmd.flags) -		return -EINVAL; - -	memset(&c, 0, sizeof(c)); -	c.common.opcode = cmd.opcode; -	c.common.flags = cmd.flags; -	c.common.nsid = cpu_to_le32(cmd.nsid); -	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); -	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); -	c.common.cdw10 = cpu_to_le32(cmd.cdw10); -	c.common.cdw11 = cpu_to_le32(cmd.cdw11); -	c.common.cdw12 = cpu_to_le32(cmd.cdw12); -	c.common.cdw13 = cpu_to_le32(cmd.cdw13); -	c.common.cdw14 = cpu_to_le32(cmd.cdw14); -	c.common.cdw15 = cpu_to_le32(cmd.cdw15); - -	if (cmd.timeout_ms) -		timeout = msecs_to_jiffies(cmd.timeout_ms); - -	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, -			nvme_to_user_ptr(cmd.addr), cmd.data_len, -			nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, -			0, &cmd.result, timeout); - -	if (status >= 0) { -		if (put_user(cmd.result, &ucmd->result)) -			return -EFAULT; -	} - -	return status; -} - -/*   * Issue ioctl requests on the first available path.  Note that unlike normal   * block layer requests we will not retry failed request on another controller.   */ @@ -1739,136 +1488,12 @@ void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)  		srcu_read_unlock(&head->srcu, idx);  } -static bool is_ctrl_ioctl(unsigned int cmd) -{ -	if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) -		return true; -	if (is_sed_ioctl(cmd)) -		return true; -	return false; -} - -static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, -				  void __user *argp, -				  struct nvme_ns_head *head, -				  int srcu_idx) -{ -	struct nvme_ctrl *ctrl = ns->ctrl; -	int ret; - -	nvme_get_ctrl(ns->ctrl); -	nvme_put_ns_from_disk(head, srcu_idx); - -	switch (cmd) { -	case NVME_IOCTL_ADMIN_CMD: -		ret = nvme_user_cmd(ctrl, NULL, argp); -		break; -	case NVME_IOCTL_ADMIN64_CMD: -		ret = nvme_user_cmd64(ctrl, NULL, argp); -		break; -	default: -		ret = sed_ioctl(ctrl->opal_dev, cmd, argp); -		break; -	} -	nvme_put_ctrl(ctrl); -	return ret; -} - -static int nvme_ioctl(struct block_device *bdev, fmode_t mode, -		unsigned int cmd, unsigned long arg) -{ -	struct nvme_ns_head *head = NULL; -	void __user *argp = (void __user *)arg; -	struct nvme_ns *ns; -	int srcu_idx, ret; - -	ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); -	if (unlikely(!ns)) -		return -EWOULDBLOCK; - -	/* -	 * Handle ioctls that apply to the controller instead of the namespace -	 * seperately and drop the ns SRCU reference early.  This avoids a -	 * deadlock when deleting namespaces using the passthrough interface. 
-	 */ -	if (is_ctrl_ioctl(cmd)) -		return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); - -	switch (cmd) { -	case NVME_IOCTL_ID: -		force_successful_syscall_return(); -		ret = ns->head->ns_id; -		break; -	case NVME_IOCTL_IO_CMD: -		ret = nvme_user_cmd(ns->ctrl, ns, argp); -		break; -	case NVME_IOCTL_SUBMIT_IO: -		ret = nvme_submit_io(ns, argp); -		break; -	case NVME_IOCTL_IO64_CMD: -		ret = nvme_user_cmd64(ns->ctrl, ns, argp); -		break; -	default: -		if (ns->ndev) -			ret = nvme_nvm_ioctl(ns, cmd, arg); -		else -			ret = -ENOTTY; -	} - -	nvme_put_ns_from_disk(head, srcu_idx); -	return ret; -} - -#ifdef CONFIG_COMPAT -struct nvme_user_io32 { -	__u8	opcode; -	__u8	flags; -	__u16	control; -	__u16	nblocks; -	__u16	rsvd; -	__u64	metadata; -	__u64	addr; -	__u64	slba; -	__u32	dsmgmt; -	__u32	reftag; -	__u16	apptag; -	__u16	appmask; -} __attribute__((__packed__)); - -#define NVME_IOCTL_SUBMIT_IO32	_IOW('N', 0x42, struct nvme_user_io32) - -static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, -		unsigned int cmd, unsigned long arg) +static int nvme_ns_open(struct nvme_ns *ns)  { -	/* -	 * Corresponds to the difference of NVME_IOCTL_SUBMIT_IO -	 * between 32 bit programs and 64 bit kernel. -	 * The cause is that the results of sizeof(struct nvme_user_io), -	 * which is used to define NVME_IOCTL_SUBMIT_IO, -	 * are not same between 32 bit compiler and 64 bit compiler. -	 * NVME_IOCTL_SUBMIT_IO32 is for 64 bit kernel handling -	 * NVME_IOCTL_SUBMIT_IO issued from 32 bit programs. -	 * Other IOCTL numbers are same between 32 bit and 64 bit. -	 * So there is nothing to do regarding to other IOCTL numbers. -	 */ -	if (cmd == NVME_IOCTL_SUBMIT_IO32) -		return nvme_ioctl(bdev, mode, NVME_IOCTL_SUBMIT_IO, arg); - -	return nvme_ioctl(bdev, mode, cmd, arg); -} -#else -#define nvme_compat_ioctl	NULL -#endif /* CONFIG_COMPAT */ -static int nvme_open(struct block_device *bdev, fmode_t mode) -{ -	struct nvme_ns *ns = bdev->bd_disk->private_data; - -#ifdef CONFIG_NVME_MULTIPATH  	/* should never be called due to GENHD_FL_HIDDEN */ -	if (WARN_ON_ONCE(ns->head->disk)) +	if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head)))  		goto fail; -#endif  	if (!kref_get_unless_zero(&ns->kref))  		goto fail;  	if (!try_module_get(ns->ctrl->ops->module)) @@ -1882,15 +1507,24 @@ fail:  	return -ENXIO;  } -static void nvme_release(struct gendisk *disk, fmode_t mode) +static void nvme_ns_release(struct nvme_ns *ns)  { -	struct nvme_ns *ns = disk->private_data;  	module_put(ns->ctrl->ops->module);  	nvme_put_ns(ns);  } -static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int nvme_open(struct block_device *bdev, fmode_t mode) +{ +	return nvme_ns_open(bdev->bd_disk->private_data); +} + +static void nvme_release(struct gendisk *disk, fmode_t mode) +{ +	nvme_ns_release(disk->private_data); +} + +int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)  {  	/* some standard values */  	geo->heads = 1 << 6; @@ -1939,7 +1573,7 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)  	struct request_queue *queue = disk->queue;  	u32 size = queue_logical_block_size(queue); -	if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) { +	if (ctrl->max_discard_sectors == 0) {  		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue);  		return;  	} @@ -1957,39 +1591,13 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)  	if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue))  		return; -	blk_queue_max_discard_sectors(queue, UINT_MAX); -	
blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES); +	blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors); +	blk_queue_max_discard_segments(queue, ctrl->max_discard_segments);  	if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)  		blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);  } -static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns) -{ -	u64 max_blocks; - -	if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) || -	    (ns->ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES)) -		return; -	/* -	 * Even though NVMe spec explicitly states that MDTS is not -	 * applicable to the write-zeroes:- "The restriction does not apply to -	 * commands that do not transfer data between the host and the -	 * controller (e.g., Write Uncorrectable ro Write Zeroes command).". -	 * In order to be more cautious use controller's max_hw_sectors value -	 * to configure the maximum sectors for the write-zeroes which is -	 * configured based on the controller's MDTS field in the -	 * nvme_init_identify() if available. -	 */ -	if (ns->ctrl->max_hw_sectors == UINT_MAX) -		max_blocks = (u64)USHRT_MAX + 1; -	else -		max_blocks = ns->ctrl->max_hw_sectors + 1; - -	blk_queue_max_write_zeroes_sectors(disk->queue, -					   nvme_lba_to_sect(ns, max_blocks)); -} -  static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)  {  	return !uuid_is_null(&ids->uuid) || @@ -2159,7 +1767,8 @@ static void nvme_update_disk_info(struct gendisk *disk,  	set_capacity_and_notify(disk, capacity);  	nvme_config_discard(disk, ns); -	nvme_config_write_zeroes(disk, ns); +	blk_queue_max_write_zeroes_sectors(disk->queue, +					   ns->ctrl->max_zeroes_sectors);  	set_disk_ro(disk, (id->nsattr & NVME_NS_ATTR_RO) ||  		test_bit(NVME_NS_FORCE_RO, &ns->flags)); @@ -2228,11 +1837,10 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)  	if (blk_queue_is_zoned(ns->queue)) {  		ret = nvme_revalidate_zones(ns);  		if (ret && !nvme_first_scan(ns->disk)) -			return ret; +			goto out;  	} -#ifdef CONFIG_NVME_MULTIPATH -	if (ns->head->disk) { +	if (nvme_ns_head_multipath(ns->head)) {  		blk_mq_freeze_queue(ns->head->disk->queue);  		nvme_update_disk_info(ns->head->disk, ns, id);  		blk_stack_limits(&ns->head->disk->queue->limits, @@ -2240,11 +1848,19 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)  		blk_queue_update_readahead(ns->head->disk->queue);  		blk_mq_unfreeze_queue(ns->head->disk->queue);  	} -#endif  	return 0;  out_unfreeze:  	blk_mq_unfreeze_queue(ns->disk->queue); +out: +	/* +	 * If probing fails due an unsupported feature, hide the block device, +	 * but still allow other access. +	 */ +	if (ret == -ENODEV) { +		ns->disk->flags |= GENHD_FL_HIDDEN; +		ret = 0; +	}  	return ret;  } @@ -2325,22 +1941,25 @@ static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,  		enum pr_type type, bool abort)  {  	u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1); +  	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);  }  static int nvme_pr_clear(struct block_device *bdev, u64 key)  {  	u32 cdw10 = 1 | (key ? 1 << 3 : 0); +  	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);  }  static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)  {  	u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 
1 << 3 : 0); +  	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);  } -static const struct pr_ops nvme_pr_ops = { +const struct pr_ops nvme_pr_ops = {  	.pr_register	= nvme_pr_register,  	.pr_reserve	= nvme_pr_reserve,  	.pr_release	= nvme_pr_release, @@ -2373,7 +1992,6 @@ EXPORT_SYMBOL_GPL(nvme_sec_submit);  static const struct block_device_operations nvme_bdev_ops = {  	.owner		= THIS_MODULE,  	.ioctl		= nvme_ioctl, -	.compat_ioctl	= nvme_compat_ioctl,  	.open		= nvme_open,  	.release	= nvme_release,  	.getgeo		= nvme_getgeo, @@ -2382,31 +2000,25 @@ static const struct block_device_operations nvme_bdev_ops = {  };  #ifdef CONFIG_NVME_MULTIPATH -static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) +struct nvme_ctrl *nvme_find_get_live_ctrl(struct nvme_subsystem *subsys)  { -	struct nvme_ns_head *head = bdev->bd_disk->private_data; - -	if (!kref_get_unless_zero(&head->ref)) -		return -ENXIO; -	return 0; -} +	struct nvme_ctrl *ctrl; +	int ret; -static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) -{ -	nvme_put_ns_head(disk->private_data); +	ret = mutex_lock_killable(&nvme_subsystems_lock); +	if (ret) +		return ERR_PTR(ret); +	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { +		if (ctrl->state == NVME_CTRL_LIVE) +			goto found; +	} +	mutex_unlock(&nvme_subsystems_lock); +	return ERR_PTR(-EWOULDBLOCK); +found: +	nvme_get_ctrl(ctrl); +	mutex_unlock(&nvme_subsystems_lock); +	return ctrl;  } - -const struct block_device_operations nvme_ns_head_ops = { -	.owner		= THIS_MODULE, -	.submit_bio	= nvme_ns_head_submit_bio, -	.open		= nvme_ns_head_open, -	.release	= nvme_ns_head_release, -	.ioctl		= nvme_ioctl, -	.compat_ioctl	= nvme_compat_ioctl, -	.getgeo		= nvme_getgeo, -	.report_zones	= nvme_report_zones, -	.pr_ops		= &nvme_pr_ops, -};  #endif /* CONFIG_NVME_MULTIPATH */  static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) @@ -2563,28 +2175,28 @@ static int nvme_configure_acre(struct nvme_ctrl *ctrl)  	return ret;  } +/* + * APST (Autonomous Power State Transition) lets us program a table of power + * state transitions that the controller will perform automatically. + * We configure it with a simple heuristic: we are willing to spend at most 2% + * of the time transitioning between power states.  Therefore, when running in + * any given state, we will enter the next lower-power non-operational state + * after waiting 50 * (enlat + exlat) microseconds, as long as that state's exit + * latency is under the requested maximum latency. + * + * We will not autonomously enter any non-operational state for which the total + * latency exceeds ps_max_latency_us. + * + * Users can set ps_max_latency_us to zero to turn off APST. + */  static int nvme_configure_apst(struct nvme_ctrl *ctrl)  { -	/* -	 * APST (Autonomous Power State Transition) lets us program a -	 * table of power state transitions that the controller will -	 * perform automatically.  We configure it with a simple -	 * heuristic: we are willing to spend at most 2% of the time -	 * transitioning between power states.  Therefore, when running -	 * in any given state, we will enter the next lower-power -	 * non-operational state after waiting 50 * (enlat + exlat) -	 * microseconds, as long as that state's exit latency is under -	 * the requested maximum latency. -	 * -	 * We will not autonomously enter any non-operational state for -	 * which the total latency exceeds ps_max_latency_us.  Users -	 * can set ps_max_latency_us to zero to turn off APST. 
-	 */ - -	unsigned apste;  	struct nvme_feat_auto_pst *table; +	unsigned apste = 0;  	u64 max_lat_us = 0; +	__le64 target = 0;  	int max_ps = -1; +	int state;  	int ret;  	/* @@ -2605,83 +2217,72 @@ static int nvme_configure_apst(struct nvme_ctrl *ctrl)  	if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {  		/* Turn off APST. */ -		apste = 0;  		dev_dbg(ctrl->device, "APST disabled\n"); -	} else { -		__le64 target = cpu_to_le64(0); -		int state; - -		/* -		 * Walk through all states from lowest- to highest-power. -		 * According to the spec, lower-numbered states use more -		 * power.  NPSS, despite the name, is the index of the -		 * lowest-power state, not the number of states. -		 */ -		for (state = (int)ctrl->npss; state >= 0; state--) { -			u64 total_latency_us, exit_latency_us, transition_ms; - -			if (target) -				table->entries[state] = target; - -			/* -			 * Don't allow transitions to the deepest state -			 * if it's quirked off. -			 */ -			if (state == ctrl->npss && -			    (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) -				continue; - -			/* -			 * Is this state a useful non-operational state for -			 * higher-power states to autonomously transition to? -			 */ -			if (!(ctrl->psd[state].flags & -			      NVME_PS_FLAGS_NON_OP_STATE)) -				continue; - -			exit_latency_us = -				(u64)le32_to_cpu(ctrl->psd[state].exit_lat); -			if (exit_latency_us > ctrl->ps_max_latency_us) -				continue; +		goto done; +	} -			total_latency_us = -				exit_latency_us + -				le32_to_cpu(ctrl->psd[state].entry_lat); +	/* +	 * Walk through all states from lowest- to highest-power. +	 * According to the spec, lower-numbered states use more power.  NPSS, +	 * despite the name, is the index of the lowest-power state, not the +	 * number of states. +	 */ +	for (state = (int)ctrl->npss; state >= 0; state--) { +		u64 total_latency_us, exit_latency_us, transition_ms; -			/* -			 * This state is good.  Use it as the APST idle -			 * target for higher power states. -			 */ -			transition_ms = total_latency_us + 19; -			do_div(transition_ms, 20); -			if (transition_ms > (1 << 24) - 1) -				transition_ms = (1 << 24) - 1; +		if (target) +			table->entries[state] = target; -			target = cpu_to_le64((state << 3) | -					     (transition_ms << 8)); +		/* +		 * Don't allow transitions to the deepest state if it's quirked +		 * off. +		 */ +		if (state == ctrl->npss && +		    (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) +			continue; -			if (max_ps == -1) -				max_ps = state; +		/* +		 * Is this state a useful non-operational state for higher-power +		 * states to autonomously transition to? +		 */ +		if (!(ctrl->psd[state].flags & NVME_PS_FLAGS_NON_OP_STATE)) +			continue; -			if (total_latency_us > max_lat_us) -				max_lat_us = total_latency_us; -		} +		exit_latency_us = (u64)le32_to_cpu(ctrl->psd[state].exit_lat); +		if (exit_latency_us > ctrl->ps_max_latency_us) +			continue; -		apste = 1; +		total_latency_us = exit_latency_us + +			le32_to_cpu(ctrl->psd[state].entry_lat); -		if (max_ps == -1) { -			dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n"); -		} else { -			dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n", -				max_ps, max_lat_us, (int)sizeof(*table), table); -		} +		/* +		 * This state is good.  Use it as the APST idle target for +		 * higher power states. 
+		 */ +		transition_ms = total_latency_us + 19; +		do_div(transition_ms, 20); +		if (transition_ms > (1 << 24) - 1) +			transition_ms = (1 << 24) - 1; + +		target = cpu_to_le64((state << 3) | (transition_ms << 8)); +		if (max_ps == -1) +			max_ps = state; +		if (total_latency_us > max_lat_us) +			max_lat_us = total_latency_us;  	} +	if (max_ps == -1) +		dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n"); +	else +		dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n", +			max_ps, max_lat_us, (int)sizeof(*table), table); +	apste = 1; + +done:  	ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,  				table, sizeof(*table), NULL);  	if (ret)  		dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret); -  	kfree(table);  	return ret;  } @@ -2703,7 +2304,8 @@ static void nvme_set_latency_tolerance(struct device *dev, s32 val)  	if (ctrl->ps_max_latency_us != latency) {  		ctrl->ps_max_latency_us = latency; -		nvme_configure_apst(ctrl); +		if (ctrl->state == NVME_CTRL_LIVE) +			nvme_configure_apst(ctrl);  	}  } @@ -2876,8 +2478,8 @@ static ssize_t subsys_##field##_show(struct device *dev,		\  {									\  	struct nvme_subsystem *subsys =					\  		container_of(dev, struct nvme_subsystem, dev);		\ -	return sprintf(buf, "%.*s\n",					\ -		       (int)sizeof(subsys->field), subsys->field);	\ +	return sysfs_emit(buf, "%.*s\n",				\ +			   (int)sizeof(subsys->field), subsys->field);	\  }									\  static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show); @@ -3060,28 +2662,74 @@ out:  	return 0;  } -/* - * Initialize the cached copies of the Identify data and various controller - * register in our nvme_ctrl structure.  This should be called as soon as - * the admin queue is fully up and running. - */ -int nvme_init_identify(struct nvme_ctrl *ctrl) +static inline u32 nvme_mps_to_sectors(struct nvme_ctrl *ctrl, u32 units)  { -	struct nvme_id_ctrl *id; -	int ret, page_shift; -	u32 max_hw_sectors; -	bool prev_apst_enabled; +	u32 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12, val; -	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); -	if (ret) { -		dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); -		return ret; +	if (check_shl_overflow(1U, units + page_shift - 9, &val)) +		return UINT_MAX; +	return val; +} + +static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) +{ +	struct nvme_command c = { }; +	struct nvme_id_ctrl_nvm *id; +	int ret; + +	if (ctrl->oncs & NVME_CTRL_ONCS_DSM) { +		ctrl->max_discard_sectors = UINT_MAX; +		ctrl->max_discard_segments = NVME_DSM_MAX_RANGES; +	} else { +		ctrl->max_discard_sectors = 0; +		ctrl->max_discard_segments = 0;  	} -	page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; -	ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); -	if (ctrl->vs >= NVME_VS(1, 1, 0)) -		ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); +	/* +	 * Even though NVMe spec explicitly states that MDTS is not applicable +	 * to the write-zeroes, we are cautious and limit the size to the +	 * controllers max_hw_sectors value, which is based on the MDTS field +	 * and possibly other limiting factors. 
+	 */ +	if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) && +	    !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES)) +		ctrl->max_zeroes_sectors = ctrl->max_hw_sectors; +	else +		ctrl->max_zeroes_sectors = 0; + +	if (nvme_ctrl_limited_cns(ctrl)) +		return 0; + +	id = kzalloc(sizeof(*id), GFP_KERNEL); +	if (!id) +		return 0; + +	c.identify.opcode = nvme_admin_identify; +	c.identify.cns = NVME_ID_CNS_CS_CTRL; +	c.identify.csi = NVME_CSI_NVM; + +	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id)); +	if (ret) +		goto free_data; + +	if (id->dmrl) +		ctrl->max_discard_segments = id->dmrl; +	if (id->dmrsl) +		ctrl->max_discard_sectors = le32_to_cpu(id->dmrsl); +	if (id->wzsl) +		ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl); + +free_data: +	kfree(id); +	return ret; +} + +static int nvme_init_identify(struct nvme_ctrl *ctrl) +{ +	struct nvme_id_ctrl *id; +	u32 max_hw_sectors; +	bool prev_apst_enabled; +	int ret;  	ret = nvme_identify_ctrl(ctrl, &id);  	if (ret) { @@ -3099,7 +2747,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)  		ctrl->cntlid = le16_to_cpu(id->cntlid);  	if (!ctrl->identified) { -		int i; +		unsigned int i;  		ret = nvme_init_subsystem(ctrl, id);  		if (ret) @@ -3138,7 +2786,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)  	atomic_set(&ctrl->abort_limit, id->acl + 1);  	ctrl->vwc = id->vwc;  	if (id->mdts) -		max_hw_sectors = 1 << (id->mdts + page_shift - 9); +		max_hw_sectors = nvme_mps_to_sectors(ctrl, id->mdts);  	else  		max_hw_sectors = UINT_MAX;  	ctrl->max_hw_sectors = @@ -3212,20 +2860,51 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)  	}  	ret = nvme_mpath_init(ctrl, id); -	kfree(id); -  	if (ret < 0) -		return ret; +		goto out_free;  	if (ctrl->apst_enabled && !prev_apst_enabled)  		dev_pm_qos_expose_latency_tolerance(ctrl->device);  	else if (!ctrl->apst_enabled && prev_apst_enabled)  		dev_pm_qos_hide_latency_tolerance(ctrl->device); +out_free: +	kfree(id); +	return ret; +} + +/* + * Initialize the cached copies of the Identify data and various controller + * register in our nvme_ctrl structure.  This should be called as soon as + * the admin queue is fully up and running. 
+ */ +int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) +{ +	int ret; + +	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); +	if (ret) { +		dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); +		return ret; +	} + +	ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); + +	if (ctrl->vs >= NVME_VS(1, 1, 0)) +		ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); + +	ret = nvme_init_identify(ctrl); +	if (ret) +		return ret; + +	ret = nvme_init_non_mdts_limits(ctrl); +	if (ret < 0) +		return ret; +  	ret = nvme_configure_apst(ctrl);  	if (ret < 0)  		return ret; -	 +  	ret = nvme_configure_timestamp(ctrl);  	if (ret < 0)  		return ret; @@ -3247,12 +2926,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)  	ctrl->identified = true;  	return 0; - -out_free: -	kfree(id); -	return ret;  } -EXPORT_SYMBOL_GPL(nvme_init_identify); +EXPORT_SYMBOL_GPL(nvme_init_ctrl_finish);  static int nvme_dev_open(struct inode *inode, struct file *file)  { @@ -3286,65 +2961,6 @@ static int nvme_dev_release(struct inode *inode, struct file *file)  	return 0;  } -static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) -{ -	struct nvme_ns *ns; -	int ret; - -	down_read(&ctrl->namespaces_rwsem); -	if (list_empty(&ctrl->namespaces)) { -		ret = -ENOTTY; -		goto out_unlock; -	} - -	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); -	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { -		dev_warn(ctrl->device, -			"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); -		ret = -EINVAL; -		goto out_unlock; -	} - -	dev_warn(ctrl->device, -		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); -	kref_get(&ns->kref); -	up_read(&ctrl->namespaces_rwsem); - -	ret = nvme_user_cmd(ctrl, ns, argp); -	nvme_put_ns(ns); -	return ret; - -out_unlock: -	up_read(&ctrl->namespaces_rwsem); -	return ret; -} - -static long nvme_dev_ioctl(struct file *file, unsigned int cmd, -		unsigned long arg) -{ -	struct nvme_ctrl *ctrl = file->private_data; -	void __user *argp = (void __user *)arg; - -	switch (cmd) { -	case NVME_IOCTL_ADMIN_CMD: -		return nvme_user_cmd(ctrl, NULL, argp); -	case NVME_IOCTL_ADMIN64_CMD: -		return nvme_user_cmd64(ctrl, NULL, argp); -	case NVME_IOCTL_IO_CMD: -		return nvme_dev_user_cmd(ctrl, argp); -	case NVME_IOCTL_RESET: -		dev_warn(ctrl->device, "resetting controller\n"); -		return nvme_reset_ctrl_sync(ctrl); -	case NVME_IOCTL_SUBSYS_RESET: -		return nvme_reset_subsystem(ctrl); -	case NVME_IOCTL_RESCAN: -		nvme_queue_scan(ctrl); -		return 0; -	default: -		return -ENOTTY; -	} -} -  static const struct file_operations nvme_dev_fops = {  	.owner		= THIS_MODULE,  	.open		= nvme_dev_open, @@ -3398,13 +3014,13 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,  	int model_len = sizeof(subsys->model);  	if (!uuid_is_null(&ids->uuid)) -		return sprintf(buf, "uuid.%pU\n", &ids->uuid); +		return sysfs_emit(buf, "uuid.%pU\n", &ids->uuid);  	if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) -		return sprintf(buf, "eui.%16phN\n", ids->nguid); +		return sysfs_emit(buf, "eui.%16phN\n", ids->nguid);  	if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) -		return sprintf(buf, "eui.%8phN\n", ids->eui64); +		return sysfs_emit(buf, "eui.%8phN\n", ids->eui64);  	while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' ||  				  subsys->serial[serial_len - 1] == '\0')) @@ -3413,7 +3029,7 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,  				 subsys->model[model_len - 1] == '\0'))  		
model_len--; -	return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, +	return sysfs_emit(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id,  		serial_len, subsys->serial, model_len, subsys->model,  		head->ns_id);  } @@ -3422,7 +3038,7 @@ static DEVICE_ATTR_RO(wwid);  static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,  		char *buf)  { -	return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid); +	return sysfs_emit(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid);  }  static DEVICE_ATTR_RO(nguid); @@ -3437,23 +3053,23 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,  	if (uuid_is_null(&ids->uuid)) {  		printk_ratelimited(KERN_WARNING  				   "No UUID available providing old NGUID\n"); -		return sprintf(buf, "%pU\n", ids->nguid); +		return sysfs_emit(buf, "%pU\n", ids->nguid);  	} -	return sprintf(buf, "%pU\n", &ids->uuid); +	return sysfs_emit(buf, "%pU\n", &ids->uuid);  }  static DEVICE_ATTR_RO(uuid);  static ssize_t eui_show(struct device *dev, struct device_attribute *attr,  		char *buf)  { -	return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64); +	return sysfs_emit(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64);  }  static DEVICE_ATTR_RO(eui);  static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,  		char *buf)  { -	return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id); +	return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ns_id);  }  static DEVICE_ATTR_RO(nsid); @@ -3518,7 +3134,7 @@ static ssize_t  field##_show(struct device *dev,				\  			    struct device_attribute *attr, char *buf)		\  {										\          struct nvme_ctrl *ctrl = dev_get_drvdata(dev);				\ -        return sprintf(buf, "%.*s\n",						\ +        return sysfs_emit(buf, "%.*s\n",					\  		(int)sizeof(ctrl->subsys->field), ctrl->subsys->field);		\  }										\  static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); @@ -3532,7 +3148,7 @@ static ssize_t  field##_show(struct device *dev,				\  			    struct device_attribute *attr, char *buf)		\  {										\          struct nvme_ctrl *ctrl = dev_get_drvdata(dev);				\ -        return sprintf(buf, "%d\n", ctrl->field);	\ +        return sysfs_emit(buf, "%d\n", ctrl->field);				\  }										\  static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); @@ -3540,6 +3156,7 @@ nvme_show_int_function(cntlid);  nvme_show_int_function(numa_node);  nvme_show_int_function(queue_count);  nvme_show_int_function(sqsize); +nvme_show_int_function(kato);  static ssize_t nvme_sysfs_delete(struct device *dev,  				struct device_attribute *attr, const char *buf, @@ -3580,9 +3197,9 @@ static ssize_t nvme_sysfs_show_state(struct device *dev,  	if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&  	    state_name[ctrl->state]) -		return sprintf(buf, "%s\n", state_name[ctrl->state]); +		return sysfs_emit(buf, "%s\n", state_name[ctrl->state]); -	return sprintf(buf, "unknown state\n"); +	return sysfs_emit(buf, "unknown state\n");  }  static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL); @@ -3634,9 +3251,9 @@ static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev,  	struct nvmf_ctrl_options *opts = ctrl->opts;  	if (ctrl->opts->max_reconnects == -1) -		return sprintf(buf, "off\n"); -	return sprintf(buf, "%d\n", -			opts->max_reconnects * opts->reconnect_delay); +		return sysfs_emit(buf, "off\n"); +	return sysfs_emit(buf, "%d\n", +			  opts->max_reconnects * opts->reconnect_delay);  }  static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev, @@ -3650,7 +3267,7 @@ static 
ssize_t nvme_ctrl_loss_tmo_store(struct device *dev,  	if (err)  		return -EINVAL; -	else if (ctrl_loss_tmo < 0) +	if (ctrl_loss_tmo < 0)  		opts->max_reconnects = -1;  	else  		opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, @@ -3666,8 +3283,8 @@ static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev,  	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);  	if (ctrl->opts->reconnect_delay == -1) -		return sprintf(buf, "off\n"); -	return sprintf(buf, "%d\n", ctrl->opts->reconnect_delay); +		return sysfs_emit(buf, "off\n"); +	return sysfs_emit(buf, "%d\n", ctrl->opts->reconnect_delay);  }  static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev, @@ -3687,6 +3304,36 @@ static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev,  static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR,  	nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store); +static ssize_t nvme_ctrl_fast_io_fail_tmo_show(struct device *dev, +		struct device_attribute *attr, char *buf) +{ +	struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + +	if (ctrl->opts->fast_io_fail_tmo == -1) +		return sysfs_emit(buf, "off\n"); +	return sysfs_emit(buf, "%d\n", ctrl->opts->fast_io_fail_tmo); +} + +static ssize_t nvme_ctrl_fast_io_fail_tmo_store(struct device *dev, +		struct device_attribute *attr, const char *buf, size_t count) +{ +	struct nvme_ctrl *ctrl = dev_get_drvdata(dev); +	struct nvmf_ctrl_options *opts = ctrl->opts; +	int fast_io_fail_tmo, err; + +	err = kstrtoint(buf, 10, &fast_io_fail_tmo); +	if (err) +		return -EINVAL; + +	if (fast_io_fail_tmo < 0) +		opts->fast_io_fail_tmo = -1; +	else +		opts->fast_io_fail_tmo = fast_io_fail_tmo; +	return count; +} +static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR, +	nvme_ctrl_fast_io_fail_tmo_show, nvme_ctrl_fast_io_fail_tmo_store); +  static struct attribute *nvme_dev_attrs[] = {  	&dev_attr_reset_controller.attr,  	&dev_attr_rescan_controller.attr, @@ -3706,6 +3353,8 @@ static struct attribute *nvme_dev_attrs[] = {  	&dev_attr_hostid.attr,  	&dev_attr_ctrl_loss_tmo.attr,  	&dev_attr_reconnect_delay.attr, +	&dev_attr_fast_io_fail_tmo.attr, +	&dev_attr_kato.attr,  	NULL  }; @@ -3727,6 +3376,8 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,  		return 0;  	if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts)  		return 0; +	if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts) +		return 0;  	return a->mode;  } @@ -3749,7 +3400,7 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys,  	lockdep_assert_held(&subsys->lock);  	list_for_each_entry(h, &subsys->nsheads, entry) { -		if (h->ns_id == nsid && kref_get_unless_zero(&h->ref)) +		if (h->ns_id == nsid && nvme_tryget_ns_head(h))  			return h;  	} @@ -3772,6 +3423,66 @@ static int __nvme_check_ids(struct nvme_subsystem *subsys,  	return 0;  } +void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device) +{ +	cdev_device_del(cdev, cdev_device); +	ida_simple_remove(&nvme_ns_chr_minor_ida, MINOR(cdev_device->devt)); +} + +int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, +		const struct file_operations *fops, struct module *owner) +{ +	int minor, ret; + +	minor = ida_simple_get(&nvme_ns_chr_minor_ida, 0, 0, GFP_KERNEL); +	if (minor < 0) +		return minor; +	cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor); +	cdev_device->class = nvme_ns_chr_class; +	device_initialize(cdev_device); +	cdev_init(cdev, fops); +	cdev->owner = owner; +	ret = cdev_device_add(cdev, cdev_device); +	if (ret) +		ida_simple_remove(&nvme_ns_chr_minor_ida, minor); +	
return ret; +} + +static int nvme_ns_chr_open(struct inode *inode, struct file *file) +{ +	return nvme_ns_open(container_of(inode->i_cdev, struct nvme_ns, cdev)); +} + +static int nvme_ns_chr_release(struct inode *inode, struct file *file) +{ +	nvme_ns_release(container_of(inode->i_cdev, struct nvme_ns, cdev)); +	return 0; +} + +static const struct file_operations nvme_ns_chr_fops = { +	.owner		= THIS_MODULE, +	.open		= nvme_ns_chr_open, +	.release	= nvme_ns_chr_release, +	.unlocked_ioctl	= nvme_ns_chr_ioctl, +	.compat_ioctl	= compat_ptr_ioctl, +}; + +static int nvme_add_ns_cdev(struct nvme_ns *ns) +{ +	int ret; + +	ns->cdev_device.parent = ns->ctrl->device; +	ret = dev_set_name(&ns->cdev_device, "ng%dn%d", +			   ns->ctrl->instance, ns->head->instance); +	if (ret) +		return ret; +	ret = nvme_cdev_add(&ns->cdev, &ns->cdev_device, &nvme_ns_chr_fops, +			    ns->ctrl->ops->module); +	if (ret) +		kfree_const(ns->cdev_device.kobj.name); +	return ret; +} +  static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,  		unsigned nsid, struct nvme_ns_ids *ids)  { @@ -3877,7 +3588,8 @@ out_unlock:  	return ret;  } -static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) +static int ns_cmp(void *priv, const struct list_head *a, +		const struct list_head *b)  {  	struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);  	struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); @@ -3911,8 +3623,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,  	struct nvme_ns *ns;  	struct gendisk *disk;  	struct nvme_id_ns *id; -	char disk_name[DISK_NAME_LEN]; -	int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT; +	int node = ctrl->numa_node;  	if (nvme_identify_ns(ctrl, nsid, ids, &id))  		return; @@ -3938,7 +3649,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,  	if (nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED))  		goto out_free_queue; -	nvme_set_disk_name(disk_name, ns, ctrl, &flags);  	disk = alloc_disk_node(0, node);  	if (!disk) @@ -3947,15 +3657,22 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,  	disk->fops = &nvme_bdev_ops;  	disk->private_data = ns;  	disk->queue = ns->queue; -	disk->flags = flags; -	memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); +	disk->flags = GENHD_FL_EXT_DEVT; +	/* +	 * Without the multipath code enabled, multiple controller per +	 * subsystems are visible as devices and thus we cannot use the +	 * subsystem instance. 
+	 */ +	if (!nvme_mpath_set_disk_name(ns, disk->disk_name, &disk->flags)) +		sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, +			ns->head->instance);  	ns->disk = disk;  	if (nvme_update_ns_info(ns, id))  		goto out_put_disk;  	if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { -		if (nvme_nvm_register(ns, disk_name, node)) { +		if (nvme_nvm_register(ns, disk->disk_name, node)) {  			dev_warn(ctrl->device, "LightNVM init failure\n");  			goto out_put_disk;  		} @@ -3968,6 +3685,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,  	nvme_get_ctrl(ctrl);  	device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups); +	if (!nvme_ns_head_multipath(ns->head)) +		nvme_add_ns_cdev(ns);  	nvme_mpath_add_disk(ns, id);  	nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name); @@ -4012,6 +3731,8 @@ static void nvme_ns_remove(struct nvme_ns *ns)  	synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */  	if (ns->disk->flags & GENHD_FL_UP) { +		if (!nvme_ns_head_multipath(ns->head)) +			nvme_cdev_del(&ns->cdev, &ns->cdev_device);  		del_gendisk(ns->disk);  		blk_cleanup_queue(ns->queue);  		if (blk_get_integrity(ns->disk)) @@ -4756,6 +4477,7 @@ static inline void _nvme_check_size(void)  	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);  	BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);  	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE); +	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE);  	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);  	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);  	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); @@ -4801,8 +4523,24 @@ static int __init nvme_core_init(void)  		result = PTR_ERR(nvme_subsys_class);  		goto destroy_class;  	} + +	result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS, +				     "nvme-generic"); +	if (result < 0) +		goto destroy_subsys_class; + +	nvme_ns_chr_class = class_create(THIS_MODULE, "nvme-generic"); +	if (IS_ERR(nvme_ns_chr_class)) { +		result = PTR_ERR(nvme_ns_chr_class); +		goto unregister_generic_ns; +	} +  	return 0; +unregister_generic_ns: +	unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); +destroy_subsys_class: +	class_destroy(nvme_subsys_class);  destroy_class:  	class_destroy(nvme_class);  unregister_chrdev: @@ -4819,12 +4557,15 @@ out:  static void __exit nvme_core_exit(void)  { +	class_destroy(nvme_ns_chr_class);  	class_destroy(nvme_subsys_class);  	class_destroy(nvme_class); +	unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);  	unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);  	destroy_workqueue(nvme_delete_wq);  	destroy_workqueue(nvme_reset_wq);  	destroy_workqueue(nvme_wq); +	ida_destroy(&nvme_ns_chr_minor_ida);  	ida_destroy(&nvme_instance_ida);  } | 
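A note on the user-visible side of this diff: among other changes, it registers a per-namespace generic character device via nvme_add_ns_cdev()/nvme_cdev_add(), named with the "ng%dn%d" pattern, so a namespace remains reachable for passthrough even when its block device is hidden. The following is a minimal userspace sketch, not part of the patch: it assumes a node such as /dev/ng0n1 exists and that the char device routes controller passthrough ioctls the same way the block node does (requires CAP_SYS_ADMIN).

```c
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/nvme_ioctl.h>

int main(void)
{
	struct nvme_admin_cmd cmd;	/* passthrough command from <linux/nvme_ioctl.h> */
	unsigned char id[4096];		/* Identify data buffer */
	int fd, ret;

	/* /dev/ng0n1 is a hypothetical node following the ng%dn%d naming */
	fd = open("/dev/ng0n1", O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&cmd, 0, sizeof(cmd));
	cmd.opcode   = 0x06;				/* Identify */
	cmd.nsid     = 1;				/* namespace 1 */
	cmd.cdw10    = 0;				/* CNS 0: Identify Namespace */
	cmd.addr     = (uintptr_t)id;
	cmd.data_len = sizeof(id);

	/* returns <0 on ioctl failure, otherwise the NVMe status code */
	ret = ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
	if (ret < 0)
		perror("NVME_IOCTL_ADMIN_CMD");
	else
		printf("identify completed, NVMe status: %d\n", ret);

	close(fd);
	return ret < 0 ? 1 : 0;
}
```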
