diff options
-rw-r--r-- | block/blk-throttle.c | 3 | ||||
-rw-r--r-- | drivers/block/nvme-core.c | 496 | ||||
-rw-r--r-- | drivers/block/nvme-scsi.c | 96 | ||||
-rw-r--r-- | include/linux/nvme.h | 9 | ||||
-rw-r--r-- | include/uapi/linux/nvme.h | 26 |
5 files changed, 417 insertions, 213 deletions
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 9273d0969ebd..5b9c6d5c3636 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1292,6 +1292,9 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, struct blkg_rwstat rwstat = { }, tmp; int i, cpu; + if (tg->stats_cpu == NULL) + return 0; + for_each_possible_cpu(cpu) { struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index cbdfbbf98392..b64bccbb78c9 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -37,17 +37,18 @@ #include <linux/ptrace.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/t10-pi.h> #include <linux/types.h> #include <scsi/sg.h> #include <asm-generic/io-64-nonatomic-lo-hi.h> +#define NVME_MINORS (1U << MINORBITS) #define NVME_Q_DEPTH 1024 #define NVME_AQ_DEPTH 64 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define ADMIN_TIMEOUT (admin_timeout * HZ) #define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) -#define IOD_TIMEOUT (retry_time * HZ) static unsigned char admin_timeout = 60; module_param(admin_timeout, byte, 0644); @@ -57,10 +58,6 @@ unsigned char nvme_io_timeout = 30; module_param_named(io_timeout, nvme_io_timeout, byte, 0644); MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); -static unsigned char retry_time = 30; -module_param(retry_time, byte, 0644); -MODULE_PARM_DESC(retry_time, "time in seconds to retry failed I/O"); - static unsigned char shutdown_timeout = 5; module_param(shutdown_timeout, byte, 0644); MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); @@ -68,6 +65,9 @@ MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown") static int nvme_major; module_param(nvme_major, int, 0); +static int nvme_char_major; +module_param(nvme_char_major, int, 0); + static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0); @@ -76,7 +76,8 @@ static LIST_HEAD(dev_list); static struct task_struct *nvme_thread; static struct workqueue_struct *nvme_workq; static wait_queue_head_t nvme_kthread_wait; -static struct notifier_block nvme_nb; + +static struct class *nvme_class; static void nvme_reset_failed_dev(struct work_struct *ws); static int nvme_process_cq(struct nvme_queue *nvmeq); @@ -95,7 +96,6 @@ struct async_cmd_info { * commands and one for I/O commands). */ struct nvme_queue { - struct llist_node node; struct device *q_dmadev; struct nvme_dev *dev; char irqname[24]; /* nvme4294967295-65535\0 */ @@ -482,6 +482,62 @@ static int nvme_error_status(u16 status) } } +static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) +{ + if (be32_to_cpu(pi->ref_tag) == v) + pi->ref_tag = cpu_to_be32(p); +} + +static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) +{ + if (be32_to_cpu(pi->ref_tag) == p) + pi->ref_tag = cpu_to_be32(v); +} + +/** + * nvme_dif_remap - remaps ref tags to bip seed and physical lba + * + * The virtual start sector is the one that was originally submitted by the + * block layer. Due to partitioning, MD/DM cloning, etc. the actual physical + * start sector may be different. Remap protection information to match the + * physical LBA on writes, and back to the original seed on reads. + * + * Type 0 and 3 do not have a ref tag, so no remapping required. + */ +static void nvme_dif_remap(struct request *req, + void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) +{ + struct nvme_ns *ns = req->rq_disk->private_data; + struct bio_integrity_payload *bip; + struct t10_pi_tuple *pi; + void *p, *pmap; + u32 i, nlb, ts, phys, virt; + + if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3) + return; + + bip = bio_integrity(req->bio); + if (!bip) + return; + + pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; + if (!pmap) + return; + + p = pmap; + virt = bip_get_seed(bip); + phys = nvme_block_nr(ns, blk_rq_pos(req)); + nlb = (blk_rq_bytes(req) >> ns->lba_shift); + ts = ns->disk->integrity->tuple_size; + + for (i = 0; i < nlb; i++, virt++, phys++) { + pi = (struct t10_pi_tuple *)p; + dif_swap(phys, virt, pi); + p += ts; + } + kunmap_atomic(pmap); +} + static void req_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { @@ -512,9 +568,16 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, "completing aborted command with status:%04x\n", status); - if (iod->nents) + if (iod->nents) { dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents, rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + if (blk_integrity_rq(req)) { + if (!rq_data_dir(req)) + nvme_dif_remap(req, nvme_dif_complete); + dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->meta_sg, 1, + rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + } + } nvme_free_iod(nvmeq->dev, iod); blk_mq_complete_request(req); @@ -670,6 +733,24 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + + if (blk_integrity_rq(req)) { + cmnd->rw.metadata = cpu_to_le64(sg_dma_address(iod->meta_sg)); + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + control |= NVME_RW_PRINFO_PRCHK_GUARD; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + control |= NVME_RW_PRINFO_PRCHK_GUARD | + NVME_RW_PRINFO_PRCHK_REF; + cmnd->rw.reftag = cpu_to_le32( + nvme_block_nr(ns, blk_rq_pos(req))); + break; + } + } else if (ns->ms) + control |= NVME_RW_PRINFO_PRACT; + cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); @@ -690,6 +771,19 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_iod *iod; enum dma_data_direction dma_dir; + /* + * If formated with metadata, require the block layer provide a buffer + * unless this namespace is formated such that the metadata can be + * stripped/generated by the controller with PRACT=1. + */ + if (ns->ms && !blk_integrity_rq(req)) { + if (!(ns->pi_type && ns->ms == 8)) { + req->errors = -EFAULT; + blk_mq_complete_request(req); + return BLK_MQ_RQ_QUEUE_OK; + } + } + iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC); if (!iod) return BLK_MQ_RQ_QUEUE_BUSY; @@ -725,6 +819,21 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, iod->nents, dma_dir); goto retry_cmd; } + if (blk_integrity_rq(req)) { + if (blk_rq_count_integrity_sg(req->q, req->bio) != 1) + goto error_cmd; + + sg_init_table(iod->meta_sg, 1); + if (blk_rq_map_integrity_sg( + req->q, req->bio, iod->meta_sg) != 1) + goto error_cmd; + + if (rq_data_dir(req)) + nvme_dif_remap(req, nvme_dif_prep); + + if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir)) + goto error_cmd; + } } nvme_set_info(cmd, iod, req_completion); @@ -817,14 +926,6 @@ static irqreturn_t nvme_irq_check(int irq, void *data) return IRQ_WAKE_THREAD; } -static void nvme_abort_cmd_info(struct nvme_queue *nvmeq, struct nvme_cmd_info * - cmd_info) -{ - spin_lock_irq(&nvmeq->q_lock); - cancel_cmd_info(cmd_info, NULL); - spin_unlock_irq(&nvmeq->q_lock); -} - struct sync_cmd_info { struct task_struct *task; u32 result; @@ -847,7 +948,6 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx, static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd, u32 *result, unsigned timeout) { - int ret; struct sync_cmd_info cmdinfo; struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = cmd_rq->nvmeq; @@ -859,29 +959,12 @@ static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd, nvme_set_info(cmd_rq, &cmdinfo, sync_completion); - set_current_state(TASK_KILLABLE); - ret = nvme_submit_cmd(nvmeq, cmd); - if (ret) { - nvme_finish_cmd(nvmeq, req->tag, NULL); - set_current_state(TASK_RUNNING); - } - ret = schedule_timeout(timeout); - - /* - * Ensure that sync_completion has either run, or that it will - * never run. - */ - nvme_abort_cmd_info(nvmeq, blk_mq_rq_to_pdu(req)); - - /* - * We never got the completion - */ - if (cmdinfo.status == -EINTR) - return -EINTR; + set_current_state(TASK_UNINTERRUPTIBLE); + nvme_submit_cmd(nvmeq, cmd); + schedule(); if (result) *result = cmdinfo.result; - return cmdinfo.status; } @@ -1158,29 +1241,18 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = cmd->nvmeq; - /* - * The aborted req will be completed on receiving the abort req. - * We enable the timer again. If hit twice, it'll cause a device reset, - * as the device then is in a faulty state. - */ - int ret = BLK_EH_RESET_TIMER; - dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag, nvmeq->qid); - spin_lock_irq(&nvmeq->q_lock); - if (!nvmeq->dev->initialized) { - /* - * Force cancelled command frees the request, which requires we - * return BLK_EH_NOT_HANDLED. - */ - nvme_cancel_queue_ios(nvmeq->hctx, req, nvmeq, reserved); - ret = BLK_EH_NOT_HANDLED; - } else - nvme_abort_req(req); + nvme_abort_req(req); spin_unlock_irq(&nvmeq->q_lock); - return ret; + /* + * The aborted req will be completed on receiving the abort req. + * We enable the timer again. If hit twice, it'll cause a device reset, + * as the device then is in a faulty state. + */ + return BLK_EH_RESET_TIMER; } static void nvme_free_queue(struct nvme_queue *nvmeq) @@ -1233,7 +1305,6 @@ static void nvme_clear_queue(struct nvme_queue *nvmeq) struct blk_mq_hw_ctx *hctx = nvmeq->hctx; spin_lock_irq(&nvmeq->q_lock); - nvme_process_cq(nvmeq); if (hctx && hctx->tags) blk_mq_tag_busy_iter(hctx, nvme_cancel_queue_ios, nvmeq); spin_unlock_irq(&nvmeq->q_lock); @@ -1256,7 +1327,10 @@ static void nvme_disable_queue(struct nvme_dev *dev, int qid) } if (!qid && dev->admin_q) blk_mq_freeze_queue_start(dev->admin_q); - nvme_clear_queue(nvmeq); + + spin_lock_irq(&nvmeq->q_lock); + nvme_process_cq(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); } static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, @@ -1875,13 +1949,61 @@ static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo) return 0; } +static void nvme_config_discard(struct nvme_ns *ns) +{ + u32 logical_block_size = queue_logical_block_size(ns->queue); + ns->queue->limits.discard_zeroes_data = 0; + ns->queue->limits.discard_alignment = logical_block_size; + ns->queue->limits.discard_granularity = logical_block_size; + ns->queue->limits.max_discard_sectors = 0xffffffff; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); +} + +static int nvme_noop_verify(struct blk_integrity_iter *iter) +{ + return 0; +} + +static int nvme_noop_generate(struct blk_integrity_iter *iter) +{ + return 0; +} + +struct blk_integrity nvme_meta_noop = { + .name = "NVME_META_NOOP", + .generate_fn = nvme_noop_generate, + .verify_fn = nvme_noop_verify, +}; + +static void nvme_init_integrity(struct nvme_ns *ns) +{ + struct blk_integrity integrity; + + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + integrity = t10_pi_type3_crc; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + integrity = t10_pi_type1_crc; + break; + default: + integrity = nvme_meta_noop; + break; + } + integrity.tuple_size = ns->ms; + blk_integrity_register(ns->disk, &integrity); + blk_queue_max_integrity_segments(ns->queue, 1); +} + static int nvme_revalidate_disk(struct gendisk *disk) { struct nvme_ns *ns = disk->private_data; struct nvme_dev *dev = ns->dev; struct nvme_id_ns *id; dma_addr_t dma_addr; - int lbaf; + int lbaf, pi_type, old_ms; + unsigned short bs; id = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr, GFP_KERNEL); @@ -1890,16 +2012,50 @@ static int nvme_revalidate_disk(struct gendisk *disk) __func__); return 0; } + if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) { + dev_warn(&dev->pci_dev->dev, + "identify failed ns:%d, setting capacity to 0\n", + ns->ns_id); + memset(id, 0, sizeof(*id)); + } - if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) - goto free; - - lbaf = id->flbas & 0xf; + old_ms = ns->ms; + lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; ns->lba_shift = id->lbaf[lbaf].ds; + ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); + + /* + * If identify namespace failed, use default 512 byte block size so + * block layer can use before failing read/write for 0 capacity. + */ + if (ns->lba_shift == 0) + ns->lba_shift = 9; + bs = 1 << ns->lba_shift; + + /* XXX: PI implementation requires metadata equal t10 pi tuple size */ + pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? + id->dps & NVME_NS_DPS_PI_MASK : 0; + + if (disk->integrity && (ns->pi_type != pi_type || ns->ms != old_ms || + bs != queue_logical_block_size(disk->queue) || + (ns->ms && id->flbas & NVME_NS_FLBAS_META_EXT))) + blk_integrity_unregister(disk); + + ns->pi_type = pi_type; + blk_queue_logical_block_size(ns->queue, bs); + + if (ns->ms && !disk->integrity && (disk->flags & GENHD_FL_UP) && + !(id->flbas & NVME_NS_FLBAS_META_EXT)) + nvme_init_integrity(ns); + + if (id->ncap == 0 || (ns->ms && !disk->integrity)) + set_capacity(disk, 0); + else + set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); + + if (dev->oncs & NVME_CTRL_ONCS_DSM) + nvme_config_discard(ns); - blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); - set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); - free: dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr); return 0; } @@ -1923,8 +2079,7 @@ static int nvme_kthread(void *data) spin_lock(&dev_list_lock); list_for_each_entry_safe(dev, next, &dev_list, node) { int i; - if (readl(&dev->bar->csts) & NVME_CSTS_CFS && - dev->initialized) { + if (readl(&dev->bar->csts) & NVME_CSTS_CFS) { if (work_busy(&dev->reset_work)) continue; list_del_init(&dev->node); @@ -1956,30 +2111,16 @@ static int nvme_kthread(void *data) return 0; } -static void nvme_config_discard(struct nvme_ns *ns) -{ - u32 logical_block_size = queue_logical_block_size(ns->queue); - ns->queue->limits.discard_zeroes_data = 0; - ns->queue->limits.discard_alignment = logical_block_size; - ns->queue->limits.discard_granularity = logical_block_size; - ns->queue->limits.max_discard_sectors = 0xffffffff; - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); -} - -static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, - struct nvme_id_ns *id, struct nvme_lba_range_type *rt) +static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) { struct nvme_ns *ns; struct gendisk *disk; int node = dev_to_node(&dev->pci_dev->dev); - int lbaf; - - if (rt->attributes & NVME_LBART_ATTRIB_HIDE) - return NULL; ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) - return NULL; + return; + ns->queue = blk_mq_init_queue(&dev->tagset); if (IS_ERR(ns->queue)) goto out_free_ns; @@ -1995,9 +2136,9 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, ns->ns_id = nsid; ns->disk = disk; - lbaf = id->flbas & 0xf; - ns->lba_shift = id->lbaf[lbaf].ds; - ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); + ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ + list_add_tail(&ns->list, &dev->namespaces); + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); if (dev->max_hw_sectors) blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); @@ -2011,21 +2152,26 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, disk->fops = &nvme_fops; disk->private_data = ns; disk->queue = ns->queue; - disk->driverfs_dev = &dev->pci_dev->dev; + disk->driverfs_dev = dev->device; disk->flags = GENHD_FL_EXT_DEVT; sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); - set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); - - if (dev->oncs & NVME_CTRL_ONCS_DSM) - nvme_config_discard(ns); - - return ns; + /* + * Initialize capacity to 0 until we establish the namespace format and + * setup integrity extentions if necessary. The revalidate_disk after + * add_disk allows the driver to register with integrity if the format + * requires it. + */ + set_capacity(disk, 0); + nvme_revalidate_disk(ns->disk); + add_disk(ns->disk); + if (ns->ms) + revalidate_disk(ns->disk); + return; out_free_queue: blk_cleanup_queue(ns->queue); out_free_ns: kfree(ns); - return NULL; } static void nvme_create_io_queues(struct nvme_dev *dev) @@ -2150,22 +2296,20 @@ static int nvme_dev_add(struct nvme_dev *dev) struct pci_dev *pdev = dev->pci_dev; int res; unsigned nn, i; - struct nvme_ns *ns; struct nvme_id_ctrl *ctrl; - struct nvme_id_ns *id_ns; void *mem; dma_addr_t dma_addr; int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; - mem = dma_alloc_coherent(&pdev->dev, 8192, &dma_addr, GFP_KERNEL); + mem = dma_alloc_coherent(&pdev->dev, 4096, &dma_addr, GFP_KERNEL); if (!mem) return -ENOMEM; res = nvme_identify(dev, 0, 1, dma_addr); if (res) { dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res); - res = -EIO; - goto out; + dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr); + return -EIO; } ctrl = mem; @@ -2191,6 +2335,7 @@ static int nvme_dev_add(struct nvme_dev *dev) } else dev->max_hw_sectors = max_hw_sectors; } + dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr); dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; @@ -2203,33 +2348,12 @@ static int nvme_dev_add(struct nvme_dev *dev) dev->tagset.driver_data = dev; if (blk_mq_alloc_tag_set(&dev->tagset)) - goto out; - - id_ns = mem; - for (i = 1; i <= nn; i++) { - res = nvme_identify(dev, i, 0, dma_addr); - if (res) - continue; - - if (id_ns->ncap == 0) - continue; - - res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i, - dma_addr + 4096, NULL); - if (res) - memset(mem + 4096, 0, 4096); + return 0; - ns = nvme_alloc_ns(dev, i, mem, mem + 4096); - if (ns) - list_add_tail(&ns->list, &dev->namespaces); - } - list_for_each_entry(ns, &dev->namespaces, list) - add_disk(ns->disk); - res = 0; + for (i = 1; i <= nn; i++) + nvme_alloc_ns(dev, i); - out: - dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr); - return res; + return 0; } static int nvme_dev_map(struct nvme_dev *dev) @@ -2358,8 +2482,6 @@ static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq) static void nvme_del_queue_end(struct nvme_queue *nvmeq) { struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx; - - nvme_clear_queue(nvmeq); nvme_put_dq(dq); } @@ -2502,7 +2624,6 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) int i; u32 csts = -1; - dev->initialized = 0; nvme_dev_list_remove(dev); if (dev->bar) { @@ -2513,7 +2634,6 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) for (i = dev->queue_count - 1; i >= 0; i--) { struct nvme_queue *nvmeq = dev->queues[i]; nvme_suspend_queue(nvmeq); - nvme_clear_queue(nvmeq); } } else { nvme_disable_io_queues(dev); @@ -2521,6 +2641,9 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) nvme_disable_queue(dev, 0); } nvme_dev_unmap(dev); + + for (i = dev->queue_count - 1; i >= 0; i--) + nvme_clear_queue(dev->queues[i]); } static void nvme_dev_remove(struct nvme_dev *dev) @@ -2528,8 +2651,11 @@ static void nvme_dev_remove(struct nvme_dev *dev) struct nvme_ns *ns; list_for_each_entry(ns, &dev->namespaces, list) { - if (ns->disk->flags & GENHD_FL_UP) + if (ns->disk->flags & GENHD_FL_UP) { + if (ns->disk->integrity) + blk_integrity_unregister(ns->disk); del_gendisk(ns->disk); + } if (!blk_queue_dying(ns->queue)) { blk_mq_abort_requeue_list(ns->queue); blk_cleanup_queue(ns->queue); @@ -2611,6 +2737,7 @@ static void nvme_free_dev(struct kref *kref) struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); pci_dev_put(dev->pci_dev); + put_device(dev->device); nvme_free_namespaces(dev); nvme_release_instance(dev); blk_mq_free_tag_set(&dev->tagset); @@ -2622,11 +2749,27 @@ static void nvme_free_dev(struct kref *kref) static int nvme_dev_open(struct inode *inode, struct file *f) { - struct nvme_dev *dev = container_of(f->private_data, struct nvme_dev, - miscdev); - kref_get(&dev->kref); - f->private_data = dev; - return 0; + struct nvme_dev *dev; + int instance = iminor(inode); + int ret = -ENODEV; + + spin_lock(&dev_list_lock); + list_for_each_entry(dev, &dev_list, node) { + if (dev->instance == instance) { + if (!dev->admin_q) { + ret = -EWOULDBLOCK; + break; + } + if (!kref_get_unless_zero(&dev->kref)) + break; + f->private_data = dev; + ret = 0; + break; + } + } + spin_unlock(&dev_list_lock); + + return ret; } static int nvme_dev_release(struct inode *inode, struct file *f) @@ -2768,7 +2911,6 @@ static int nvme_dev_resume(struct nvme_dev *dev) nvme_unfreeze_queues(dev); nvme_set_irq_hints(dev); } - dev->initialized = 1; return 0; } @@ -2799,6 +2941,7 @@ static void nvme_reset_workfn(struct work_struct *work) dev->reset_workfn(work); } +static void nvme_async_probe(struct work_struct *work); static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { int node, result = -ENOMEM; @@ -2834,37 +2977,20 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto release; kref_init(&dev->kref); - result = nvme_dev_start(dev); - if (result) + dev->device = device_create(nvme_class, &pdev->dev, + MKDEV(nvme_char_major, dev->instance), + dev, "nvme%d", dev->instance); + if (IS_ERR(dev->device)) { + result = PTR_ERR(dev->device); goto release_pools; + } + get_device(dev->device); - if (dev->online_queues > 1) - result = nvme_dev_add(dev); - if (result) - goto shutdown; - - scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance); - dev->miscdev.minor = MISC_DYNAMIC_MINOR; - dev->miscdev.parent = &pdev->dev; - dev->miscdev.name = dev->name; - dev->miscdev.fops = &nvme_dev_fops; - result = misc_register(&dev->miscdev); - if (result) - goto remove; - - nvme_set_irq_hints(dev); - - dev->initialized = 1; + INIT_WORK(&dev->probe_work, nvme_async_probe); + schedule_work(&dev->probe_work); return 0; - remove: - nvme_dev_remove(dev); - nvme_dev_remove_admin(dev); - nvme_free_namespaces(dev); - shutdown: - nvme_dev_shutdown(dev); release_pools: - nvme_free_queues(dev, 0); nvme_release_prp_pools(dev); release: nvme_release_instance(dev); @@ -2877,6 +3003,29 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) return result; } +static void nvme_async_probe(struct work_struct *work) +{ + struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work); + int result; + + result = nvme_dev_start(dev); + if (result) + goto reset; + + if (dev->online_queues > 1) + result = nvme_dev_add(dev); + if (result) + goto reset; + + nvme_set_irq_hints(dev); + return; + reset: + if (!work_busy(&dev->reset_work)) { + dev->reset_workfn = nvme_reset_failed_dev; + queue_work(nvme_workq, &dev->reset_work); + } +} + static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) { struct nvme_dev *dev = pci_get_drvdata(pdev); @@ -2902,11 +3051,12 @@ static void nvme_remove(struct pci_dev *pdev) spin_unlock(&dev_list_lock); pci_set_drvdata(pdev, NULL); + flush_work(&dev->probe_work); flush_work(&dev->reset_work); - misc_deregister(&dev->miscdev); nvme_dev_shutdown(dev); nvme_dev_remove(dev); nvme_dev_remove_admin(dev); + device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); nvme_free_queues(dev, 0); nvme_release_prp_pools(dev); kref_put(&dev->kref, nvme_free_dev); @@ -2990,11 +3140,26 @@ static int __init nvme_init(void) else if (result > 0) nvme_major = result; + result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", + &nvme_dev_fops); + if (result < 0) + goto unregister_blkdev; + else if (result > 0) + nvme_char_major = result; + + nvme_class = class_create(THIS_MODULE, "nvme"); + if (!nvme_class) + goto unregister_chrdev; + result = pci_register_driver(&nvme_driver); if (result) - goto unregister_blkdev; + goto destroy_class; return 0; + destroy_class: + class_destroy(nvme_class); + unregister_chrdev: + __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); unregister_blkdev: unregister_blkdev(nvme_major, "nvme"); kill_workq: @@ -3005,9 +3170,10 @@ static int __init nvme_init(void) static void __exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); - unregister_hotcpu_notifier(&nvme_nb); unregister_blkdev(nvme_major, "nvme"); destroy_workqueue(nvme_workq); + class_destroy(nvme_class); + __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); _nvme_check_size(); } diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 5e78568026c3..e10196e0182d 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -779,10 +779,8 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, struct nvme_dev *dev = ns->dev; dma_addr_t dma_addr; void *mem; - struct nvme_id_ctrl *id_ctrl; int res = SNTI_TRANSLATION_SUCCESS; int nvme_sc; - u8 ieee[4]; int xfer_len; __be32 tmp_id = cpu_to_be32(ns->ns_id); @@ -793,46 +791,60 @@ static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, goto out_dma; } - /* nvme controller identify */ - nvme_sc = nvme_identify(dev, 0, 1, dma_addr); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_free; - if (nvme_sc) { - res = nvme_sc; - goto out_free; - } - id_ctrl = mem; - - /* Since SCSI tried to save 4 bits... [SPC-4(r34) Table 591] */ - ieee[0] = id_ctrl->ieee[0] << 4; - ieee[1] = id_ctrl->ieee[0] >> 4 | id_ctrl->ieee[1] << 4; - ieee[2] = id_ctrl->ieee[1] >> 4 | id_ctrl->ieee[2] << 4; - ieee[3] = id_ctrl->ieee[2] >> 4; - - memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); + memset(inq_response, 0, alloc_len); inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; /* Page Code */ - inq_response[3] = 20; /* Page Length */ - /* Designation Descriptor start */ - inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ - inq_response[5] = 0x03; /* PIV=0b | Asso=00b | Designator Type=3h */ - inq_response[6] = 0x00; /* Rsvd */ - inq_response[7] = 16; /* Designator Length */ - /* Designator start */ - inq_response[8] = 0x60 | ieee[3]; /* NAA=6h | IEEE ID MSB, High nibble*/ - inq_response[9] = ieee[2]; /* IEEE ID */ - inq_response[10] = ieee[1]; /* IEEE ID */ - inq_response[11] = ieee[0]; /* IEEE ID| Vendor Specific ID... */ - inq_response[12] = (dev->pci_dev->vendor & 0xFF00) >> 8; - inq_response[13] = (dev->pci_dev->vendor & 0x00FF); - inq_response[14] = dev->serial[0]; - inq_response[15] = dev->serial[1]; - inq_response[16] = dev->model[0]; - inq_response[17] = dev->model[1]; - memcpy(&inq_response[18], &tmp_id, sizeof(u32)); - /* Last 2 bytes are zero */ + if (readl(&dev->bar->vs) >= NVME_VS(1, 1)) { + struct nvme_id_ns *id_ns = mem; + void *eui = id_ns->eui64; + int len = sizeof(id_ns->eui64); - xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_free; + if (nvme_sc) { + res = nvme_sc; + goto out_free; + } + + if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) { + if (bitmap_empty(eui, len * 8)) { + eui = id_ns->nguid; + len = sizeof(id_ns->nguid); + } + } + if (bitmap_empty(eui, len * 8)) + goto scsi_string; + + inq_response[3] = 4 + len; /* Page Length */ + /* Designation Descriptor start */ + inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ + inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = len; /* Designator Length */ + memcpy(&inq_response[8], eui, len); + } else { + scsi_string: + if (alloc_len < 72) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_free; + } + inq_response[3] = 0x48; /* Page Length */ + /* Designation Descriptor start */ + inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */ + inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = 0x44; /* Designator Length */ + + sprintf(&inq_response[8], "%04x", dev->pci_dev->vendor); + memcpy(&inq_response[12], dev->model, sizeof(dev->model)); + sprintf(&inq_response[52], "%04x", tmp_id); + memcpy(&inq_response[56], dev->serial, sizeof(dev->serial)); + } + xfer_len = alloc_len; res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); out_free: @@ -1600,7 +1612,7 @@ static inline void nvme_trans_modesel_get_bd_len(u8 *parm_list, u8 cdb10, /* 10 Byte CDB */ *bd_len = (parm_list[MODE_SELECT_10_BD_OFFSET] << 8) + parm_list[MODE_SELECT_10_BD_OFFSET + 1]; - *llbaa = parm_list[MODE_SELECT_10_LLBAA_OFFSET] && + *llbaa = parm_list[MODE_SELECT_10_LLBAA_OFFSET] & MODE_SELECT_10_LLBAA_MASK; } else { /* 6 Byte CDB */ @@ -2222,7 +2234,7 @@ static int nvme_trans_inquiry(struct nvme_ns *ns, struct sg_io_hdr *hdr, page_code = GET_INQ_PAGE_CODE(cmd); alloc_len = GET_INQ_ALLOC_LENGTH(cmd); - inq_response = kmalloc(STANDARD_INQUIRY_LENGTH, GFP_KERNEL); + inq_response = kmalloc(alloc_len, GFP_KERNEL); if (inq_response == NULL) { res = -ENOMEM; goto out_mem; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 19a5d4b23209..0adad4a5419b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -17,7 +17,6 @@ #include <uapi/linux/nvme.h> #include <linux/pci.h> -#include <linux/miscdevice.h> #include <linux/kref.h> #include <linux/blk-mq.h> @@ -62,8 +61,6 @@ enum { NVME_CSTS_SHST_MASK = 3 << 2, }; -#define NVME_VS(major, minor) (major << 16 | minor) - extern unsigned char nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) @@ -91,9 +88,10 @@ struct nvme_dev { struct nvme_bar __iomem *bar; struct list_head namespaces; struct kref kref; - struct miscdevice miscdev; + struct device *device; work_func_t reset_workfn; struct work_struct reset_work; + struct work_struct probe_work; char name[12]; char serial[20]; char model[40]; @@ -105,7 +103,6 @@ struct nvme_dev { u16 abort_limit; u8 event_limit; u8 vwc; - u8 initialized; }; /* @@ -121,6 +118,7 @@ struct nvme_ns { unsigned ns_id; int lba_shift; int ms; + int pi_type; u64 mode_select_num_blocks; u32 mode_select_block_len; }; @@ -138,6 +136,7 @@ struct nvme_iod { int nents; /* Used in scatterlist */ int length; /* Of data, in bytes */ dma_addr_t first_dma; + struct scatterlist meta_sg[1]; /* metadata requires single contiguous buffer */ struct scatterlist sg[0]; }; diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 26386cf3db44..aef9a81b2d75 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -115,7 +115,13 @@ struct nvme_id_ns { __le16 nawun; __le16 nawupf; __le16 nacwu; - __u8 rsvd40[80]; + __le16 nabsn; + __le16 nabo; + __le16 nabspf; + __u16 rsvd46; + __le64 nvmcap[2]; + __u8 rsvd64[40]; + __u8 nguid[16]; __u8 eui64[8]; struct nvme_lbaf lbaf[16]; __u8 rsvd192[192]; @@ -124,10 +130,22 @@ struct nvme_id_ns { enum { NVME_NS_FEAT_THIN = 1 << 0, + NVME_NS_FLBAS_LBA_MASK = 0xf, + NVME_NS_FLBAS_META_EXT = 0x10, NVME_LBAF_RP_BEST = 0, NVME_LBAF_RP_BETTER = 1, NVME_LBAF_RP_GOOD = 2, NVME_LBAF_RP_DEGRADED = 3, + NVME_NS_DPC_PI_LAST = 1 << 4, + NVME_NS_DPC_PI_FIRST = 1 << 3, + NVME_NS_DPC_PI_TYPE3 = 1 << 2, + NVME_NS_DPC_PI_TYPE2 = 1 << 1, + NVME_NS_DPC_PI_TYPE1 = 1 << 0, + NVME_NS_DPS_PI_FIRST = 1 << 3, + NVME_NS_DPS_PI_MASK = 0x7, + NVME_NS_DPS_PI_TYPE1 = 1, + NVME_NS_DPS_PI_TYPE2 = 2, + NVME_NS_DPS_PI_TYPE3 = 3, }; struct nvme_smart_log { @@ -261,6 +279,10 @@ enum { NVME_RW_DSM_LATENCY_LOW = 3 << 4, NVME_RW_DSM_SEQ_REQ = 1 << 6, NVME_RW_DSM_COMPRESSED = 1 << 7, + NVME_RW_PRINFO_PRCHK_REF = 1 << 10, + NVME_RW_PRINFO_PRCHK_APP = 1 << 11, + NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, + NVME_RW_PRINFO_PRACT = 1 << 13, }; struct nvme_dsm_cmd { @@ -549,6 +571,8 @@ struct nvme_passthru_cmd { __u32 result; }; +#define NVME_VS(major, minor) (((major) << 16) | ((minor) << 8)) + #define nvme_admin_cmd nvme_passthru_cmd #define NVME_IOCTL_ID _IO('N', 0x40) |