summaryrefslogtreecommitdiff
path: root/drivers/block
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/null_blk.c97
-rw-r--r--drivers/block/nvme-core.c610
-rw-r--r--drivers/block/nvme-scsi.c147
-rw-r--r--drivers/block/virtio_blk.c7
-rw-r--r--drivers/block/xen-blkback/blkback.c81
-rw-r--r--drivers/block/xen-blkback/common.h5
-rw-r--r--drivers/block/xen-blkback/xenbus.c14
-rw-r--r--drivers/block/xen-blkfront.c11
8 files changed, 766 insertions, 206 deletions
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 3107282a9741..091b9ea14feb 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -60,7 +60,9 @@ enum {
NULL_IRQ_NONE = 0,
NULL_IRQ_SOFTIRQ = 1,
NULL_IRQ_TIMER = 2,
+};
+enum {
NULL_Q_BIO = 0,
NULL_Q_RQ = 1,
NULL_Q_MQ = 2,
@@ -172,18 +174,20 @@ static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait)
static void end_cmd(struct nullb_cmd *cmd)
{
- if (cmd->rq) {
- if (queue_mode == NULL_Q_MQ)
- blk_mq_end_io(cmd->rq, 0);
- else {
- INIT_LIST_HEAD(&cmd->rq->queuelist);
- blk_end_request_all(cmd->rq, 0);
- }
- } else if (cmd->bio)
+ switch (queue_mode) {
+ case NULL_Q_MQ:
+ blk_mq_end_io(cmd->rq, 0);
+ return;
+ case NULL_Q_RQ:
+ INIT_LIST_HEAD(&cmd->rq->queuelist);
+ blk_end_request_all(cmd->rq, 0);
+ break;
+ case NULL_Q_BIO:
bio_endio(cmd->bio, 0);
+ break;
+ }
- if (queue_mode != NULL_Q_MQ)
- free_cmd(cmd);
+ free_cmd(cmd);
}
static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
@@ -195,6 +199,7 @@ static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
cq = &per_cpu(completion_queues, smp_processor_id());
while ((entry = llist_del_all(&cq->list)) != NULL) {
+ entry = llist_reverse_order(entry);
do {
cmd = container_of(entry, struct nullb_cmd, ll_list);
end_cmd(cmd);
@@ -221,61 +226,31 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd)
static void null_softirq_done_fn(struct request *rq)
{
- blk_end_request_all(rq, 0);
-}
-
-#ifdef CONFIG_SMP
-
-static void null_ipi_cmd_end_io(void *data)
-{
- struct completion_queue *cq;
- struct llist_node *entry, *next;
- struct nullb_cmd *cmd;
-
- cq = &per_cpu(completion_queues, smp_processor_id());
-
- entry = llist_del_all(&cq->list);
-
- while (entry) {
- next = entry->next;
- cmd = llist_entry(entry, struct nullb_cmd, ll_list);
- end_cmd(cmd);
- entry = next;
- }
-}
-
-static void null_cmd_end_ipi(struct nullb_cmd *cmd)
-{
- struct call_single_data *data = &cmd->csd;
- int cpu = get_cpu();
- struct completion_queue *cq = &per_cpu(completion_queues, cpu);
-
- cmd->ll_list.next = NULL;
-
- if (llist_add(&cmd->ll_list, &cq->list)) {
- data->func = null_ipi_cmd_end_io;
- data->flags = 0;
- __smp_call_function_single(cpu, data, 0);
- }
-
- put_cpu();
+ end_cmd(rq->special);
}
-#endif /* CONFIG_SMP */
-
static inline void null_handle_cmd(struct nullb_cmd *cmd)
{
/* Complete IO by inline, softirq or timer */
switch (irqmode) {
- case NULL_IRQ_NONE:
- end_cmd(cmd);
- break;
case NULL_IRQ_SOFTIRQ:
-#ifdef CONFIG_SMP
- null_cmd_end_ipi(cmd);
-#else
+ switch (queue_mode) {
+ case NULL_Q_MQ:
+ blk_mq_complete_request(cmd->rq);
+ break;
+ case NULL_Q_RQ:
+ blk_complete_request(cmd->rq);
+ break;
+ case NULL_Q_BIO:
+ /*
+ * XXX: no proper submitting cpu information available.
+ */
+ end_cmd(cmd);
+ break;
+ }
+ break;
+ case NULL_IRQ_NONE:
end_cmd(cmd);
-#endif
break;
case NULL_IRQ_TIMER:
null_cmd_end_timer(cmd);
@@ -411,6 +386,7 @@ static struct blk_mq_ops null_mq_ops = {
.queue_rq = null_queue_rq,
.map_queue = blk_mq_map_queue,
.init_hctx = null_init_hctx,
+ .complete = null_softirq_done_fn,
};
static struct blk_mq_reg null_mq_reg = {
@@ -609,13 +585,6 @@ static int __init null_init(void)
{
unsigned int i;
-#if !defined(CONFIG_SMP)
- if (irqmode == NULL_IRQ_SOFTIRQ) {
- pr_warn("null_blk: softirq completions not available.\n");
- pr_warn("null_blk: using direct completions.\n");
- irqmode = NULL_IRQ_NONE;
- }
-#endif
if (bs > PAGE_SIZE) {
pr_warn("null_blk: invalid block size\n");
pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE);
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 1f14ac403945..51824d1f23ea 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -46,7 +46,6 @@
#define NVME_Q_DEPTH 1024
#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
-#define NVME_MINORS 64
#define ADMIN_TIMEOUT (60 * HZ)
static int nvme_major;
@@ -58,6 +57,17 @@ module_param(use_threaded_interrupts, int, 0);
static DEFINE_SPINLOCK(dev_list_lock);
static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;
+static struct workqueue_struct *nvme_workq;
+
+static void nvme_reset_failed_dev(struct work_struct *ws);
+
+struct async_cmd_info {
+ struct kthread_work work;
+ struct kthread_worker *worker;
+ u32 result;
+ int status;
+ void *ctx;
+};
/*
* An NVM Express queue. Each device has at least two (one for admin
@@ -66,6 +76,7 @@ static struct task_struct *nvme_thread;
struct nvme_queue {
struct device *q_dmadev;
struct nvme_dev *dev;
+ char irqname[24]; /* nvme4294967295-65535\0 */
spinlock_t q_lock;
struct nvme_command *sq_cmds;
volatile struct nvme_completion *cqes;
@@ -80,9 +91,11 @@ struct nvme_queue {
u16 sq_head;
u16 sq_tail;
u16 cq_head;
+ u16 qid;
u8 cq_phase;
u8 cqe_seen;
u8 q_suspended;
+ struct async_cmd_info cmdinfo;
unsigned long cmdid_data[];
};
@@ -97,6 +110,7 @@ static inline void _nvme_check_size(void)
BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
@@ -111,6 +125,7 @@ struct nvme_cmd_info {
nvme_completion_fn fn;
void *ctx;
unsigned long timeout;
+ int aborted;
};
static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
@@ -154,6 +169,7 @@ static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
info[cmdid].fn = handler;
info[cmdid].ctx = ctx;
info[cmdid].timeout = jiffies + timeout;
+ info[cmdid].aborted = 0;
return cmdid;
}
@@ -172,6 +188,7 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE)
#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE)
#define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE)
+#define CMD_CTX_ABORT (0x31C + CMD_CTX_BASE)
static void special_completion(struct nvme_dev *dev, void *ctx,
struct nvme_completion *cqe)
@@ -180,6 +197,10 @@ static void special_completion(struct nvme_dev *dev, void *ctx,
return;
if (ctx == CMD_CTX_FLUSH)
return;
+ if (ctx == CMD_CTX_ABORT) {
+ ++dev->abort_limit;
+ return;
+ }
if (ctx == CMD_CTX_COMPLETED) {
dev_warn(&dev->pci_dev->dev,
"completed id %d twice on queue %d\n",
@@ -196,6 +217,15 @@ static void special_completion(struct nvme_dev *dev, void *ctx,
dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx);
}
+static void async_completion(struct nvme_dev *dev, void *ctx,
+ struct nvme_completion *cqe)
+{
+ struct async_cmd_info *cmdinfo = ctx;
+ cmdinfo->result = le32_to_cpup(&cqe->result);
+ cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
+ queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
+}
+
/*
* Called with local interrupts disabled and the q_lock held. May not sleep.
*/
@@ -693,7 +723,7 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
return 0;
- writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride));
+ writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
nvmeq->cq_head = head;
nvmeq->cq_phase = phase;
@@ -804,12 +834,34 @@ int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
return cmdinfo.status;
}
+static int nvme_submit_async_cmd(struct nvme_queue *nvmeq,
+ struct nvme_command *cmd,
+ struct async_cmd_info *cmdinfo, unsigned timeout)
+{
+ int cmdid;
+
+ cmdid = alloc_cmdid_killable(nvmeq, cmdinfo, async_completion, timeout);
+ if (cmdid < 0)
+ return cmdid;
+ cmdinfo->status = -EINTR;
+ cmd->common.command_id = cmdid;
+ nvme_submit_cmd(nvmeq, cmd);
+ return 0;
+}
+
int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
u32 *result)
{
return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT);
}
+static int nvme_submit_admin_cmd_async(struct nvme_dev *dev,
+ struct nvme_command *cmd, struct async_cmd_info *cmdinfo)
+{
+ return nvme_submit_async_cmd(dev->queues[0], cmd, cmdinfo,
+ ADMIN_TIMEOUT);
+}
+
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
int status;
@@ -920,6 +972,56 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
}
/**
+ * nvme_abort_cmd - Attempt aborting a command
+ * @cmdid: Command id of a timed out IO
+ * @queue: The queue with timed out IO
+ *
+ * Schedule controller reset if the command was already aborted once before and
+ * still hasn't been returned to the driver, or if this is the admin queue.
+ */
+static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq)
+{
+ int a_cmdid;
+ struct nvme_command cmd;
+ struct nvme_dev *dev = nvmeq->dev;
+ struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
+
+ if (!nvmeq->qid || info[cmdid].aborted) {
+ if (work_busy(&dev->reset_work))
+ return;
+ list_del_init(&dev->node);
+ dev_warn(&dev->pci_dev->dev,
+ "I/O %d QID %d timeout, reset controller\n", cmdid,
+ nvmeq->qid);
+ PREPARE_WORK(&dev->reset_work, nvme_reset_failed_dev);
+ queue_work(nvme_workq, &dev->reset_work);
+ return;
+ }
+
+ if (!dev->abort_limit)
+ return;
+
+ a_cmdid = alloc_cmdid(dev->queues[0], CMD_CTX_ABORT, special_completion,
+ ADMIN_TIMEOUT);
+ if (a_cmdid < 0)
+ return;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.abort.opcode = nvme_admin_abort_cmd;
+ cmd.abort.cid = cmdid;
+ cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
+ cmd.abort.command_id = a_cmdid;
+
+ --dev->abort_limit;
+ info[cmdid].aborted = 1;
+ info[cmdid].timeout = jiffies + ADMIN_TIMEOUT;
+
+ dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid,
+ nvmeq->qid);
+ nvme_submit_cmd(dev->queues[0], &cmd);
+}
+
+/**
* nvme_cancel_ios - Cancel outstanding I/Os
* @queue: The queue to cancel I/Os on
* @timeout: True to only cancel I/Os which have timed out
@@ -942,7 +1044,12 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
continue;
if (info[cmdid].ctx == CMD_CTX_CANCELLED)
continue;
- dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid);
+ if (timeout && nvmeq->dev->initialized) {
+ nvme_abort_cmd(cmdid, nvmeq);
+ continue;
+ }
+ dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid,
+ nvmeq->qid);
ctx = cancel_cmdid(nvmeq, cmdid, &fn);
fn(nvmeq->dev, ctx, &cqe);
}
@@ -964,26 +1071,31 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
kfree(nvmeq);
}
-static void nvme_free_queues(struct nvme_dev *dev)
+static void nvme_free_queues(struct nvme_dev *dev, int lowest)
{
int i;
- for (i = dev->queue_count - 1; i >= 0; i--) {
+ for (i = dev->queue_count - 1; i >= lowest; i--) {
nvme_free_queue(dev->queues[i]);
dev->queue_count--;
dev->queues[i] = NULL;
}
}
-static void nvme_disable_queue(struct nvme_dev *dev, int qid)
+/**
+ * nvme_suspend_queue - put queue into suspended state
+ * @nvmeq - queue to suspend
+ *
+ * Returns 1 if already suspended, 0 otherwise.
+ */
+static int nvme_suspend_queue(struct nvme_queue *nvmeq)
{
- struct nvme_queue *nvmeq = dev->queues[qid];
- int vector = dev->entry[nvmeq->cq_vector].vector;
+ int vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;
spin_lock_irq(&nvmeq->q_lock);
if (nvmeq->q_suspended) {
spin_unlock_irq(&nvmeq->q_lock);
- return;
+ return 1;
}
nvmeq->q_suspended = 1;
spin_unlock_irq(&nvmeq->q_lock);
@@ -991,18 +1103,35 @@ static void nvme_disable_queue(struct nvme_dev *dev, int qid)
irq_set_affinity_hint(vector, NULL);
free_irq(vector, nvmeq);
- /* Don't tell the adapter to delete the admin queue */
- if (qid) {
- adapter_delete_sq(dev, qid);
- adapter_delete_cq(dev, qid);
- }
+ return 0;
+}
+static void nvme_clear_queue(struct nvme_queue *nvmeq)
+{
spin_lock_irq(&nvmeq->q_lock);
nvme_process_cq(nvmeq);
nvme_cancel_ios(nvmeq, false);
spin_unlock_irq(&nvmeq->q_lock);
}
+static void nvme_disable_queue(struct nvme_dev *dev, int qid)
+{
+ struct nvme_queue *nvmeq = dev->queues[qid];
+
+ if (!nvmeq)
+ return;
+ if (nvme_suspend_queue(nvmeq))
+ return;
+
+ /* Don't tell the adapter to delete the admin queue.
+ * Don't tell a removed adapter to delete IO queues. */
+ if (qid && readl(&dev->bar->csts) != -1) {
+ adapter_delete_sq(dev, qid);
+ adapter_delete_cq(dev, qid);
+ }
+ nvme_clear_queue(nvmeq);
+}
+
static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
int depth, int vector)
{
@@ -1025,15 +1154,18 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
nvmeq->q_dmadev = dmadev;
nvmeq->dev = dev;
+ snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
+ dev->instance, qid);
spin_lock_init(&nvmeq->q_lock);
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
init_waitqueue_head(&nvmeq->sq_full);
init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
bio_list_init(&nvmeq->sq_cong);
- nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)];
+ nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
nvmeq->q_depth = depth;
nvmeq->cq_vector = vector;
+ nvmeq->qid = qid;
nvmeq->q_suspended = 1;
dev->queue_count++;
@@ -1052,11 +1184,10 @@ static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
{
if (use_threaded_interrupts)
return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
- nvme_irq_check, nvme_irq,
- IRQF_DISABLED | IRQF_SHARED,
+ nvme_irq_check, nvme_irq, IRQF_SHARED,
name, nvmeq);
return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
- IRQF_DISABLED | IRQF_SHARED, name, nvmeq);
+ IRQF_SHARED, name, nvmeq);
}
static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
@@ -1067,7 +1198,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
nvmeq->sq_tail = 0;
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
- nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)];
+ nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
memset(nvmeq->cmdid_data, 0, extra);
memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
nvme_cancel_ios(nvmeq, false);
@@ -1087,13 +1218,13 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
if (result < 0)
goto release_cq;
- result = queue_request_irq(dev, nvmeq, "nvme");
+ result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
if (result < 0)
goto release_sq;
- spin_lock(&nvmeq->q_lock);
+ spin_lock_irq(&nvmeq->q_lock);
nvme_init_queue(nvmeq, qid);
- spin_unlock(&nvmeq->q_lock);
+ spin_unlock_irq(&nvmeq->q_lock);
return result;
@@ -1205,13 +1336,13 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
if (result)
return result;
- result = queue_request_irq(dev, nvmeq, "nvme admin");
+ result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
if (result)
return result;
- spin_lock(&nvmeq->q_lock);
+ spin_lock_irq(&nvmeq->q_lock);
nvme_init_queue(nvmeq, 0);
- spin_unlock(&nvmeq->q_lock);
+ spin_unlock_irq(&nvmeq->q_lock);
return result;
}
@@ -1487,10 +1618,47 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
}
}
+#ifdef CONFIG_COMPAT
+static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct nvme_ns *ns = bdev->bd_disk->private_data;
+
+ switch (cmd) {
+ case SG_IO:
+ return nvme_sg_io32(ns, arg);
+ }
+ return nvme_ioctl(bdev, mode, cmd, arg);
+}
+#else
+#define nvme_compat_ioctl NULL
+#endif
+
+static int nvme_open(struct block_device *bdev, fmode_t mode)
+{
+ struct nvme_ns *ns = bdev->bd_disk->private_data;
+ struct nvme_dev *dev = ns->dev;
+
+ kref_get(&dev->kref);
+ return 0;
+}
+
+static void nvme_free_dev(struct kref *kref);
+
+static void nvme_release(struct gendisk *disk, fmode_t mode)
+{
+ struct nvme_ns *ns = disk->private_data;
+ struct nvme_dev *dev = ns->dev;
+
+ kref_put(&dev->kref, nvme_free_dev);
+}
+
static const struct block_device_operations nvme_fops = {
.owner = THIS_MODULE,
.ioctl = nvme_ioctl,
- .compat_ioctl = nvme_ioctl,
+ .compat_ioctl = nvme_compat_ioctl,
+ .open = nvme_open,
+ .release = nvme_release,
};
static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
@@ -1514,13 +1682,25 @@ static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
static int nvme_kthread(void *data)
{
- struct nvme_dev *dev;
+ struct nvme_dev *dev, *next;
while (!kthread_should_stop()) {
set_current_state(TASK_INTERRUPTIBLE);
spin_lock(&dev_list_lock);
- list_for_each_entry(dev, &dev_list, node) {
+ list_for_each_entry_safe(dev, next, &dev_list, node) {
int i;
+ if (readl(&dev->bar->csts) & NVME_CSTS_CFS &&
+ dev->initialized) {
+ if (work_busy(&dev->reset_work))
+ continue;
+ list_del_init(&dev->node);
+ dev_warn(&dev->pci_dev->dev,
+ "Failed status, reset controller\n");
+ PREPARE_WORK(&dev->reset_work,
+ nvme_reset_failed_dev);
+ queue_work(nvme_workq, &dev->reset_work);
+ continue;
+ }
for (i = 0; i < dev->queue_count; i++) {
struct nvme_queue *nvmeq = dev->queues[i];
if (!nvmeq)
@@ -1541,33 +1721,6 @@ static int nvme_kthread(void *data)
return 0;
}
-static DEFINE_IDA(nvme_index_ida);
-
-static int nvme_get_ns_idx(void)
-{
- int index, error;
-
- do {
- if (!ida_pre_get(&nvme_index_ida, GFP_KERNEL))
- return -1;
-
- spin_lock(&dev_list_lock);
- error = ida_get_new(&nvme_index_ida, &index);
- spin_unlock(&dev_list_lock);
- } while (error == -EAGAIN);
-
- if (error)
- index = -1;
- return index;
-}
-
-static void nvme_put_ns_idx(int index)
-{
- spin_lock(&dev_list_lock);
- ida_remove(&nvme_index_ida, index);
- spin_unlock(&dev_list_lock);
-}
-
static void nvme_config_discard(struct nvme_ns *ns)
{
u32 logical_block_size = queue_logical_block_size(ns->queue);
@@ -1601,7 +1754,7 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
ns->dev = dev;
ns->queue->queuedata = ns;
- disk = alloc_disk(NVME_MINORS);
+ disk = alloc_disk(0);
if (!disk)
goto out_free_queue;
ns->ns_id = nsid;
@@ -1614,12 +1767,12 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
disk->major = nvme_major;
- disk->minors = NVME_MINORS;
- disk->first_minor = NVME_MINORS * nvme_get_ns_idx();
+ disk->first_minor = 0;
disk->fops = &nvme_fops;
disk->private_data = ns;
disk->queue = ns->queue;
disk->driverfs_dev = &dev->pci_dev->dev;
+ disk->flags = GENHD_FL_EXT_DEVT;
sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
@@ -1635,15 +1788,6 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
return NULL;
}
-static void nvme_ns_free(struct nvme_ns *ns)
-{
- int index = ns->disk->first_minor / NVME_MINORS;
- put_disk(ns->disk);
- nvme_put_ns_idx(index);
- blk_cleanup_queue(ns->queue);
- kfree(ns);
-}
-
static int set_queue_count(struct nvme_dev *dev, int count)
{
int status;
@@ -1659,11 +1803,12 @@ static int set_queue_count(struct nvme_dev *dev, int count)
static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
- return 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3));
+ return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
}
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
+ struct nvme_queue *adminq = dev->queues[0];
struct pci_dev *pdev = dev->pci_dev;
int result, cpu, i, vecs, nr_io_queues, size, q_depth;
@@ -1690,7 +1835,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
}
/* Deregister the admin queue's interrupt */
- free_irq(dev->entry[0].vector, dev->queues[0]);
+ free_irq(dev->entry[0].vector, adminq);
vecs = nr_io_queues;
for (i = 0; i < vecs; i++)
@@ -1728,9 +1873,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
*/
nr_io_queues = vecs;
- result = queue_request_irq(dev, dev->queues[0], "nvme admin");
+ result = queue_request_irq(dev, adminq, adminq->irqname);
if (result) {
- dev->queues[0]->q_suspended = 1;
+ adminq->q_suspended = 1;
goto free_queues;
}
@@ -1739,9 +1884,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
for (i = dev->queue_count - 1; i > nr_io_queues; i--) {
struct nvme_queue *nvmeq = dev->queues[i];
- spin_lock(&nvmeq->q_lock);
+ spin_lock_irq(&nvmeq->q_lock);
nvme_cancel_ios(nvmeq, false);
- spin_unlock(&nvmeq->q_lock);
+ spin_unlock_irq(&nvmeq->q_lock);
nvme_free_queue(nvmeq);
dev->queue_count--;
@@ -1782,7 +1927,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
return 0;
free_queues:
- nvme_free_queues(dev);
+ nvme_free_queues(dev, 1);
return result;
}
@@ -1794,6 +1939,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
*/
static int nvme_dev_add(struct nvme_dev *dev)
{
+ struct pci_dev *pdev = dev->pci_dev;
int res;
unsigned nn, i;
struct nvme_ns *ns;
@@ -1803,8 +1949,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
dma_addr_t dma_addr;
int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
- mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr,
- GFP_KERNEL);
+ mem = dma_alloc_coherent(&pdev->dev, 8192, &dma_addr, GFP_KERNEL);
if (!mem)
return -ENOMEM;
@@ -1817,13 +1962,14 @@ static int nvme_dev_add(struct nvme_dev *dev)
ctrl = mem;
nn = le32_to_cpup(&ctrl->nn);
dev->oncs = le16_to_cpup(&ctrl->oncs);
+ dev->abort_limit = ctrl->acl + 1;
memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
if (ctrl->mdts)
dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
- if ((dev->pci_dev->vendor == PCI_VENDOR_ID_INTEL) &&
- (dev->pci_dev->device == 0x0953) && ctrl->vs[3])
+ if ((pdev->vendor == PCI_VENDOR_ID_INTEL) &&
+ (pdev->device == 0x0953) && ctrl->vs[3])
dev->stripe_size = 1 << (ctrl->vs[3] + shift);
id_ns = mem;
@@ -1871,16 +2017,21 @@ static int nvme_dev_map(struct nvme_dev *dev)
dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)))
goto disable;
- pci_set_drvdata(pdev, dev);
dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
if (!dev->bar)
goto disable;
-
- dev->db_stride = NVME_CAP_STRIDE(readq(&dev->bar->cap));
+ if (readl(&dev->bar->csts) == -1) {
+ result = -ENODEV;
+ goto unmap;
+ }
+ dev->db_stride = 1 << NVME_CAP_STRIDE(readq(&dev->bar->cap));
dev->dbs = ((void __iomem *)dev->bar) + 4096;
return 0;
+ unmap:
+ iounmap(dev->bar);
+ dev->bar = NULL;
disable:
pci_release_regions(pdev);
disable_pci:
@@ -1898,37 +2049,183 @@ static void nvme_dev_unmap(struct nvme_dev *dev)
if (dev->bar) {
iounmap(dev->bar);
dev->bar = NULL;
+ pci_release_regions(dev->pci_dev);
}
- pci_release_regions(dev->pci_dev);
if (pci_is_enabled(dev->pci_dev))
pci_disable_device(dev->pci_dev);
}
+struct nvme_delq_ctx {
+ struct task_struct *waiter;
+ struct kthread_worker *worker;
+ atomic_t refcount;
+};
+
+static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev)
+{
+ dq->waiter = current;
+ mb();
+
+ for (;;) {
+ set_current_state(TASK_KILLABLE);
+ if (!atomic_read(&dq->refcount))
+ break;
+ if (!schedule_timeout(ADMIN_TIMEOUT) ||
+ fatal_signal_pending(current)) {
+ set_current_state(TASK_RUNNING);
+
+ nvme_disable_ctrl(dev, readq(&dev->bar->cap));
+ nvme_disable_queue(dev, 0);
+
+ send_sig(SIGKILL, dq->worker->task, 1);
+ flush_kthread_worker(dq->worker);
+ return;
+ }
+ }
+ set_current_state(TASK_RUNNING);
+}
+
+static void nvme_put_dq(struct nvme_delq_ctx *dq)
+{
+ atomic_dec(&dq->refcount);
+ if (dq->waiter)
+ wake_up_process(dq->waiter);
+}
+
+static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq)
+{
+ atomic_inc(&dq->refcount);
+ return dq;
+}
+
+static void nvme_del_queue_end(struct nvme_queue *nvmeq)
+{
+ struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx;
+
+ nvme_clear_queue(nvmeq);
+ nvme_put_dq(dq);
+}
+
+static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode,
+ kthread_work_func_t fn)
+{
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+ c.delete_queue.opcode = opcode;
+ c.delete_queue.qid = cpu_to_le16(nvmeq->qid);
+
+ init_kthread_work(&nvmeq->cmdinfo.work, fn);
+ return nvme_submit_admin_cmd_async(nvmeq->dev, &c, &nvmeq->cmdinfo);
+}
+
+static void nvme_del_cq_work_handler(struct kthread_work *work)
+{
+ struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
+ cmdinfo.work);
+ nvme_del_queue_end(nvmeq);
+}
+
+static int nvme_delete_cq(struct nvme_queue *nvmeq)
+{
+ return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq,
+ nvme_del_cq_work_handler);
+}
+
+static void nvme_del_sq_work_handler(struct kthread_work *work)
+{
+ struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
+ cmdinfo.work);
+ int status = nvmeq->cmdinfo.status;
+
+ if (!status)
+ status = nvme_delete_cq(nvmeq);
+ if (status)
+ nvme_del_queue_end(nvmeq);
+}
+
+static int nvme_delete_sq(struct nvme_queue *nvmeq)
+{
+ return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq,
+ nvme_del_sq_work_handler);
+}
+
+static void nvme_del_queue_start(struct kthread_work *work)
+{
+ struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
+ cmdinfo.work);
+ allow_signal(SIGKILL);
+ if (nvme_delete_sq(nvmeq))
+ nvme_del_queue_end(nvmeq);
+}
+
+static void nvme_disable_io_queues(struct nvme_dev *dev)
+{
+ int i;
+ DEFINE_KTHREAD_WORKER_ONSTACK(worker);
+ struct nvme_delq_ctx dq;
+ struct task_struct *kworker_task = kthread_run(kthread_worker_fn,
+ &worker, "nvme%d", dev->instance);
+
+ if (IS_ERR(kworker_task)) {
+ dev_err(&dev->pci_dev->dev,
+ "Failed to create queue del task\n");
+ for (i = dev->queue_count - 1; i > 0; i--)
+ nvme_disable_queue(dev, i);
+ return;
+ }
+
+ dq.waiter = NULL;
+ atomic_set(&dq.refcount, 0);
+ dq.worker = &worker;
+ for (i = dev->queue_count - 1; i > 0; i--) {
+ struct nvme_queue *nvmeq = dev->queues[i];
+
+ if (nvme_suspend_queue(nvmeq))
+ continue;
+ nvmeq->cmdinfo.ctx = nvme_get_dq(&dq);
+ nvmeq->cmdinfo.worker = dq.worker;
+ init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start);
+ queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work);
+ }
+ nvme_wait_dq(&dq, dev);
+ kthread_stop(kworker_task);
+}
+
static void nvme_dev_shutdown(struct nvme_dev *dev)
{
int i;
- for (i = dev->queue_count - 1; i >= 0; i--)
- nvme_disable_queue(dev, i);
+ dev->initialized = 0;
spin_lock(&dev_list_lock);
list_del_init(&dev->node);
spin_unlock(&dev_list_lock);
- if (dev->bar)
+ if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) {
+ for (i = dev->queue_count - 1; i >= 0; i--) {
+ struct nvme_queue *nvmeq = dev->queues[i];
+ nvme_suspend_queue(nvmeq);
+ nvme_clear_queue(nvmeq);
+ }
+ } else {
+ nvme_disable_io_queues(dev);
nvme_shutdown_ctrl(dev);
+ nvme_disable_queue(dev, 0);
+ }
nvme_dev_unmap(dev);
}
static void nvme_dev_remove(struct nvme_dev *dev)
{
- struct nvme_ns *ns, *next;
+ struct nvme_ns *ns;
- list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
- list_del(&ns->list);
- del_gendisk(ns->disk);
- nvme_ns_free(ns);
+ list_for_each_entry(ns, &dev->namespaces, list) {
+ if (ns->disk->flags & GENHD_FL_UP)
+ del_gendisk(ns->disk);
+ if (!blk_queue_dying(ns->queue))
+ blk_cleanup_queue(ns->queue);
}
}
@@ -1985,14 +2282,22 @@ static void nvme_release_instance(struct nvme_dev *dev)
spin_unlock(&dev_list_lock);
}
+static void nvme_free_namespaces(struct nvme_dev *dev)
+{
+ struct nvme_ns *ns, *next;
+
+ list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
+ list_del(&ns->list);
+ put_disk(ns->disk);
+ kfree(ns);
+ }
+}
+
static void nvme_free_dev(struct kref *kref)
{
struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
- nvme_dev_remove(dev);
- nvme_dev_shutdown(dev);
- nvme_free_queues(dev);
- nvme_release_instance(dev);
- nvme_release_prp_pools(dev);
+
+ nvme_free_namespaces(dev);
kfree(dev->queues);
kfree(dev->entry);
kfree(dev);
@@ -2056,6 +2361,7 @@ static int nvme_dev_start(struct nvme_dev *dev)
return result;
disable:
+ nvme_disable_queue(dev, 0);
spin_lock(&dev_list_lock);
list_del_init(&dev->node);
spin_unlock(&dev_list_lock);
@@ -2064,6 +2370,71 @@ static int nvme_dev_start(struct nvme_dev *dev)
return result;
}
+static int nvme_remove_dead_ctrl(void *arg)
+{
+ struct nvme_dev *dev = (struct nvme_dev *)arg;
+ struct pci_dev *pdev = dev->pci_dev;
+
+ if (pci_get_drvdata(pdev))
+ pci_stop_and_remove_bus_device(pdev);
+ kref_put(&dev->kref, nvme_free_dev);
+ return 0;
+}
+
+static void nvme_remove_disks(struct work_struct *ws)
+{
+ int i;
+ struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
+
+ nvme_dev_remove(dev);
+ spin_lock(&dev_list_lock);
+ for (i = dev->queue_count - 1; i > 0; i--) {
+ BUG_ON(!dev->queues[i] || !dev->queues[i]->q_suspended);
+ nvme_free_queue(dev->queues[i]);
+ dev->queue_count--;
+ dev->queues[i] = NULL;
+ }
+ spin_unlock(&dev_list_lock);
+}
+
+static int nvme_dev_resume(struct nvme_dev *dev)
+{
+ int ret;
+
+ ret = nvme_dev_start(dev);
+ if (ret && ret != -EBUSY)
+ return ret;
+ if (ret == -EBUSY) {
+ spin_lock(&dev_list_lock);
+ PREPARE_WORK(&dev->reset_work, nvme_remove_disks);
+ queue_work(nvme_workq, &dev->reset_work);
+ spin_unlock(&dev_list_lock);
+ }
+ dev->initialized = 1;
+ return 0;
+}
+
+static void nvme_dev_reset(struct nvme_dev *dev)
+{
+ nvme_dev_shutdown(dev);
+ if (nvme_dev_resume(dev)) {
+ dev_err(&dev->pci_dev->dev, "Device failed to resume\n");
+ kref_get(&dev->kref);
+ if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
+ dev->instance))) {
+ dev_err(&dev->pci_dev->dev,
+ "Failed to start controller remove task\n");
+ kref_put(&dev->kref, nvme_free_dev);
+ }
+ }
+}
+
+static void nvme_reset_failed_dev(struct work_struct *ws)
+{
+ struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
+ nvme_dev_reset(dev);
+}
+
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
int result = -ENOMEM;
@@ -2082,8 +2453,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
goto free;
INIT_LIST_HEAD(&dev->namespaces);
+ INIT_WORK(&dev->reset_work, nvme_reset_failed_dev);
dev->pci_dev = pdev;
-
+ pci_set_drvdata(pdev, dev);
result = nvme_set_instance(dev);
if (result)
goto free;
@@ -2099,6 +2471,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
goto release_pools;
}
+ kref_init(&dev->kref);
result = nvme_dev_add(dev);
if (result)
goto shutdown;
@@ -2113,15 +2486,16 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (result)
goto remove;
- kref_init(&dev->kref);
+ dev->initialized = 1;
return 0;
remove:
nvme_dev_remove(dev);
+ nvme_free_namespaces(dev);
shutdown:
nvme_dev_shutdown(dev);
release_pools:
- nvme_free_queues(dev);
+ nvme_free_queues(dev, 0);
nvme_release_prp_pools(dev);
release:
nvme_release_instance(dev);
@@ -2132,10 +2506,28 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
return result;
}
+static void nvme_shutdown(struct pci_dev *pdev)
+{
+ struct nvme_dev *dev = pci_get_drvdata(pdev);
+ nvme_dev_shutdown(dev);
+}
+
static void nvme_remove(struct pci_dev *pdev)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
+
+ spin_lock(&dev_list_lock);
+ list_del_init(&dev->node);
+ spin_unlock(&dev_list_lock);
+
+ pci_set_drvdata(pdev, NULL);
+ flush_work(&dev->reset_work);
misc_deregister(&dev->miscdev);
+ nvme_dev_remove(dev);
+ nvme_dev_shutdown(dev);
+ nvme_free_queues(dev, 0);
+ nvme_release_instance(dev);
+ nvme_release_prp_pools(dev);
kref_put(&dev->kref, nvme_free_dev);
}
@@ -2159,13 +2551,12 @@ static int nvme_resume(struct device *dev)
{
struct pci_dev *pdev = to_pci_dev(dev);
struct nvme_dev *ndev = pci_get_drvdata(pdev);
- int ret;
- ret = nvme_dev_start(ndev);
- /* XXX: should remove gendisks if resume fails */
- if (ret)
- nvme_free_queues(ndev);
- return ret;
+ if (nvme_dev_resume(ndev) && !work_busy(&ndev->reset_work)) {
+ PREPARE_WORK(&ndev->reset_work, nvme_reset_failed_dev);
+ queue_work(nvme_workq, &ndev->reset_work);
+ }
+ return 0;
}
static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);
@@ -2192,6 +2583,7 @@ static struct pci_driver nvme_driver = {
.id_table = nvme_id_table,
.probe = nvme_probe,
.remove = nvme_remove,
+ .shutdown = nvme_shutdown,
.driver = {
.pm = &nvme_dev_pm_ops,
},
@@ -2206,9 +2598,14 @@ static int __init nvme_init(void)
if (IS_ERR(nvme_thread))
return PTR_ERR(nvme_thread);
+ result = -ENOMEM;
+ nvme_workq = create_singlethread_workqueue("nvme");
+ if (!nvme_workq)
+ goto kill_kthread;
+
result = register_blkdev(nvme_major, "nvme");
if (result < 0)
- goto kill_kthread;
+ goto kill_workq;
else if (result > 0)
nvme_major = result;
@@ -2219,6 +2616,8 @@ static int __init nvme_init(void)
unregister_blkdev:
unregister_blkdev(nvme_major, "nvme");
+ kill_workq:
+ destroy_workqueue(nvme_workq);
kill_kthread:
kthread_stop(nvme_thread);
return result;
@@ -2228,6 +2627,7 @@ static void __exit nvme_exit(void)
{
pci_unregister_driver(&nvme_driver);
unregister_blkdev(nvme_major, "nvme");
+ destroy_workqueue(nvme_workq);
kthread_stop(nvme_thread);
}
diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c
index 4a4ff4eb8e23..4a0ceb64e269 100644
--- a/drivers/block/nvme-scsi.c
+++ b/drivers/block/nvme-scsi.c
@@ -25,6 +25,7 @@
#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
+#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/fs.h>
@@ -3038,6 +3039,152 @@ int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr)
return retcode;
}
+#ifdef CONFIG_COMPAT
+typedef struct sg_io_hdr32 {
+ compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */
+ compat_int_t dxfer_direction; /* [i] data transfer direction */
+ unsigned char cmd_len; /* [i] SCSI command length ( <= 16 bytes) */
+ unsigned char mx_sb_len; /* [i] max length to write to sbp */
+ unsigned short iovec_count; /* [i] 0 implies no scatter gather */
+ compat_uint_t dxfer_len; /* [i] byte count of data transfer */
+ compat_uint_t dxferp; /* [i], [*io] points to data transfer memory
+ or scatter gather list */
+ compat_uptr_t cmdp; /* [i], [*i] points to command to perform */
+ compat_uptr_t sbp; /* [i], [*o] points to sense_buffer memory */
+ compat_uint_t timeout; /* [i] MAX_UINT->no timeout (unit: millisec) */
+ compat_uint_t flags; /* [i] 0 -> default, see SG_FLAG... */
+ compat_int_t pack_id; /* [i->o] unused internally (normally) */
+ compat_uptr_t usr_ptr; /* [i->o] unused internally */
+ unsigned char status; /* [o] scsi status */
+ unsigned char masked_status; /* [o] shifted, masked scsi status */
+ unsigned char msg_status; /* [o] messaging level data (optional) */
+ unsigned char sb_len_wr; /* [o] byte count actually written to sbp */
+ unsigned short host_status; /* [o] errors from host adapter */
+ unsigned short driver_status; /* [o] errors from software driver */
+ compat_int_t resid; /* [o] dxfer_len - actual_transferred */
+ compat_uint_t duration; /* [o] time taken by cmd (unit: millisec) */
+ compat_uint_t info; /* [o] auxiliary information */
+} sg_io_hdr32_t; /* 64 bytes long (on sparc32) */
+
+typedef struct sg_iovec32 {
+ compat_uint_t iov_base;
+ compat_uint_t iov_len;
+} sg_iovec32_t;
+
+static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iovec_count)
+{
+ sg_iovec_t __user *iov = (sg_iovec_t __user *) (sgio + 1);
+ sg_iovec32_t __user *iov32 = dxferp;
+ int i;
+
+ for (i = 0; i < iovec_count; i++) {
+ u32 base, len;
+
+ if (get_user(base, &iov32[i].iov_base) ||
+ get_user(len, &iov32[i].iov_len) ||
+ put_user(compat_ptr(base), &iov[i].iov_base) ||
+ put_user(len, &iov[i].iov_len))
+ return -EFAULT;
+ }
+
+ if (put_user(iov, &sgio->dxferp))
+ return -EFAULT;
+ return 0;
+}
+
+int nvme_sg_io32(struct nvme_ns *ns, unsigned long arg)
+{
+ sg_io_hdr32_t __user *sgio32 = (sg_io_hdr32_t __user *)arg;
+ sg_io_hdr_t __user *sgio;
+ u16 iovec_count;
+ u32 data;
+ void __user *dxferp;
+ int err;
+ int interface_id;
+
+ if (get_user(interface_id, &sgio32->interface_id))
+ return -EFAULT;
+ if (interface_id != 'S')
+ return -EINVAL;
+
+ if (get_user(iovec_count, &sgio32->iovec_count))
+ return -EFAULT;
+
+ {
+ void __user *top = compat_alloc_user_space(0);
+ void __user *new = compat_alloc_user_space(sizeof(sg_io_hdr_t) +
+ (iovec_count * sizeof(sg_iovec_t)));
+ if (new > top)
+ return -EINVAL;
+
+ sgio = new;
+ }
+
+ /* Ok, now construct. */
+ if (copy_in_user(&sgio->interface_id, &sgio32->interface_id,
+ (2 * sizeof(int)) +
+ (2 * sizeof(unsigned char)) +
+ (1 * sizeof(unsigned short)) +
+ (1 * sizeof(unsigned int))))
+ return -EFAULT;
+
+ if (get_user(data, &sgio32->dxferp))
+ return -EFAULT;
+ dxferp = compat_ptr(data);
+ if (iovec_count) {
+ if (sg_build_iovec(sgio, dxferp, iovec_count))
+ return -EFAULT;
+ } else {
+ if (put_user(dxferp, &sgio->dxferp))
+ return -EFAULT;
+ }
+
+ {
+ unsigned char __user *cmdp;
+ unsigned char __user *sbp;
+
+ if (get_user(data, &sgio32->cmdp))
+ return -EFAULT;
+ cmdp = compat_ptr(data);
+
+ if (get_user(data, &sgio32->sbp))
+ return -EFAULT;
+ sbp = compat_ptr(data);
+
+ if (put_user(cmdp, &sgio->cmdp) ||
+ put_user(sbp, &sgio->sbp))
+ return -EFAULT;
+ }
+
+ if (copy_in_user(&sgio->timeout, &sgio32->timeout,
+ 3 * sizeof(int)))
+ return -EFAULT;
+
+ if (get_user(data, &sgio32->usr_ptr))
+ return -EFAULT;
+ if (put_user(compat_ptr(data), &sgio->usr_ptr))
+ return -EFAULT;
+
+ err = nvme_sg_io(ns, sgio);
+ if (err >= 0) {
+ void __user *datap;
+
+ if (copy_in_user(&sgio32->pack_id, &sgio->pack_id,
+ sizeof(int)) ||
+ get_user(datap, &sgio->usr_ptr) ||
+ put_user((u32)(unsigned long)datap,
+ &sgio32->usr_ptr) ||
+ copy_in_user(&sgio32->status, &sgio->status,
+ (4 * sizeof(unsigned char)) +
+ (2 * sizeof(unsigned short)) +
+ (3 * sizeof(int))))
+ err = -EFAULT;
+ }
+
+ return err;
+}
+#endif
+
int nvme_sg_get_version_num(int __user *ip)
{
return put_user(sg_version_num, ip);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 6a680d4de7f1..b1cb3f4c4db4 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -110,9 +110,9 @@ static int __virtblk_add_req(struct virtqueue *vq,
return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
}
-static inline void virtblk_request_done(struct virtblk_req *vbr)
+static inline void virtblk_request_done(struct request *req)
{
- struct request *req = vbr->req;
+ struct virtblk_req *vbr = req->special;
int error = virtblk_result(vbr);
if (req->cmd_type == REQ_TYPE_BLOCK_PC) {
@@ -138,7 +138,7 @@ static void virtblk_done(struct virtqueue *vq)
do {
virtqueue_disable_cb(vq);
while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
- virtblk_request_done(vbr);
+ blk_mq_complete_request(vbr->req);
req_done = true;
}
if (unlikely(virtqueue_is_broken(vq)))
@@ -479,6 +479,7 @@ static struct blk_mq_ops virtio_mq_ops = {
.map_queue = blk_mq_map_queue,
.alloc_hctx = blk_mq_alloc_single_hw_queue,
.free_hctx = blk_mq_free_single_hw_queue,
+ .complete = virtblk_request_done,
};
static struct blk_mq_reg virtio_mq_reg = {
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index da18046d0e07..64c60edcdfbc 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -285,7 +285,8 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
!rb_next(&persistent_gnt->node)) {
- ret = gnttab_unmap_refs(unmap, pages, segs_to_unmap);
+ ret = gnttab_unmap_refs(unmap, NULL, pages,
+ segs_to_unmap);
BUG_ON(ret);
put_free_pages(blkif, pages, segs_to_unmap);
segs_to_unmap = 0;
@@ -298,7 +299,7 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
BUG_ON(num != 0);
}
-static void unmap_purged_grants(struct work_struct *work)
+void xen_blkbk_unmap_purged_grants(struct work_struct *work)
{
struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
@@ -320,7 +321,8 @@ static void unmap_purged_grants(struct work_struct *work)
pages[segs_to_unmap] = persistent_gnt->page;
if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
- ret = gnttab_unmap_refs(unmap, pages, segs_to_unmap);
+ ret = gnttab_unmap_refs(unmap, NULL, pages,
+ segs_to_unmap);
BUG_ON(ret);
put_free_pages(blkif, pages, segs_to_unmap);
segs_to_unmap = 0;
@@ -328,7 +330,7 @@ static void unmap_purged_grants(struct work_struct *work)
kfree(persistent_gnt);
}
if (segs_to_unmap > 0) {
- ret = gnttab_unmap_refs(unmap, pages, segs_to_unmap);
+ ret = gnttab_unmap_refs(unmap, NULL, pages, segs_to_unmap);
BUG_ON(ret);
put_free_pages(blkif, pages, segs_to_unmap);
}
@@ -373,7 +375,7 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
pr_debug(DRV_PFX "Going to purge %u persistent grants\n", num_clean);
- INIT_LIST_HEAD(&blkif->persistent_purge_list);
+ BUG_ON(!list_empty(&blkif->persistent_purge_list));
root = &blkif->persistent_gnts;
purge_list:
foreach_grant_safe(persistent_gnt, n, root, node) {
@@ -418,7 +420,6 @@ finished:
blkif->vbd.overflow_max_grants = 0;
/* We can defer this work */
- INIT_WORK(&blkif->persistent_purge_work, unmap_purged_grants);
schedule_work(&blkif->persistent_purge_work);
pr_debug(DRV_PFX "Purged %u/%u\n", (total - num_clean), total);
return;
@@ -623,9 +624,23 @@ purge_gnt_list:
print_stats(blkif);
}
- /* Since we are shutting down remove all pages from the buffer */
- shrink_free_pagepool(blkif, 0 /* All */);
+ /* Drain pending purge work */
+ flush_work(&blkif->persistent_purge_work);
+ if (log_stats)
+ print_stats(blkif);
+
+ blkif->xenblkd = NULL;
+ xen_blkif_put(blkif);
+
+ return 0;
+}
+
+/*
+ * Remove persistent grants and empty the pool of free pages
+ */
+void xen_blkbk_free_caches(struct xen_blkif *blkif)
+{
/* Free all persistent grant pages */
if (!RB_EMPTY_ROOT(&blkif->persistent_gnts))
free_persistent_gnts(blkif, &blkif->persistent_gnts,
@@ -634,13 +649,8 @@ purge_gnt_list:
BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
blkif->persistent_gnt_c = 0;
- if (log_stats)
- print_stats(blkif);
-
- blkif->xenblkd = NULL;
- xen_blkif_put(blkif);
-
- return 0;
+ /* Since we are shutting down remove all pages from the buffer */
+ shrink_free_pagepool(blkif, 0 /* All */);
}
/*
@@ -668,14 +678,15 @@ static void xen_blkbk_unmap(struct xen_blkif *blkif,
GNTMAP_host_map, pages[i]->handle);
pages[i]->handle = BLKBACK_INVALID_HANDLE;
if (++invcount == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
- ret = gnttab_unmap_refs(unmap, unmap_pages, invcount);
+ ret = gnttab_unmap_refs(unmap, NULL, unmap_pages,
+ invcount);
BUG_ON(ret);
put_free_pages(blkif, unmap_pages, invcount);
invcount = 0;
}
}
if (invcount) {
- ret = gnttab_unmap_refs(unmap, unmap_pages, invcount);
+ ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
BUG_ON(ret);
put_free_pages(blkif, unmap_pages, invcount);
}
@@ -737,7 +748,7 @@ again:
}
if (segs_to_map) {
- ret = gnttab_map_refs(map, pages_to_gnt, segs_to_map);
+ ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map);
BUG_ON(ret);
}
@@ -835,7 +846,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
struct grant_page **pages = pending_req->indirect_pages;
struct xen_blkif *blkif = pending_req->blkif;
int indirect_grefs, rc, n, nseg, i;
- struct blkif_request_segment_aligned *segments = NULL;
+ struct blkif_request_segment *segments = NULL;
nseg = pending_req->nr_pages;
indirect_grefs = INDIRECT_PAGES(nseg);
@@ -931,9 +942,7 @@ static void xen_blk_drain_io(struct xen_blkif *blkif)
{
atomic_set(&blkif->drain, 1);
do {
- /* The initial value is one, and one refcnt taken at the
- * start of the xen_blkif_schedule thread. */
- if (atomic_read(&blkif->refcnt) <= 2)
+ if (atomic_read(&blkif->inflight) == 0)
break;
wait_for_completion_interruptible_timeout(
&blkif->drain_complete, HZ);
@@ -973,17 +982,30 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
* the proper response on the ring.
*/
if (atomic_dec_and_test(&pending_req->pendcnt)) {
- xen_blkbk_unmap(pending_req->blkif,
+ struct xen_blkif *blkif = pending_req->blkif;
+
+ xen_blkbk_unmap(blkif,
pending_req->segments,
pending_req->nr_pages);
- make_response(pending_req->blkif, pending_req->id,
+ make_response(blkif, pending_req->id,
pending_req->operation, pending_req->status);
- xen_blkif_put(pending_req->blkif);
- if (atomic_read(&pending_req->blkif->refcnt) <= 2) {
- if (atomic_read(&pending_req->blkif->drain))
- complete(&pending_req->blkif->drain_complete);
+ free_req(blkif, pending_req);
+ /*
+ * Make sure the request is freed before releasing blkif,
+ * or there could be a race between free_req and the
+ * cleanup done in xen_blkif_free during shutdown.
+ *
+ * NB: The fact that we might try to wake up pending_free_wq
+ * before drain_complete (in case there's a drain going on)
+ * it's not a problem with our current implementation
+ * because we can assure there's no thread waiting on
+ * pending_free_wq if there's a drain going on, but it has
+ * to be taken into account if the current model is changed.
+ */
+ if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) {
+ complete(&blkif->drain_complete);
}
- free_req(pending_req->blkif, pending_req);
+ xen_blkif_put(blkif);
}
}
@@ -1237,6 +1259,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
* below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
*/
xen_blkif_get(blkif);
+ atomic_inc(&blkif->inflight);
for (i = 0; i < nseg; i++) {
while ((bio == NULL) ||
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 8d8807563d99..be052773ad03 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -57,7 +57,7 @@
#define MAX_INDIRECT_SEGMENTS 256
#define SEGS_PER_INDIRECT_FRAME \
- (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
+ (PAGE_SIZE/sizeof(struct blkif_request_segment))
#define MAX_INDIRECT_PAGES \
((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
#define INDIRECT_PAGES(_segs) \
@@ -278,6 +278,7 @@ struct xen_blkif {
/* for barrier (drain) requests */
struct completion drain_complete;
atomic_t drain;
+ atomic_t inflight;
/* One thread per one blkif. */
struct task_struct *xenblkd;
unsigned int waiting_reqs;
@@ -376,6 +377,7 @@ int xen_blkif_xenbus_init(void);
irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
int xen_blkif_schedule(void *arg);
int xen_blkif_purge_persistent(void *arg);
+void xen_blkbk_free_caches(struct xen_blkif *blkif);
int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
struct backend_info *be, int state);
@@ -383,6 +385,7 @@ int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
int xen_blkbk_barrier(struct xenbus_transaction xbt,
struct backend_info *be, int state);
struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
+void xen_blkbk_unmap_purged_grants(struct work_struct *work);
static inline void blkif_get_x86_32_req(struct blkif_request *dst,
struct blkif_x86_32_request *src)
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index c2014a0aa206..9a547e6b6ebf 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -125,8 +125,11 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
blkif->persistent_gnts.rb_node = NULL;
spin_lock_init(&blkif->free_pages_lock);
INIT_LIST_HEAD(&blkif->free_pages);
+ INIT_LIST_HEAD(&blkif->persistent_purge_list);
blkif->free_pages_num = 0;
atomic_set(&blkif->persistent_gnt_in_use, 0);
+ atomic_set(&blkif->inflight, 0);
+ INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants);
INIT_LIST_HEAD(&blkif->pending_free);
@@ -259,6 +262,17 @@ static void xen_blkif_free(struct xen_blkif *blkif)
if (!atomic_dec_and_test(&blkif->refcnt))
BUG();
+ /* Remove all persistent grants and the cache of ballooned pages. */
+ xen_blkbk_free_caches(blkif);
+
+ /* Make sure everything is drained before shutting down */
+ BUG_ON(blkif->persistent_gnt_c != 0);
+ BUG_ON(atomic_read(&blkif->persistent_gnt_in_use) != 0);
+ BUG_ON(blkif->free_pages_num != 0);
+ BUG_ON(!list_empty(&blkif->persistent_purge_list));
+ BUG_ON(!list_empty(&blkif->free_pages));
+ BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
+
/* Check that there is no request in use */
list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
list_del(&req->free_list);
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 8dcfb54f1603..efe1b4761735 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -162,7 +162,7 @@ static DEFINE_SPINLOCK(minor_lock);
#define DEV_NAME "xvd" /* name in /dev */
#define SEGS_PER_INDIRECT_FRAME \
- (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
+ (PAGE_SIZE/sizeof(struct blkif_request_segment))
#define INDIRECT_GREFS(_segs) \
((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
@@ -393,7 +393,7 @@ static int blkif_queue_request(struct request *req)
unsigned long id;
unsigned int fsect, lsect;
int i, ref, n;
- struct blkif_request_segment_aligned *segments = NULL;
+ struct blkif_request_segment *segments = NULL;
/*
* Used to store if we are able to queue the request by just using
@@ -550,7 +550,7 @@ static int blkif_queue_request(struct request *req)
} else {
n = i % SEGS_PER_INDIRECT_FRAME;
segments[n] =
- (struct blkif_request_segment_aligned) {
+ (struct blkif_request_segment) {
.gref = ref,
.first_sect = fsect,
.last_sect = lsect };
@@ -1904,13 +1904,16 @@ static void blkback_changed(struct xenbus_device *dev,
case XenbusStateReconfiguring:
case XenbusStateReconfigured:
case XenbusStateUnknown:
- case XenbusStateClosed:
break;
case XenbusStateConnected:
blkfront_connect(info);
break;
+ case XenbusStateClosed:
+ if (dev->state == XenbusStateClosed)
+ break;
+ /* Missed the backend's Closing state -- fallthrough */
case XenbusStateClosing:
blkfront_closing(info);
break;