From b2b7e00148a203e9934bbd17aebffae3f447ade7 Mon Sep 17 00:00:00 2001
From: Matias Bjørling <m@bjorling.me>
Date: Thu, 12 Nov 2015 20:25:10 +0100
Subject: null_blk: register as a LightNVM device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for registering as a LightNVM device. This allows us to
evaluate the performance of the LightNVM subsystem.

In /drivers/Makefile, LightNVM is moved above block device drivers
to make sure that the LightNVM media managers have been initialized
before drivers under /drivers/block are initialized.

Signed-off-by: Matias Bjørling <m@bjorling.me>
Fix by Jens Axboe to remove unneeded slab cache and the following
memory leak.
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/null_blk.c | 160 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 154 insertions(+), 6 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 6255d1c4bba4..816525143a74 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -8,6 +8,7 @@
 #include <linux/slab.h>
 #include <linux/blk-mq.h>
 #include <linux/hrtimer.h>
+#include <linux/lightnvm.h>
 
 struct nullb_cmd {
 	struct list_head list;
@@ -39,6 +40,7 @@ struct nullb {
 
 	struct nullb_queue *queues;
 	unsigned int nr_queues;
+	char disk_name[DISK_NAME_LEN];
 };
 
 static LIST_HEAD(nullb_list);
@@ -119,6 +121,10 @@ static int nr_devices = 2;
 module_param(nr_devices, int, S_IRUGO);
 MODULE_PARM_DESC(nr_devices, "Number of devices to register");
 
+static bool use_lightnvm;
+module_param(use_lightnvm, bool, S_IRUGO);
+MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device");
+
 static int irqmode = NULL_IRQ_SOFTIRQ;
 
 static int null_set_irqmode(const char *str, const struct kernel_param *kp)
@@ -427,6 +433,8 @@ static void null_del_dev(struct nullb *nullb)
 {
 	list_del_init(&nullb->list);
 
+	if (use_lightnvm)
+		nvm_unregister(nullb->disk->disk_name);
 	del_gendisk(nullb->disk);
 	blk_cleanup_queue(nullb->q);
 	if (queue_mode == NULL_Q_MQ)
@@ -436,6 +444,125 @@ static void null_del_dev(struct nullb *nullb)
 	kfree(nullb);
 }
 
+#ifdef CONFIG_NVM
+
+static void null_lnvm_end_io(struct request *rq, int error)
+{
+	struct nvm_rq *rqd = rq->end_io_data;
+	struct nvm_dev *dev = rqd->dev;
+
+	dev->mt->end_io(rqd, error);
+
+	blk_put_request(rq);
+}
+
+static int null_lnvm_submit_io(struct request_queue *q, struct nvm_rq *rqd)
+{
+	struct request *rq;
+	struct bio *bio = rqd->bio;
+
+	rq = blk_mq_alloc_request(q, bio_rw(bio), GFP_KERNEL, 0);
+	if (IS_ERR(rq))
+		return -ENOMEM;
+
+	rq->cmd_type = REQ_TYPE_DRV_PRIV;
+	rq->__sector = bio->bi_iter.bi_sector;
+	rq->ioprio = bio_prio(bio);
+
+	if (bio_has_data(bio))
+		rq->nr_phys_segments = bio_phys_segments(q, bio);
+
+	rq->__data_len = bio->bi_iter.bi_size;
+	rq->bio = rq->biotail = bio;
+
+	rq->end_io_data = rqd;
+
+	blk_execute_rq_nowait(q, NULL, rq, 0, null_lnvm_end_io);
+
+	return 0;
+}
+
+static int null_lnvm_id(struct request_queue *q, struct nvm_id *id)
+{
+	sector_t size = gb * 1024 * 1024 * 1024ULL;
+	struct nvm_id_group *grp;
+
+	id->ver_id = 0x1;
+	id->vmnt = 0;
+	id->cgrps = 1;
+	id->cap = 0x3;
+	id->dom = 0x1;
+	id->ppat = NVM_ADDRMODE_LINEAR;
+
+	do_div(size, bs); /* convert size to pages */
+	grp = &id->groups[0];
+	grp->mtype = 0;
+	grp->fmtype = 1;
+	grp->num_ch = 1;
+	grp->num_lun = 1;
+	grp->num_pln = 1;
+	grp->num_blk = size / 256;
+	grp->num_pg = 256;
+	grp->fpg_sz = bs;
+	grp->csecs = bs;
+	grp->trdt = 25000;
+	grp->trdm = 25000;
+	grp->tprt = 500000;
+	grp->tprm = 500000;
+	grp->tbet = 1500000;
+	grp->tbem = 1500000;
+	grp->mpos = 0x010101; /* single plane rwe */
+	grp->cpar = hw_queue_depth;
+
+	return 0;
+}
+
+static void *null_lnvm_create_dma_pool(struct request_queue *q, char *name)
+{
+	mempool_t *virtmem_pool;
+
+	virtmem_pool = mempool_create_page_pool(64, 0);
+	if (!virtmem_pool) {
+		pr_err("null_blk: Unable to create virtual memory pool\n");
+		return NULL;
+	}
+
+	return virtmem_pool;
+}
+
+static void null_lnvm_destroy_dma_pool(void *pool)
+{
+	mempool_destroy(pool);
+}
+
+static void *null_lnvm_dev_dma_alloc(struct request_queue *q, void *pool,
+				gfp_t mem_flags, dma_addr_t *dma_handler)
+{
+	return mempool_alloc(pool, mem_flags);
+}
+
+static void null_lnvm_dev_dma_free(void *pool, void *entry,
+							dma_addr_t dma_handler)
+{
+	mempool_free(entry, pool);
+}
+
+static struct nvm_dev_ops null_lnvm_dev_ops = {
+	.identity		= null_lnvm_id,
+	.submit_io		= null_lnvm_submit_io,
+
+	.create_dma_pool	= null_lnvm_create_dma_pool,
+	.destroy_dma_pool	= null_lnvm_destroy_dma_pool,
+	.dev_dma_alloc		= null_lnvm_dev_dma_alloc,
+	.dev_dma_free		= null_lnvm_dev_dma_free,
+
+	/* Simulate nvme protocol restriction */
+	.max_phys_sect		= 64,
+};
+#else
+static struct nvm_dev_ops null_lnvm_dev_ops;
+#endif /* CONFIG_NVM */
+
 static int null_open(struct block_device *bdev, fmode_t mode)
 {
 	return 0;
@@ -575,11 +702,6 @@ static int null_add_dev(void)
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q);
 	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, nullb->q);
 
-	disk = nullb->disk = alloc_disk_node(1, home_node);
-	if (!disk) {
-		rv = -ENOMEM;
-		goto out_cleanup_blk_queue;
-	}
 
 	mutex_lock(&lock);
 	list_add_tail(&nullb->list, &nullb_list);
@@ -589,6 +711,21 @@ static int null_add_dev(void)
 	blk_queue_logical_block_size(nullb->q, bs);
 	blk_queue_physical_block_size(nullb->q, bs);
 
+	sprintf(nullb->disk_name, "nullb%d", nullb->index);
+
+	if (use_lightnvm) {
+		rv = nvm_register(nullb->q, nullb->disk_name,
+							&null_lnvm_dev_ops);
+		if (rv)
+			goto out_cleanup_blk_queue;
+		goto done;
+	}
+
+	disk = nullb->disk = alloc_disk_node(1, home_node);
+	if (!disk) {
+		rv = -ENOMEM;
+		goto out_cleanup_lightnvm;
+	}
 	size = gb * 1024 * 1024 * 1024ULL;
 	set_capacity(disk, size >> 9);
 
@@ -598,10 +735,15 @@ static int null_add_dev(void)
 	disk->fops		= &null_fops;
 	disk->private_data	= nullb;
 	disk->queue		= nullb->q;
-	sprintf(disk->disk_name, "nullb%d", nullb->index);
+	strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
+
 	add_disk(disk);
+done:
 	return 0;
 
+out_cleanup_lightnvm:
+	if (use_lightnvm)
+		nvm_unregister(nullb->disk_name);
 out_cleanup_blk_queue:
 	blk_cleanup_queue(nullb->q);
 out_cleanup_tags:
@@ -625,6 +767,12 @@ static int __init null_init(void)
 		bs = PAGE_SIZE;
 	}
 
+	if (use_lightnvm && queue_mode != NULL_Q_MQ) {
+		pr_warn("null_blk: LightNVM only supported for blk-mq\n");
+		pr_warn("null_blk: defaults queue mode to blk-mq\n");
+		queue_mode = NULL_Q_MQ;
+	}
+
 	if (queue_mode == NULL_Q_MQ && use_per_node_hctx) {
 		if (submit_queues < nr_online_nodes) {
 			pr_warn("null_blk: submit_queues param is set to %u.",
-- 
cgit v1.2.3


From 6bb9535bc3f59194a0ae17b17ca71aecd0f7e3a2 Mon Sep 17 00:00:00 2001
From: Matias Bjørling <m@bjorling.me>
Date: Thu, 19 Nov 2015 12:50:08 +0100
Subject: null_blk: use ppa_cache pool
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of using a page pool, we can save memory by only allocating room
for 64 entries for the ppa command. Introduce a ppa_cache to allocate only
the required memory for the ppa list.

Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/null_blk.c | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 816525143a74..3e9c4f94b5be 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -47,6 +47,7 @@ static LIST_HEAD(nullb_list);
 static struct mutex lock;
 static int null_major;
 static int nullb_indexes;
+static struct kmem_cache *ppa_cache;
 
 struct completion_queue {
 	struct llist_head list;
@@ -521,7 +522,7 @@ static void *null_lnvm_create_dma_pool(struct request_queue *q, char *name)
 {
 	mempool_t *virtmem_pool;
 
-	virtmem_pool = mempool_create_page_pool(64, 0);
+	virtmem_pool = mempool_create_slab_pool(64, ppa_cache);
 	if (!virtmem_pool) {
 		pr_err("null_blk: Unable to create virtual memory pool\n");
 		return NULL;
@@ -767,6 +768,12 @@ static int __init null_init(void)
 		bs = PAGE_SIZE;
 	}
 
+	if (use_lightnvm && bs != 4096) {
+		pr_warn("null_blk: LightNVM only supports 4k block size\n");
+		pr_warn("null_blk: defaults block size to 4k\n");
+		bs = 4096;
+	}
+
 	if (use_lightnvm && queue_mode != NULL_Q_MQ) {
 		pr_warn("null_blk: LightNVM only supported for blk-mq\n");
 		pr_warn("null_blk: defaults queue mode to blk-mq\n");
@@ -803,15 +810,27 @@ static int __init null_init(void)
 	if (null_major < 0)
 		return null_major;
 
+	if (use_lightnvm) {
+		ppa_cache = kmem_cache_create("ppa_cache", 64 * sizeof(u64),
+								0, 0, NULL);
+		if (!ppa_cache) {
+			pr_err("null_blk: unable to create ppa cache\n");
+			return -ENOMEM;
+		}
+	}
+
 	for (i = 0; i < nr_devices; i++) {
 		if (null_add_dev()) {
 			unregister_blkdev(null_major, "nullb");
-			return -EINVAL;
+			goto err_ppa;
 		}
 	}
 
 	pr_info("null: module loaded\n");
 	return 0;
+err_ppa:
+	kmem_cache_destroy(ppa_cache);
+	return -EINVAL;
 }
 
 static void __exit null_exit(void)
@@ -826,6 +845,8 @@ static void __exit null_exit(void)
 		null_del_dev(nullb);
 	}
 	mutex_unlock(&lock);
+
+	kmem_cache_destroy(ppa_cache);
 }
 
 module_init(null_init);
-- 
cgit v1.2.3


From 5b40db99099ddebe31e9b1b759894cf09c0c6679 Mon Sep 17 00:00:00 2001
From: Matias Bjørling <m@bjorling.me>
Date: Thu, 19 Nov 2015 12:50:09 +0100
Subject: null_blk: use device addressing mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The linear addressing mode was removed in 7386af2. Make null_blk instead
expose the ppa format geometry and support the generic addressing mode.

Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/null_blk.c | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 3e9c4f94b5be..d51c24ac529f 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -486,6 +486,7 @@ static int null_lnvm_submit_io(struct request_queue *q, struct nvm_rq *rqd)
 static int null_lnvm_id(struct request_queue *q, struct nvm_id *id)
 {
 	sector_t size = gb * 1024 * 1024 * 1024ULL;
+	sector_t blksize;
 	struct nvm_id_group *grp;
 
 	id->ver_id = 0x1;
@@ -493,17 +494,34 @@ static int null_lnvm_id(struct request_queue *q, struct nvm_id *id)
 	id->cgrps = 1;
 	id->cap = 0x3;
 	id->dom = 0x1;
-	id->ppat = NVM_ADDRMODE_LINEAR;
+
+	id->ppaf.blk_offset = 0;
+	id->ppaf.blk_len = 16;
+	id->ppaf.pg_offset = 16;
+	id->ppaf.pg_len = 16;
+	id->ppaf.sect_offset = 32;
+	id->ppaf.sect_len = 8;
+	id->ppaf.pln_offset = 40;
+	id->ppaf.pln_len = 8;
+	id->ppaf.lun_offset = 48;
+	id->ppaf.lun_len = 8;
+	id->ppaf.ch_offset = 56;
+	id->ppaf.ch_len = 8;
 
 	do_div(size, bs); /* convert size to pages */
+	do_div(size, 256); /* concert size to pgs pr blk */
 	grp = &id->groups[0];
 	grp->mtype = 0;
-	grp->fmtype = 1;
+	grp->fmtype = 0;
 	grp->num_ch = 1;
-	grp->num_lun = 1;
-	grp->num_pln = 1;
-	grp->num_blk = size / 256;
 	grp->num_pg = 256;
+	blksize = size;
+	do_div(size, (1 << 16));
+	grp->num_lun = size + 1;
+	do_div(blksize, grp->num_lun);
+	grp->num_blk = blksize;
+	grp->num_pln = 1;
+
 	grp->fpg_sz = bs;
 	grp->csecs = bs;
 	grp->trdt = 25000;
-- 
cgit v1.2.3


From 54514aa465e94316a4bf1c5dfe970536bec3e76f Mon Sep 17 00:00:00 2001
From: Matias Bjørling <m@bjorling.me>
Date: Thu, 19 Nov 2015 12:50:10 +0100
Subject: null_blk: do not del gendisk with lightnvm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The gendisk structure has not been initialized when using lightnvm.
Make sure to not delete it upon exit. Also make sure that we use the
appropriate disk_name at unregistration.

Signed-off-by: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/null_blk.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index d51c24ac529f..5c8ba5484d86 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -435,12 +435,14 @@ static void null_del_dev(struct nullb *nullb)
 	list_del_init(&nullb->list);
 
 	if (use_lightnvm)
-		nvm_unregister(nullb->disk->disk_name);
-	del_gendisk(nullb->disk);
+		nvm_unregister(nullb->disk_name);
+	else
+		del_gendisk(nullb->disk);
 	blk_cleanup_queue(nullb->q);
 	if (queue_mode == NULL_Q_MQ)
 		blk_mq_free_tag_set(&nullb->tag_set);
-	put_disk(nullb->disk);
+	if (!use_lightnvm)
+		put_disk(nullb->disk);
 	cleanup_queues(nullb);
 	kfree(nullb);
 }
-- 
cgit v1.2.3


From 8aeea03195ee6e33fcb00039c414eabfc37a4eb8 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Fri, 20 Nov 2015 10:46:49 +0100
Subject: mtip32xx: use formatting capability of kthread_create_on_node

kthread_create_on_node takes format+args, so there's no need to do the
pretty-printing in advance. Moreover, "mtip_svc_thd_99" (including its
'\0') only just fits in 16 bytes, so if index could ever go above 99
we'd have a stack buffer overflow.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/mtip32xx/mtip32xx.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index a28a562f7b7f..3457ac8c03e2 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3810,7 +3810,6 @@ static int mtip_block_initialize(struct driver_data *dd)
 	sector_t capacity;
 	unsigned int index = 0;
 	struct kobject *kobj;
-	unsigned char thd_name[16];
 
 	if (dd->disk)
 		goto skip_create_disk; /* hw init done, before rebuild */
@@ -3958,10 +3957,9 @@ skip_create_disk:
 	}
 
 start_service_thread:
-	sprintf(thd_name, "mtip_svc_thd_%02d", index);
 	dd->mtip_svc_handler = kthread_create_on_node(mtip_service_thread,
-						dd, dd->numa_node, "%s",
-						thd_name);
+						dd, dd->numa_node,
+						"mtip_svc_thd_%02d", index);
 
 	if (IS_ERR(dd->mtip_svc_handler)) {
 		dev_err(&dd->pdev->dev, "service thread failed to start\n");
-- 
cgit v1.2.3


From 3c395a969acc690f331d5fb91ec99ea8eb5155dd Mon Sep 17 00:00:00 2001
From: Paolo Valente <paolo.valente@unimore.it>
Date: Tue, 1 Dec 2015 11:48:17 +0100
Subject: null_blk: set a separate timer for each command

For the Timer IRQ mode (i.e., when command completions are delayed),
there is one timer for each CPU. Each of these timers
. has a completion queue associated with it, containing all the
  command completions to be executed when the timer fires;
. is set, and a new completion-to-execute is inserted into its
  completion queue, every time the dispatch code for a new command
  happens to be executed on the CPU related to the timer.

This implies that, if the dispatch of a new command happens to be
executed on a CPU whose timer has already been set, but has not yet
fired, then the timer is set again, to the completion time of the
newly arrived command. When the timer eventually fires, all its queued
completions are executed.

This way of handling delayed command completions entails the following
problem: if more than one command completion is inserted into the
queue of a timer before the timer fires, then the expiration time for
the timer is moved forward every time each of these completions is
enqueued. As a consequence, only the last completion enqueued enjoys a
correct execution time, while all previous completions are unjustly
delayed until the last completion is executed (and at that time they
are executed all together).

Specifically, if all the above completions are enqueued almost at the
same time, then the problem is negligible. On the opposite end, if
every completion is enqueued a while after the previous completion was
enqueued (in the extreme case, it is enqueued only right before the
timer would have expired), then every enqueued completion, except for
the last one, experiences an inflated delay, proportional to the number
of completions enqueued after it. In the end, commands, and thus I/O
requests, may be completed at an arbitrarily lower rate than the
desired one.

This commit addresses this issue by replacing per-CPU timers with
per-command timers, i.e., by associating an individual timer with each
command.

Signed-off-by: Paolo Valente <paolo.valente@unimore.it>
Signed-off-by: Arianna Avanzini <avanzini@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/null_blk.c | 79 +++++++++++++++---------------------------------
 1 file changed, 24 insertions(+), 55 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 5c8ba5484d86..08932f5ea9f3 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -18,6 +18,7 @@ struct nullb_cmd {
 	struct bio *bio;
 	unsigned int tag;
 	struct nullb_queue *nq;
+	struct hrtimer timer;
 };
 
 struct nullb_queue {
@@ -49,17 +50,6 @@ static int null_major;
 static int nullb_indexes;
 static struct kmem_cache *ppa_cache;
 
-struct completion_queue {
-	struct llist_head list;
-	struct hrtimer timer;
-};
-
-/*
- * These are per-cpu for now, they will need to be configured by the
- * complete_queues parameter and appropriately mapped.
- */
-static DEFINE_PER_CPU(struct completion_queue, completion_queues);
-
 enum {
 	NULL_IRQ_NONE		= 0,
 	NULL_IRQ_SOFTIRQ	= 1,
@@ -180,6 +170,8 @@ static void free_cmd(struct nullb_cmd *cmd)
 	put_tag(cmd->nq, cmd->tag);
 }
 
+static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer);
+
 static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq)
 {
 	struct nullb_cmd *cmd;
@@ -190,6 +182,11 @@ static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq)
 		cmd = &nq->cmds[tag];
 		cmd->tag = tag;
 		cmd->nq = nq;
+		if (irqmode == NULL_IRQ_TIMER) {
+			hrtimer_init(&cmd->timer, CLOCK_MONOTONIC,
+				     HRTIMER_MODE_REL);
+			cmd->timer.function = null_cmd_timer_expired;
+		}
 		return cmd;
 	}
 
@@ -238,47 +235,28 @@ static void end_cmd(struct nullb_cmd *cmd)
 
 static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
 {
-	struct completion_queue *cq;
-	struct llist_node *entry;
-	struct nullb_cmd *cmd;
-
-	cq = &per_cpu(completion_queues, smp_processor_id());
-
-	while ((entry = llist_del_all(&cq->list)) != NULL) {
-		entry = llist_reverse_order(entry);
-		do {
-			struct request_queue *q = NULL;
+	struct nullb_cmd *cmd = container_of(timer, struct nullb_cmd, timer);
+	struct request_queue *q = NULL;
 
-			cmd = container_of(entry, struct nullb_cmd, ll_list);
-			entry = entry->next;
-			if (cmd->rq)
-				q = cmd->rq->q;
-			end_cmd(cmd);
+	if (cmd->rq)
+		q = cmd->rq->q;
 
-			if (q && !q->mq_ops && blk_queue_stopped(q)) {
-				spin_lock(q->queue_lock);
-				if (blk_queue_stopped(q))
-					blk_start_queue(q);
-				spin_unlock(q->queue_lock);
-			}
-		} while (entry);
+	if (q && !q->mq_ops && blk_queue_stopped(q)) {
+		spin_lock(q->queue_lock);
+		if (blk_queue_stopped(q))
+			blk_start_queue(q);
+		spin_unlock(q->queue_lock);
 	}
+	end_cmd(cmd);
 
 	return HRTIMER_NORESTART;
 }
 
 static void null_cmd_end_timer(struct nullb_cmd *cmd)
 {
-	struct completion_queue *cq = &per_cpu(completion_queues, get_cpu());
-
-	cmd->ll_list.next = NULL;
-	if (llist_add(&cmd->ll_list, &cq->list)) {
-		ktime_t kt = ktime_set(0, completion_nsec);
+	ktime_t kt = ktime_set(0, completion_nsec);
 
-		hrtimer_start(&cq->timer, kt, HRTIMER_MODE_REL_PINNED);
-	}
-
-	put_cpu();
+	hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL);
 }
 
 static void null_softirq_done_fn(struct request *rq)
@@ -376,6 +354,10 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx,
 {
 	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
 
+	if (irqmode == NULL_IRQ_TIMER) {
+		hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+		cmd->timer.function = null_cmd_timer_expired;
+	}
 	cmd->rq = bd->rq;
 	cmd->nq = hctx->driver_data;
 
@@ -813,19 +795,6 @@ static int __init null_init(void)
 
 	mutex_init(&lock);
 
-	/* Initialize a separate list for each CPU for issuing softirqs */
-	for_each_possible_cpu(i) {
-		struct completion_queue *cq = &per_cpu(completion_queues, i);
-
-		init_llist_head(&cq->list);
-
-		if (irqmode != NULL_IRQ_TIMER)
-			continue;
-
-		hrtimer_init(&cq->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-		cq->timer.function = null_cmd_timer_expired;
-	}
-
 	null_major = register_blkdev(0, "nullb");
 	if (null_major < 0)
 		return null_major;
-- 
cgit v1.2.3


From cf8ecc5a8455266f8d516426b2acd36f9bdfa061 Mon Sep 17 00:00:00 2001
From: Arianna Avanzini <avanzini@google.com>
Date: Tue, 1 Dec 2015 11:48:18 +0100
Subject: null_blk: guarantee device restart in all irq modes

In single-queue (block layer) mode,the function null_rq_prep_fn stops
the device if alloc_cmd fails. Then, once stopped, the device must be
restarted on the next command completion, so that the request(s) for
which alloc_cmd failed can be requeued. Otherwise the device hangs.

Unfortunately, device restart is currently performed only for delayed
completions, i.e., in irqmode==2. This fact causes hangs, for the
above reasons, with the other irqmodes in combination with single-queue
block layer.

This commits addresses this issue by making sure that, if stopped, the
device is properly restarted for all irqmodes on completions.

Signed-off-by: Paolo Valente <paolo.valente@unimore.it>
Signed-off-by: Arianna AVanzini <avanzini@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/null_blk.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 08932f5ea9f3..cf656198836c 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -217,6 +217,8 @@ static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait)
 
 static void end_cmd(struct nullb_cmd *cmd)
 {
+	struct request_queue *q = NULL;
+
 	switch (queue_mode)  {
 	case NULL_Q_MQ:
 		blk_mq_end_request(cmd->rq, 0);
@@ -227,27 +229,28 @@ static void end_cmd(struct nullb_cmd *cmd)
 		break;
 	case NULL_Q_BIO:
 		bio_endio(cmd->bio);
-		break;
+		goto free_cmd;
 	}
 
-	free_cmd(cmd);
-}
-
-static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
-{
-	struct nullb_cmd *cmd = container_of(timer, struct nullb_cmd, timer);
-	struct request_queue *q = NULL;
-
 	if (cmd->rq)
 		q = cmd->rq->q;
 
+	/* Restart queue if needed, as we are freeing a tag */
 	if (q && !q->mq_ops && blk_queue_stopped(q)) {
-		spin_lock(q->queue_lock);
+		unsigned long flags;
+
+		spin_lock_irqsave(q->queue_lock, flags);
 		if (blk_queue_stopped(q))
 			blk_start_queue(q);
-		spin_unlock(q->queue_lock);
+		spin_unlock_irqrestore(q->queue_lock, flags);
 	}
-	end_cmd(cmd);
+free_cmd:
+	free_cmd(cmd);
+}
+
+static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
+{
+	end_cmd(container_of(timer, struct nullb_cmd, timer));
 
 	return HRTIMER_NORESTART;
 }
-- 
cgit v1.2.3


From dbac117542b7e814245c43daa638a3626230cb2a Mon Sep 17 00:00:00 2001
From: Arianna Avanzini <avanzini@google.com>
Date: Tue, 1 Dec 2015 11:48:19 +0100
Subject: null_blk: change type of completion_nsec to unsigned long

This commit at least doubles the maximum value for
completion_nsec. This helps in special cases where one wants/needs to
emulate an extremely slow I/O (for example to spot bugs).

Signed-off-by: Paolo Valente <paolo.valente@unimore.it>
Signed-off-by: Arianna Avanzini <avanzini@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/null_blk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/block')

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index cf656198836c..0c3940ec5e62 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -132,8 +132,8 @@ static const struct kernel_param_ops null_irqmode_param_ops = {
 device_param_cb(irqmode, &null_irqmode_param_ops, &irqmode, S_IRUGO);
 MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer");
 
-static int completion_nsec = 10000;
-module_param(completion_nsec, int, S_IRUGO);
+static unsigned long completion_nsec = 10000;
+module_param(completion_nsec, ulong, S_IRUGO);
 MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns");
 
 static int hw_queue_depth = 64;
-- 
cgit v1.2.3


From 70b16db86f564977df074072143284aec2cb1162 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 27 Nov 2015 19:23:24 +0100
Subject: rbd: don't put snap_context twice in rbd_queue_workfn()

Commit 4e752f0ab0e8 ("rbd: access snapshot context and mapping size
safely") moved ceph_get_snap_context() out of rbd_img_request_create()
and into rbd_queue_workfn(), adding a ceph_put_snap_context() to the
error path in rbd_queue_workfn().  However, rbd_img_request_create()
consumes a ref on snapc, so calling ceph_put_snap_context() after
a successful rbd_img_request_create() leads to an extra put.  Fix it.

Cc: stable@vger.kernel.org # 3.18+
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Josh Durgin <jdurgin@redhat.com>
---
 drivers/block/rbd.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers/block')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 235708c7c46e..81ea69fee7ca 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3442,6 +3442,7 @@ static void rbd_queue_workfn(struct work_struct *work)
 		goto err_rq;
 	}
 	img_request->rq = rq;
+	snapc = NULL; /* img_request consumes a ref */
 
 	if (op_type == OBJ_OP_DISCARD)
 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
-- 
cgit v1.2.3