From 5f3ea37c7716db4e894a480e0c18b24399595b6b Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 30 Oct 2008 08:34:33 +0100
Subject: blktrace: port to tracepoints

This was a forward port of work done by Mathieu Desnoyers. I changed it to
encode the 'what' parameter in the tracepoint name, so that one can register
interest in specific events instead of registering for classes of events and
then checking the 'what' parameter.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 block/Kconfig    |   1 +
 block/blk-core.c |  33 +++---
 block/blktrace.c | 332 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 block/elevator.c |   7 +-
 4 files changed, 348 insertions(+), 25 deletions(-)

(limited to 'block')

diff --git a/block/Kconfig b/block/Kconfig
index 1ab7c15c8d7a..290b219fad9c 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -47,6 +47,7 @@ config BLK_DEV_IO_TRACE
 	depends on SYSFS
 	select RELAY
 	select DEBUG_FS
+	select TRACEPOINTS
 	help
 	  Say Y here if you want to be able to trace the block layer actions
 	  on a given queue. Tracing allows you to see any traffic happening
diff --git a/block/blk-core.c b/block/blk-core.c
index 10e8a64a5a5b..04267d66a2b9 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -28,6 +28,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
+#include <trace/block.h>
 
 #include "blk.h"
 
@@ -205,7 +206,7 @@ void blk_plug_device(struct request_queue *q)
 
 	if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) {
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
-		blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+		trace_block_plug(q);
 	}
 }
 EXPORT_SYMBOL(blk_plug_device);
@@ -292,9 +293,7 @@ void blk_unplug_work(struct work_struct *work)
 	struct request_queue *q =
 		container_of(work, struct request_queue, unplug_work);
 
-	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
-				q->rq.count[READ] + q->rq.count[WRITE]);
-
+	trace_block_unplug_io(q);
 	q->unplug_fn(q);
 }
 
@@ -302,9 +301,7 @@ void blk_unplug_timeout(unsigned long data)
 {
 	struct request_queue *q = (struct request_queue *)data;
 
-	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
-				q->rq.count[READ] + q->rq.count[WRITE]);
-
+	trace_block_unplug_timer(q);
 	kblockd_schedule_work(q, &q->unplug_work);
 }
 
@@ -314,9 +311,7 @@ void blk_unplug(struct request_queue *q)
 	 * devices don't necessarily have an ->unplug_fn defined
 	 */
 	if (q->unplug_fn) {
-		blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
-					q->rq.count[READ] + q->rq.count[WRITE]);
-
+		trace_block_unplug_io(q);
 		q->unplug_fn(q);
 	}
 }
@@ -822,7 +817,7 @@ rq_starved:
 	if (ioc_batching(q, ioc))
 		ioc->nr_batch_requests--;
 
-	blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
+	trace_block_getrq(q, bio, rw);
 out:
 	return rq;
 }
@@ -848,7 +843,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
 		prepare_to_wait_exclusive(&rl->wait[rw], &wait,
 				TASK_UNINTERRUPTIBLE);
 
-		blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+		trace_block_sleeprq(q, bio, rw);
 
 		__generic_unplug_device(q);
 		spin_unlock_irq(q->queue_lock);
@@ -928,7 +923,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
 {
 	blk_delete_timer(rq);
 	blk_clear_rq_complete(rq);
-	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+	trace_block_rq_requeue(q, rq);
 
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
@@ -1167,7 +1162,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		if (!ll_back_merge_fn(q, req, bio))
 			break;
 
-		blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+		trace_block_bio_backmerge(q, bio);
 
 		req->biotail->bi_next = bio;
 		req->biotail = bio;
@@ -1186,7 +1181,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		if (!ll_front_merge_fn(q, req, bio))
 			break;
 
-		blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+		trace_block_bio_frontmerge(q, bio);
 
 		bio->bi_next = req->bio;
 		req->bio = bio;
@@ -1269,7 +1264,7 @@ static inline void blk_partition_remap(struct bio *bio)
 		bio->bi_sector += p->start_sect;
 		bio->bi_bdev = bdev->bd_contains;
 
-		blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio,
+		trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
 				    bdev->bd_dev, bio->bi_sector,
 				    bio->bi_sector - p->start_sect);
 	}
@@ -1441,10 +1436,10 @@ end_io:
 			goto end_io;
 
 		if (old_sector != -1)
-			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
+			trace_block_remap(q, bio, old_dev, bio->bi_sector,
 					    old_sector);
 
-		blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+		trace_block_bio_queue(q, bio);
 
 		old_sector = bio->bi_sector;
 		old_dev = bio->bi_bdev->bd_dev;
@@ -1656,7 +1651,7 @@ static int __end_that_request_first(struct request *req, int error,
 	int total_bytes, bio_nbytes, next_idx = 0;
 	struct bio *bio;
 
-	blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+	trace_block_rq_complete(req->q, req);
 
 	/*
 	 * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual
diff --git a/block/blktrace.c b/block/blktrace.c
index 85049a7e7a17..b0a2cae886db 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -23,10 +23,18 @@
 #include <linux/mutex.h>
 #include <linux/debugfs.h>
 #include <linux/time.h>
+#include <trace/block.h>
 #include <asm/uaccess.h>
 
 static unsigned int blktrace_seq __read_mostly = 1;
 
+/* Global reference count of probes */
+static DEFINE_MUTEX(blk_probe_mutex);
+static atomic_t blk_probes_ref = ATOMIC_INIT(0);
+
+static int blk_register_tracepoints(void);
+static void blk_unregister_tracepoints(void);
+
 /*
  * Send out a notify message.
  */
@@ -119,7 +127,7 @@ static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK
  * The worker for the various blk_add_trace*() types. Fills out a
  * blk_io_trace structure and places it in a per-cpu subbuffer.
  */
-void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
+static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
 {
 	struct task_struct *tsk = current;
@@ -177,8 +185,6 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	local_irq_restore(flags);
 }
 
-EXPORT_SYMBOL_GPL(__blk_add_trace);
-
 static struct dentry *blk_tree_root;
 static DEFINE_MUTEX(blk_tree_mutex);
 static unsigned int root_users;
@@ -237,6 +243,10 @@ static void blk_trace_cleanup(struct blk_trace *bt)
 	free_percpu(bt->sequence);
 	free_percpu(bt->msg_data);
 	kfree(bt);
+	mutex_lock(&blk_probe_mutex);
+	if (atomic_dec_and_test(&blk_probes_ref))
+		blk_unregister_tracepoints();
+	mutex_unlock(&blk_probe_mutex);
 }
 
 int blk_trace_remove(struct request_queue *q)
@@ -428,6 +438,14 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	bt->pid = buts->pid;
 	bt->trace_state = Blktrace_setup;
 
+	mutex_lock(&blk_probe_mutex);
+	if (atomic_add_return(1, &blk_probes_ref) == 1) {
+		ret = blk_register_tracepoints();
+		if (ret)
+			goto probe_err;
+	}
+	mutex_unlock(&blk_probe_mutex);
+
 	ret = -EBUSY;
 	old_bt = xchg(&q->blk_trace, bt);
 	if (old_bt) {
@@ -436,6 +454,9 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	}
 
 	return 0;
+probe_err:
+	atomic_dec(&blk_probes_ref);
+	mutex_unlock(&blk_probe_mutex);
 err:
 	if (dir)
 		blk_remove_tree(dir);
@@ -562,3 +583,308 @@ void blk_trace_shutdown(struct request_queue *q)
 		blk_trace_remove(q);
 	}
 }
+
+/*
+ * blktrace probes
+ */
+
+/**
+ * blk_add_trace_rq - Add a trace for a request oriented action
+ * @q:		queue the io is for
+ * @rq:		the source request
+ * @what:	the action
+ *
+ * Description:
+ *     Records an action against a request. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
+				    u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+	int rw = rq->cmd_flags & 0x03;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_discard_rq(rq))
+		rw |= (1 << BIO_RW_DISCARD);
+
+	if (blk_pc_request(rq)) {
+		what |= BLK_TC_ACT(BLK_TC_PC);
+		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
+				sizeof(rq->cmd), rq->cmd);
+	} else  {
+		what |= BLK_TC_ACT(BLK_TC_FS);
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
+				rw, what, rq->errors, 0, NULL);
+	}
+}
+
+static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_ABORT);
+}
+
+static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+}
+
+static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+}
+
+static void blk_add_trace_rq_requeue(struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+}
+
+static void blk_add_trace_rq_complete(struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
+}
+
+/**
+ * blk_add_trace_bio - Add a trace for a bio oriented action
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @what:	the action
+ *
+ * Description:
+ *     Records an action against a bio. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
+				     u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
+			!bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+}
+
+static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
+}
+
+static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
+}
+
+static void blk_add_trace_bio_backmerge(struct request_queue *q, struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+}
+
+static void blk_add_trace_bio_frontmerge(struct request_queue *q, struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+}
+
+static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+}
+
+static void blk_add_trace_getrq(struct request_queue *q, struct bio *bio, int rw)
+{
+	if (bio)
+		blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
+	else {
+		struct blk_trace *bt = q->blk_trace;
+
+		if (bt)
+			__blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
+	}
+}
+
+
+static void blk_add_trace_sleeprq(struct request_queue *q, struct bio *bio, int rw)
+{
+	if (bio)
+		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
+	else {
+		struct blk_trace *bt = q->blk_trace;
+
+		if (bt)
+			__blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, 0, 0, NULL);
+	}
+}
+
+static void blk_add_trace_plug(struct request_queue *q)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt)
+		__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+}
+
+static void blk_add_trace_unplug_io(struct request_queue *q)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt) {
+		unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
+		__be64 rpdu = cpu_to_be64(pdu);
+
+		__blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
+				sizeof(rpdu), &rpdu);
+	}
+}
+
+static void blk_add_trace_unplug_timer(struct request_queue *q)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt) {
+		unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
+		__be64 rpdu = cpu_to_be64(pdu);
+
+		__blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0,
+				sizeof(rpdu), &rpdu);
+	}
+}
+
+static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
+				unsigned int pdu)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt) {
+		__be64 rpdu = cpu_to_be64(pdu);
+
+		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
+				BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE),
+				sizeof(rpdu), &rpdu);
+	}
+}
+
+/**
+ * blk_add_trace_remap - Add a trace for a remap operation
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @dev:	target device
+ * @from:	source sector
+ * @to:		target sector
+ *
+ * Description:
+ *     Device mapper or raid target sometimes need to split a bio because
+ *     it spans a stripe (or similar). Add a trace for that action.
+ *
+ **/
+static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
+				       dev_t dev, sector_t from, sector_t to)
+{
+	struct blk_trace *bt = q->blk_trace;
+	struct blk_io_trace_remap r;
+
+	if (likely(!bt))
+		return;
+
+	r.device = cpu_to_be32(dev);
+	r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
+	r.sector = cpu_to_be64(to);
+
+	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
+			!bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+}
+
+/**
+ * blk_add_driver_data - Add binary message with driver-specific data
+ * @q:		queue the io is for
+ * @rq:		io request
+ * @data:	driver-specific data
+ * @len:	length of driver-specific data
+ *
+ * Description:
+ *     Some drivers might want to write driver-specific data per request.
+ *
+ **/
+void blk_add_driver_data(struct request_queue *q,
+			 struct request *rq,
+			 void *data, size_t len)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_pc_request(rq))
+		__blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
+				rq->errors, len, data);
+	else
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
+				0, BLK_TA_DRV_DATA, rq->errors, len, data);
+}
+EXPORT_SYMBOL_GPL(blk_add_driver_data);
+
+static int blk_register_tracepoints(void)
+{
+	int ret;
+
+	ret = register_trace_block_rq_abort(blk_add_trace_rq_abort);
+	WARN_ON(ret);
+	ret = register_trace_block_rq_insert(blk_add_trace_rq_insert);
+	WARN_ON(ret);
+	ret = register_trace_block_rq_issue(blk_add_trace_rq_issue);
+	WARN_ON(ret);
+	ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue);
+	WARN_ON(ret);
+	ret = register_trace_block_rq_complete(blk_add_trace_rq_complete);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_complete(blk_add_trace_bio_complete);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_queue(blk_add_trace_bio_queue);
+	WARN_ON(ret);
+	ret = register_trace_block_getrq(blk_add_trace_getrq);
+	WARN_ON(ret);
+	ret = register_trace_block_sleeprq(blk_add_trace_sleeprq);
+	WARN_ON(ret);
+	ret = register_trace_block_plug(blk_add_trace_plug);
+	WARN_ON(ret);
+	ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer);
+	WARN_ON(ret);
+	ret = register_trace_block_unplug_io(blk_add_trace_unplug_io);
+	WARN_ON(ret);
+	ret = register_trace_block_split(blk_add_trace_split);
+	WARN_ON(ret);
+	ret = register_trace_block_remap(blk_add_trace_remap);
+	WARN_ON(ret);
+	return 0;
+}
+
+static void blk_unregister_tracepoints(void)
+{
+	unregister_trace_block_remap(blk_add_trace_remap);
+	unregister_trace_block_split(blk_add_trace_split);
+	unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
+	unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer);
+	unregister_trace_block_plug(blk_add_trace_plug);
+	unregister_trace_block_sleeprq(blk_add_trace_sleeprq);
+	unregister_trace_block_getrq(blk_add_trace_getrq);
+	unregister_trace_block_bio_queue(blk_add_trace_bio_queue);
+	unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
+	unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
+	unregister_trace_block_bio_complete(blk_add_trace_bio_complete);
+	unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce);
+	unregister_trace_block_rq_complete(blk_add_trace_rq_complete);
+	unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue);
+	unregister_trace_block_rq_issue(blk_add_trace_rq_issue);
+	unregister_trace_block_rq_insert(blk_add_trace_rq_insert);
+	unregister_trace_block_rq_abort(blk_add_trace_rq_abort);
+
+	tracepoint_synchronize_unregister();
+}
diff --git a/block/elevator.c b/block/elevator.c
index 9ac82dde99dd..530fcfe2ef07 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -33,6 +33,7 @@
 #include <linux/compiler.h>
 #include <linux/delay.h>
 #include <linux/blktrace_api.h>
+#include <trace/block.h>
 #include <linux/hash.h>
 #include <linux/uaccess.h>
 
@@ -586,7 +587,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 	unsigned ordseq;
 	int unplug_it = 1;
 
-	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+	trace_block_rq_insert(q, rq);
 
 	rq->q = q;
 
@@ -772,7 +773,7 @@ struct request *elv_next_request(struct request_queue *q)
 			 * not be passed by new incoming requests
 			 */
 			rq->cmd_flags |= REQ_STARTED;
-			blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+			trace_block_rq_issue(q, rq);
 		}
 
 		if (!q->boundary_rq || q->boundary_rq == rq) {
@@ -921,7 +922,7 @@ void elv_abort_queue(struct request_queue *q)
 	while (!list_empty(&q->queue_head)) {
 		rq = list_entry_rq(q->queue_head.next);
 		rq->cmd_flags |= REQ_QUIET;
-		blk_add_trace_rq(q, rq, BLK_TA_ABORT);
+		trace_block_rq_abort(q, rq);
 		__blk_end_request(rq, -EIO, blk_rq_bytes(rq));
 	}
 }
-- 
cgit v1.2.3


From 0bfc24559d7945506184d86739fe365a181f06b7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 26 Nov 2008 11:59:56 +0100
Subject: blktrace: port to tracepoints, update

Port to the new tracepoints API: split DEFINE_TRACE() and DECLARE_TRACE()
sites. Spread them out to the usage sites, as suggested by
Mathieu Desnoyers.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
---
 block/blk-core.c      | 13 ++++++++
 block/elevator.c      |  5 +++
 drivers/md/dm.c       |  2 ++
 fs/bio.c              |  2 ++
 include/trace/block.h | 84 ++++++++++++++++++++++++++++++---------------------
 mm/bounce.c           |  2 ++
 6 files changed, 74 insertions(+), 34 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 04267d66a2b9..0c06cf5aaaf8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -32,6 +32,19 @@
 
 #include "blk.h"
 
+DEFINE_TRACE(block_plug);
+DEFINE_TRACE(block_unplug_io);
+DEFINE_TRACE(block_unplug_timer);
+DEFINE_TRACE(block_getrq);
+DEFINE_TRACE(block_sleeprq);
+DEFINE_TRACE(block_rq_requeue);
+DEFINE_TRACE(block_bio_backmerge);
+DEFINE_TRACE(block_bio_frontmerge);
+DEFINE_TRACE(block_bio_queue);
+DEFINE_TRACE(block_rq_complete);
+DEFINE_TRACE(block_remap);	/* Also used in drivers/md/dm.c */
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
+
 static int __make_request(struct request_queue *q, struct bio *bio);
 
 /*
diff --git a/block/elevator.c b/block/elevator.c
index 530fcfe2ef07..e5677fe4f412 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -42,6 +42,8 @@
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
 
+DEFINE_TRACE(block_rq_abort);
+
 /*
  * Merge hash stuff.
  */
@@ -53,6 +55,9 @@ static const int elv_hash_shift = 6;
 #define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
 #define ELV_ON_HASH(rq)		(!hlist_unhashed(&(rq)->hash))
 
+DEFINE_TRACE(block_rq_insert);
+DEFINE_TRACE(block_rq_issue);
+
 /*
  * Query io scheduler to see if the current process issuing bio may be
  * merged with rq.
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index d23fda178163..343094c3feeb 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -52,6 +52,8 @@ struct dm_target_io {
 	union map_info info;
 };
 
+DEFINE_TRACE(block_bio_complete);
+
 union map_info *dm_get_mapinfo(struct bio *bio)
 {
 	if (bio && bio->bi_private)
diff --git a/fs/bio.c b/fs/bio.c
index 060859c69092..df99c882b807 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -29,6 +29,8 @@
 #include <trace/block.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
+DEFINE_TRACE(block_split);
+
 static struct kmem_cache *bio_slab __read_mostly;
 
 static mempool_t *bio_split_pool __read_mostly;
diff --git a/include/trace/block.h b/include/trace/block.h
index 3cc2675ebf01..25c6a1fd5b77 100644
--- a/include/trace/block.h
+++ b/include/trace/block.h
@@ -4,57 +4,73 @@
 #include <linux/blkdev.h>
 #include <linux/tracepoint.h>
 
-DEFINE_TRACE(block_rq_abort,
+DECLARE_TRACE(block_rq_abort,
 	TPPROTO(struct request_queue *q, struct request *rq),
-	TPARGS(q, rq));
-DEFINE_TRACE(block_rq_insert,
+		TPARGS(q, rq));
+
+DECLARE_TRACE(block_rq_insert,
 	TPPROTO(struct request_queue *q, struct request *rq),
-	TPARGS(q, rq));
-DEFINE_TRACE(block_rq_issue,
+		TPARGS(q, rq));
+
+DECLARE_TRACE(block_rq_issue,
 	TPPROTO(struct request_queue *q, struct request *rq),
-	TPARGS(q, rq));
-DEFINE_TRACE(block_rq_requeue,
+		TPARGS(q, rq));
+
+DECLARE_TRACE(block_rq_requeue,
 	TPPROTO(struct request_queue *q, struct request *rq),
-	TPARGS(q, rq));
-DEFINE_TRACE(block_rq_complete,
+		TPARGS(q, rq));
+
+DECLARE_TRACE(block_rq_complete,
 	TPPROTO(struct request_queue *q, struct request *rq),
-	TPARGS(q, rq));
-DEFINE_TRACE(block_bio_bounce,
+		TPARGS(q, rq));
+
+DECLARE_TRACE(block_bio_bounce,
 	TPPROTO(struct request_queue *q, struct bio *bio),
-	TPARGS(q, bio));
-DEFINE_TRACE(block_bio_complete,
+		TPARGS(q, bio));
+
+DECLARE_TRACE(block_bio_complete,
 	TPPROTO(struct request_queue *q, struct bio *bio),
-	TPARGS(q, bio));
-DEFINE_TRACE(block_bio_backmerge,
+		TPARGS(q, bio));
+
+DECLARE_TRACE(block_bio_backmerge,
 	TPPROTO(struct request_queue *q, struct bio *bio),
-	TPARGS(q, bio));
-DEFINE_TRACE(block_bio_frontmerge,
+		TPARGS(q, bio));
+
+DECLARE_TRACE(block_bio_frontmerge,
 	TPPROTO(struct request_queue *q, struct bio *bio),
-	TPARGS(q, bio));
-DEFINE_TRACE(block_bio_queue,
+		TPARGS(q, bio));
+
+DECLARE_TRACE(block_bio_queue,
 	TPPROTO(struct request_queue *q, struct bio *bio),
-	TPARGS(q, bio));
-DEFINE_TRACE(block_getrq,
+		TPARGS(q, bio));
+
+DECLARE_TRACE(block_getrq,
 	TPPROTO(struct request_queue *q, struct bio *bio, int rw),
-	TPARGS(q, bio, rw));
-DEFINE_TRACE(block_sleeprq,
+		TPARGS(q, bio, rw));
+
+DECLARE_TRACE(block_sleeprq,
 	TPPROTO(struct request_queue *q, struct bio *bio, int rw),
-	TPARGS(q, bio, rw));
-DEFINE_TRACE(block_plug,
+		TPARGS(q, bio, rw));
+
+DECLARE_TRACE(block_plug,
 	TPPROTO(struct request_queue *q),
-	TPARGS(q));
-DEFINE_TRACE(block_unplug_timer,
+		TPARGS(q));
+
+DECLARE_TRACE(block_unplug_timer,
 	TPPROTO(struct request_queue *q),
-	TPARGS(q));
-DEFINE_TRACE(block_unplug_io,
+		TPARGS(q));
+
+DECLARE_TRACE(block_unplug_io,
 	TPPROTO(struct request_queue *q),
-	TPARGS(q));
-DEFINE_TRACE(block_split,
+		TPARGS(q));
+
+DECLARE_TRACE(block_split,
 	TPPROTO(struct request_queue *q, struct bio *bio, unsigned int pdu),
-	TPARGS(q, bio, pdu));
-DEFINE_TRACE(block_remap,
+		TPARGS(q, bio, pdu));
+
+DECLARE_TRACE(block_remap,
 	TPPROTO(struct request_queue *q, struct bio *bio, dev_t dev,
 		sector_t from, sector_t to),
-	TPARGS(q, bio, dev, from, to));
+		TPARGS(q, bio, dev, from, to));
 
 #endif
diff --git a/mm/bounce.c b/mm/bounce.c
index bd1caaa582b8..bf0cf7c8387b 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -22,6 +22,8 @@
 
 static mempool_t *page_pool, *isa_page_pool;
 
+DEFINE_TRACE(block_bio_bounce);
+
 #ifdef CONFIG_HIGHMEM
 static __init int init_emergency_pool(void)
 {
-- 
cgit v1.2.3


From 565e411d764eeda006738dfadbccca79d48381e1 Mon Sep 17 00:00:00 2001
From: "malahal@us.ibm.com" <malahal@us.ibm.com>
Date: Thu, 30 Oct 2008 08:51:58 +0100
Subject: block: optimizations in blk_rq_timed_out_timer()

Now the rq->deadline can't be zero if the request is in the
timeout_list, so there is no need to have next_set. There is no need to
access a request's deadline field if blk_rq_timed_out is called on it.

Signed-off-by: Malahal Naineni <malahal@us.ibm.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-timeout.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'block')

diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 69185ea9fae2..116bbf394fb5 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -111,7 +111,7 @@ static void blk_rq_timed_out(struct request *req)
 void blk_rq_timed_out_timer(unsigned long data)
 {
 	struct request_queue *q = (struct request_queue *) data;
-	unsigned long flags, uninitialized_var(next), next_set = 0;
+	unsigned long flags, next = 0;
 	struct request *rq, *tmp;
 
 	spin_lock_irqsave(q->queue_lock, flags);
@@ -126,12 +126,10 @@ void blk_rq_timed_out_timer(unsigned long data)
 			if (blk_mark_rq_complete(rq))
 				continue;
 			blk_rq_timed_out(rq);
+		} else {
+			if (!next || time_after(next, rq->deadline))
+				next = rq->deadline;
 		}
-		if (!next_set) {
-			next = rq->deadline;
-			next_set = 1;
-		} else if (time_after(next, rq->deadline))
-			next = rq->deadline;
 	}
 
 	if (next_set && !list_empty(&q->timeout_list))
-- 
cgit v1.2.3


From 65d3618ccfe686e8d7b3f01a838d0578182406df Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 30 Oct 2008 08:53:02 +0100
Subject: block: add comment in blk_rq_timed_out() about why next can not be 0

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-timeout.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 116bbf394fb5..99c3efc706b7 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -132,7 +132,12 @@ void blk_rq_timed_out_timer(unsigned long data)
 		}
 	}
 
-	if (next_set && !list_empty(&q->timeout_list))
+	/*
+	 * next can never be 0 here with the list non-empty, since we always
+	 * bump ->deadline to 1 so we can detect if the timer was ever added
+	 * or not. See comment in blk_add_timer()
+	 */
+	if (next)
 		mod_timer(&q->timeout, round_jiffies_up(next));
 
 	spin_unlock_irqrestore(q->queue_lock, flags);
-- 
cgit v1.2.3


From 70ed28b92a786f44750ab64117b03d126dd14656 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 19 Nov 2008 14:38:39 +0100
Subject: block: leave the request timeout timer running even on an empty list

For sync IO, we'll often do them serialized. This means we'll be touching
the queue timer for every IO, as opposed to only occasionally like we
do for queued IO. Instead of deleting the timer when the last request
is removed, just let it continue running. If a new request comes up soon
we then don't have to re-add the timer. If no new requests arrive,
the timer will expire without side effect later.

This improves high iops sync IO by ~1%.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c    | 1 +
 block/blk-timeout.c | 4 ----
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 561e8a1b43a4..243d18b4ceb0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -404,6 +404,7 @@ EXPORT_SYMBOL(blk_stop_queue);
 void blk_sync_queue(struct request_queue *q)
 {
 	del_timer_sync(&q->unplug_timer);
+	del_timer_sync(&q->timeout);
 	kblockd_flush_work(&q->unplug_work);
 }
 EXPORT_SYMBOL(blk_sync_queue);
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 99c3efc706b7..a09535377a94 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -73,11 +73,7 @@ ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr,
  */
 void blk_delete_timer(struct request *req)
 {
-	struct request_queue *q = req->q;
-
 	list_del_init(&req->timeout_list);
-	if (list_empty(&q->timeout_list))
-		del_timer(&q->timeout);
 }
 
 static void blk_rq_timed_out(struct request *req)
-- 
cgit v1.2.3


From 2b91bafcc0fc545e489e9537a38f487706960ea5 Mon Sep 17 00:00:00 2001
From: Milton Miller <miltonm@bga.com>
Date: Mon, 17 Nov 2008 13:10:34 +0100
Subject: scsi-ioctl: use clock_t <> jiffies

Convert the timeout ioctl scaling to use the clock_t functions
which are much more accurate with some USER_HZ vs HZ combinations.

Signed-off-by: Milton Miller <miltonm@bga.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/scsi_ioctl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index d0bb92cbefb9..ee9c67d7e1be 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -60,7 +60,7 @@ static int scsi_get_bus(struct request_queue *q, int __user *p)
 
 static int sg_get_timeout(struct request_queue *q)
 {
-	return q->sg_timeout / (HZ / USER_HZ);
+	return jiffies_to_clock_t(q->sg_timeout);
 }
 
 static int sg_set_timeout(struct request_queue *q, int __user *p)
@@ -68,7 +68,7 @@ static int sg_set_timeout(struct request_queue *q, int __user *p)
 	int timeout, err = get_user(timeout, p);
 
 	if (!err)
-		q->sg_timeout = timeout * (HZ / USER_HZ);
+		q->sg_timeout = clock_t_to_jiffies(timeout);
 
 	return err;
 }
-- 
cgit v1.2.3


From c6a06f707cc29ea3a47588e4d2cd0bdcfa311a7d Mon Sep 17 00:00:00 2001
From: Qinghuang Feng <qhfeng.kernel@gmail.com>
Date: Mon, 24 Nov 2008 10:43:36 +0100
Subject: block/blk-tag.c: cleanup kernel-doc

There is no argument named @tags in blk_init_tags,
so remove its comment.

Signed-off-by: Qinghuang Feng <qhfeng.kernel@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-tag.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-tag.c b/block/blk-tag.c
index c0d419e84ce7..3c518e3303ae 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -158,7 +158,6 @@ fail:
 /**
  * blk_init_tags - initialize the tag info for an external tag map
  * @depth:	the maximum queue depth supported
- * @tags: the tag to use
  **/
 struct blk_queue_tag *blk_init_tags(int depth)
 {
-- 
cgit v1.2.3


From 7c239517d9f18427fc2e7ed259fb3b866595f5af Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Tue, 25 Nov 2008 09:08:39 +0100
Subject: block: don't take lock on changing ra_pages

There's no need to take queue_lock or kernel_lock when modifying
bdi->ra_pages. So remove them. Also remove out of date comment for
queue_max_sectors_store().

Signed-off-by: Wu Fengguang <wfg@linux.intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-sysfs.c    | 7 +------
 block/compat_ioctl.c | 2 --
 block/ioctl.c        | 2 --
 3 files changed, 1 insertion(+), 10 deletions(-)

(limited to 'block')

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 21e275d7eed9..a29cb788e408 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -88,9 +88,7 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count)
 	unsigned long ra_kb;
 	ssize_t ret = queue_var_store(&ra_kb, page, count);
 
-	spin_lock_irq(q->queue_lock);
 	q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
-	spin_unlock_irq(q->queue_lock);
 
 	return ret;
 }
@@ -117,10 +115,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 
 	if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
 		return -EINVAL;
-	/*
-	 * Take the queue lock to update the readahead and max_sectors
-	 * values synchronously:
-	 */
+
 	spin_lock_irq(q->queue_lock);
 	q->max_sectors = max_sectors_kb << 1;
 	spin_unlock_irq(q->queue_lock);
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 67eb93cff699..f87615dea46b 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -774,9 +774,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 		bdi = blk_get_backing_dev_info(bdev);
 		if (bdi == NULL)
 			return -ENOTTY;
-		lock_kernel();
 		bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
-		unlock_kernel();
 		return 0;
 	case BLKGETSIZE:
 		size = bdev->bd_inode->i_size;
diff --git a/block/ioctl.c b/block/ioctl.c
index d03985b04d67..0f22e629b13c 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -323,9 +323,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		bdi = blk_get_backing_dev_info(bdev);
 		if (bdi == NULL)
 			return -ENOTTY;
-		lock_kernel();
 		bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
-		unlock_kernel();
 		return 0;
 	case BLKBSZSET:
 		/* set the logical block size */
-- 
cgit v1.2.3


From 08bafc0341f2f7920e9045bc32c40299cac8c21b Mon Sep 17 00:00:00 2001
From: Keith Mannthey <kmannth@us.ibm.com>
Date: Tue, 25 Nov 2008 10:24:35 +0100
Subject: block: Suppress Buffer I/O errors when SCSI REQ_QUIET flag set

Allow the scsi request REQ_QUIET flag to be propagated to the buffer
file system layer. The basic idea is to pass the flag from the scsi
request to the bio (block IO) and then to the buffer layer.  The buffer
layer can then suppress needless printks.

This patch declutters the kernel log by removing the 40-50 (per lun)
buffer io error messages seen during a boot in my multipath setup. There
is a good chance any real errors will be missed in the "noise" in the
logs without this patch.

During boot I see blocks of messages like
"
__ratelimit: 211 callbacks suppressed
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242847
Buffer I/O error on device sdm, logical block 1
Buffer I/O error on device sdm, logical block 5242878
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242872
"
in my logs.

My disk environment is multipath fiber channel using the SCSI_DH_RDAC
code and multipathd.  This topology includes an "active" and "ghost"
path for each lun. IO's to the "ghost" path will never complete and the
SCSI layer, via the scsi device handler rdac code, quickly returns the IOs
to these paths and sets the REQ_QUIET scsi flag to suppress the scsi
layer messages.

I want to extend the QUIET behavior to include the buffer file
system layer to deal with these errors as well. I have been running this
patch for a while now on several boxes without issue.  A few runs of
bonnie++ show no noticeable difference in performance in my setup.

Thanks to John Stultz for the quiet_error finalization.

Submitted-by:  Keith Mannthey <kmannth@us.ibm.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c            |  3 +++
 fs/buffer.c                 | 19 +++++++++++++++----
 include/linux/bio.h         |  1 +
 include/linux/buffer_head.h |  1 +
 4 files changed, 20 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 243d18b4ceb0..20e1724ccb4c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -153,6 +153,9 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 			nbytes = bio->bi_size;
 		}
 
+		if (unlikely(rq->cmd_flags & REQ_QUIET))
+			set_bit(BIO_QUIET, &bio->bi_flags);
+
 		bio->bi_size -= nbytes;
 		bio->bi_sector += (nbytes >> 9);
 
diff --git a/fs/buffer.c b/fs/buffer.c
index 10179cfa1152..776ae091d3b0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -99,10 +99,18 @@ __clear_page_buffers(struct page *page)
 	page_cache_release(page);
 }
 
+
+static int quiet_error(struct buffer_head *bh)
+{
+	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
+		return 0;
+	return 1;
+}
+
+
 static void buffer_io_error(struct buffer_head *bh)
 {
 	char b[BDEVNAME_SIZE];
-
 	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
 			bdevname(bh->b_bdev, b),
 			(unsigned long long)bh->b_blocknr);
@@ -144,7 +152,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
+		if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
 			buffer_io_error(bh);
 			printk(KERN_WARNING "lost page write due to "
 					"I/O error on %s\n",
@@ -394,7 +402,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		set_buffer_uptodate(bh);
 	} else {
 		clear_buffer_uptodate(bh);
-		if (printk_ratelimit())
+		if (!quiet_error(bh))
 			buffer_io_error(bh);
 		SetPageError(page);
 	}
@@ -455,7 +463,7 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (printk_ratelimit()) {
+		if (!quiet_error(bh)) {
 			buffer_io_error(bh);
 			printk(KERN_WARNING "lost page write due to "
 					"I/O error on %s\n",
@@ -2913,6 +2921,9 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
 		set_bit(BH_Eopnotsupp, &bh->b_state);
 	}
 
+	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
+		set_bit(BH_Quiet, &bh->b_state);
+
 	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
 	bio_put(bio);
 }
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 6a642098e5c3..cf132bfbbacf 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -117,6 +117,7 @@ struct bio {
 #define BIO_CPU_AFFINE	8	/* complete bio on same CPU as submitted */
 #define BIO_NULL_MAPPED 9	/* contains invalid user pages */
 #define BIO_FS_INTEGRITY 10	/* fs owns integrity data, not block layer */
+#define BIO_QUIET	11	/* Make BIO Quiet */
 #define bio_flagged(bio, flag)	((bio)->bi_flags & (1 << (flag)))
 
 /*
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 3ce64b90118c..8605f8a74df9 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -35,6 +35,7 @@ enum bh_state_bits {
 	BH_Ordered,	/* ordered write */
 	BH_Eopnotsupp,	/* operation not supported (barrier) */
 	BH_Unwritten,	/* Buffer is allocated on disk but not written */
+	BH_Quiet,	/* Buffer Error Prinks to be quiet */
 
 	BH_PrivateStart,/* not a state bit, but the first bit available
 			 * for private allocation by other entities
-- 
cgit v1.2.3


From 64d01dc9e1927e6535627d73f2336c75d1dd3fe2 Mon Sep 17 00:00:00 2001
From: Cheng Renquan <crquan@gmail.com>
Date: Wed, 3 Dec 2008 12:41:39 +0100
Subject: block: use cancel_work_sync() instead of kblockd_flush_work()

After many improvements on kblockd_flush_work, it is now identical to
cancel_work_sync, so a direct call to cancel_work_sync is suggested.

The only difference is that cancel_work_sync is a GPL-only symbol,
so non-GPL modules can no longer use this functionality.

Signed-off-by: Cheng Renquan <crquan@gmail.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/as-iosched.c     | 2 +-
 block/blk-core.c       | 8 +-------
 block/cfq-iosched.c    | 2 +-
 include/linux/blkdev.h | 1 -
 4 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'block')

diff --git a/block/as-iosched.c b/block/as-iosched.c
index 71f0abb219ee..802b5d0d8536 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -1344,7 +1344,7 @@ static void as_exit_queue(elevator_t *e)
 	struct as_data *ad = e->elevator_data;
 
 	del_timer_sync(&ad->antic_timer);
-	kblockd_flush_work(&ad->antic_work);
+	cancel_work_sync(&ad->antic_work);
 
 	BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC]));
 	BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC]));
diff --git a/block/blk-core.c b/block/blk-core.c
index 20e1724ccb4c..2fdcd0cff57f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -408,7 +408,7 @@ void blk_sync_queue(struct request_queue *q)
 {
 	del_timer_sync(&q->unplug_timer);
 	del_timer_sync(&q->timeout);
-	kblockd_flush_work(&q->unplug_work);
+	cancel_work_sync(&q->unplug_work);
 }
 EXPORT_SYMBOL(blk_sync_queue);
 
@@ -2147,12 +2147,6 @@ int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
 }
 EXPORT_SYMBOL(kblockd_schedule_work);
 
-void kblockd_flush_work(struct work_struct *work)
-{
-	cancel_work_sync(work);
-}
-EXPORT_SYMBOL(kblockd_flush_work);
-
 int __init blk_dev_init(void)
 {
 	kblockd_workqueue = create_workqueue("kblockd");
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 6a062eebbd15..a2bfec7d6b36 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2160,7 +2160,7 @@ out_cont:
 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
 {
 	del_timer_sync(&cfqd->idle_slice_timer);
-	kblockd_flush_work(&cfqd->unplug_work);
+	cancel_work_sync(&cfqd->unplug_work);
 }
 
 static void cfq_put_async_queues(struct cfq_data *cfqd)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 482e9600f7a2..e9bb73ff1d64 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -978,7 +978,6 @@ static inline void put_dev_sector(Sector p)
 
 struct work_struct;
 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
-void kblockd_flush_work(struct work_struct *work);
 
 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
 	MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
-- 
cgit v1.2.3


From 313e42999dbc0f234ca5909a236f78f082cb43b1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 28 Nov 2008 13:32:02 +0900
Subject: block: reorganize QUEUE_ORDERED_* constants

Separate out ordering type (drain, tag) and action masks (preflush,
postflush, fua) from visible ordering mode selectors
(QUEUE_ORDERED_*).  Ordering types are now named QUEUE_ORDERED_BY_*
while action masks are named QUEUE_ORDERED_DO_*.

This change is necessary to add QUEUE_ORDERED_DO_BAR and make it
optional to improve empty barrier implementation.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c    | 20 ++++++++++----------
 include/linux/blkdev.h | 39 +++++++++++++++++++++++----------------
 2 files changed, 33 insertions(+), 26 deletions(-)

(limited to 'block')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 6e72d661ae42..1d7adc72c95d 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -24,8 +24,8 @@
 int blk_queue_ordered(struct request_queue *q, unsigned ordered,
 		      prepare_flush_fn *prepare_flush_fn)
 {
-	if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) &&
-	    prepare_flush_fn == NULL) {
+	if (!prepare_flush_fn && (ordered & (QUEUE_ORDERED_DO_PREFLUSH |
+					     QUEUE_ORDERED_DO_POSTFLUSH))) {
 		printk(KERN_ERR "%s: prepare_flush_fn required\n", __func__);
 		return -EINVAL;
 	}
@@ -134,7 +134,7 @@ static void queue_flush(struct request_queue *q, unsigned which)
 	struct request *rq;
 	rq_end_io_fn *end_io;
 
-	if (which == QUEUE_ORDERED_PREFLUSH) {
+	if (which == QUEUE_ORDERED_DO_PREFLUSH) {
 		rq = &q->pre_flush_rq;
 		end_io = pre_flush_end_io;
 	} else {
@@ -167,7 +167,7 @@ static inline struct request *start_ordered(struct request_queue *q,
 	blk_rq_init(q, rq);
 	if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
 		rq->cmd_flags |= REQ_RW;
-	if (q->ordered & QUEUE_ORDERED_FUA)
+	if (q->ordered & QUEUE_ORDERED_DO_FUA)
 		rq->cmd_flags |= REQ_FUA;
 	init_request_from_bio(rq, q->orig_bar_rq->bio);
 	rq->end_io = bar_end_io;
@@ -181,20 +181,20 @@ static inline struct request *start_ordered(struct request_queue *q,
 	 * there will be no data written between the pre and post flush.
 	 * Hence a single flush will suffice.
 	 */
-	if ((q->ordered & QUEUE_ORDERED_POSTFLUSH) && !blk_empty_barrier(rq))
-		queue_flush(q, QUEUE_ORDERED_POSTFLUSH);
+	if ((q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) && !blk_empty_barrier(rq))
+		queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
 	else
 		q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH;
 
 	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
 
-	if (q->ordered & QUEUE_ORDERED_PREFLUSH) {
-		queue_flush(q, QUEUE_ORDERED_PREFLUSH);
+	if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) {
+		queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH);
 		rq = &q->pre_flush_rq;
 	} else
 		q->ordseq |= QUEUE_ORDSEQ_PREFLUSH;
 
-	if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0)
+	if ((q->ordered & QUEUE_ORDERED_BY_TAG) || q->in_flight == 0)
 		q->ordseq |= QUEUE_ORDSEQ_DRAIN;
 	else
 		rq = NULL;
@@ -237,7 +237,7 @@ int blk_do_ordered(struct request_queue *q, struct request **rqp)
 	    rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
 		return 1;
 
-	if (q->ordered & QUEUE_ORDERED_TAG) {
+	if (q->ordered & QUEUE_ORDERED_BY_TAG) {
 		/* Ordered by tag.  Blocking the next barrier is enough. */
 		if (is_barrier && rq != &q->bar_rq)
 			*rqp = NULL;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e9bb73ff1d64..5c92b4432399 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -523,22 +523,29 @@ enum {
 	 * TAG_FLUSH	: ordering by tag w/ pre and post flushes
 	 * TAG_FUA	: ordering by tag w/ pre flush and FUA write
 	 */
-	QUEUE_ORDERED_NONE	= 0x00,
-	QUEUE_ORDERED_DRAIN	= 0x01,
-	QUEUE_ORDERED_TAG	= 0x02,
-
-	QUEUE_ORDERED_PREFLUSH	= 0x10,
-	QUEUE_ORDERED_POSTFLUSH	= 0x20,
-	QUEUE_ORDERED_FUA	= 0x40,
-
-	QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN |
-			QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH,
-	QUEUE_ORDERED_DRAIN_FUA	= QUEUE_ORDERED_DRAIN |
-			QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA,
-	QUEUE_ORDERED_TAG_FLUSH	= QUEUE_ORDERED_TAG |
-			QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH,
-	QUEUE_ORDERED_TAG_FUA	= QUEUE_ORDERED_TAG |
-			QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA,
+	QUEUE_ORDERED_BY_DRAIN		= 0x01,
+	QUEUE_ORDERED_BY_TAG		= 0x02,
+	QUEUE_ORDERED_DO_PREFLUSH	= 0x10,
+	QUEUE_ORDERED_DO_POSTFLUSH	= 0x40,
+	QUEUE_ORDERED_DO_FUA		= 0x80,
+
+	QUEUE_ORDERED_NONE		= 0x00,
+
+	QUEUE_ORDERED_DRAIN		= QUEUE_ORDERED_BY_DRAIN,
+	QUEUE_ORDERED_DRAIN_FLUSH	= QUEUE_ORDERED_DRAIN |
+					  QUEUE_ORDERED_DO_PREFLUSH |
+					  QUEUE_ORDERED_DO_POSTFLUSH,
+	QUEUE_ORDERED_DRAIN_FUA		= QUEUE_ORDERED_DRAIN |
+					  QUEUE_ORDERED_DO_PREFLUSH |
+					  QUEUE_ORDERED_DO_FUA,
+
+	QUEUE_ORDERED_TAG		= QUEUE_ORDERED_BY_TAG,
+	QUEUE_ORDERED_TAG_FLUSH		= QUEUE_ORDERED_TAG |
+					  QUEUE_ORDERED_DO_PREFLUSH |
+					  QUEUE_ORDERED_DO_POSTFLUSH,
+	QUEUE_ORDERED_TAG_FUA		= QUEUE_ORDERED_TAG |
+					  QUEUE_ORDERED_DO_PREFLUSH |
+					  QUEUE_ORDERED_DO_FUA,
 
 	/*
 	 * Ordered operation sequence
-- 
cgit v1.2.3


From a7384677b2f4cd40948fd7ce024ba5e1821444ba Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 28 Nov 2008 13:32:03 +0900
Subject: block: remove duplicate or unused barrier/discard error paths

* Because barrier mode can be changed dynamically, whether barrier is
  supported or not can be determined only when actually issuing the
  barrier and there is no point in checking it earlier.  Drop barrier
  support check in generic_make_request() and __make_request(), and
  update comment around the support check in blk_do_ordered().

* There is no reason to check discard support in both
  generic_make_request() and __make_request().  Drop the check in
  __make_request().  While at it, move error action block to the end
  of the function and add unlikely() to q existence test.

* Barrier request, be it empty or not, is never passed to low level
  driver and thus it's meaningless to try to copy back req->sector to
  bio->bi_sector on error.  In addition, the notion of failed sector
  doesn't make any sense for empty barrier to begin with.  Drop the
  code block from __end_that_request_first().

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c |  4 ++--
 block/blk-core.c    | 44 +++++++++++---------------------------------
 2 files changed, 13 insertions(+), 35 deletions(-)

(limited to 'block')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 1d7adc72c95d..43d479a1e664 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -216,8 +216,8 @@ int blk_do_ordered(struct request_queue *q, struct request **rqp)
 			return 1;
 		} else {
 			/*
-			 * This can happen when the queue switches to
-			 * ORDERED_NONE while this request is on it.
+			 * Queue ordering not supported.  Terminate
+			 * with prejudice.
 			 */
 			elv_dequeue_request(q, rq);
 			if (__blk_end_request(rq, -EOPNOTSUPP,
diff --git a/block/blk-core.c b/block/blk-core.c
index 2fdcd0cff57f..b1fd4f5f07d3 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1139,7 +1139,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 static int __make_request(struct request_queue *q, struct bio *bio)
 {
 	struct request *req;
-	int el_ret, nr_sectors, barrier, discard, err;
+	int el_ret, nr_sectors;
 	const unsigned short prio = bio_prio(bio);
 	const int sync = bio_sync(bio);
 	int rw_flags;
@@ -1153,22 +1153,9 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	 */
 	blk_queue_bounce(q, &bio);
 
-	barrier = bio_barrier(bio);
-	if (unlikely(barrier) && bio_has_data(bio) &&
-	    (q->next_ordered == QUEUE_ORDERED_NONE)) {
-		err = -EOPNOTSUPP;
-		goto end_io;
-	}
-
-	discard = bio_discard(bio);
-	if (unlikely(discard) && !q->prepare_discard_fn) {
-		err = -EOPNOTSUPP;
-		goto end_io;
-	}
-
 	spin_lock_irq(q->queue_lock);
 
-	if (unlikely(barrier) || elv_queue_empty(q))
+	if (unlikely(bio_barrier(bio)) || elv_queue_empty(q))
 		goto get_rq;
 
 	el_ret = elv_merge(q, &req, bio);
@@ -1262,10 +1249,6 @@ out:
 		__generic_unplug_device(q);
 	spin_unlock_irq(q->queue_lock);
 	return 0;
-
-end_io:
-	bio_endio(bio, err);
-	return 0;
 }
 
 /*
@@ -1418,15 +1401,13 @@ static inline void __generic_make_request(struct bio *bio)
 		char b[BDEVNAME_SIZE];
 
 		q = bdev_get_queue(bio->bi_bdev);
-		if (!q) {
+		if (unlikely(!q)) {
 			printk(KERN_ERR
 			       "generic_make_request: Trying to access "
 				"nonexistent block-device %s (%Lu)\n",
 				bdevname(bio->bi_bdev, b),
 				(long long) bio->bi_sector);
-end_io:
-			bio_endio(bio, err);
-			break;
+			goto end_io;
 		}
 
 		if (unlikely(nr_sectors > q->max_hw_sectors)) {
@@ -1463,14 +1444,19 @@ end_io:
 
 		if (bio_check_eod(bio, nr_sectors))
 			goto end_io;
-		if ((bio_empty_barrier(bio) && !q->prepare_flush_fn) ||
-		    (bio_discard(bio) && !q->prepare_discard_fn)) {
+
+		if (bio_discard(bio) && !q->prepare_discard_fn) {
 			err = -EOPNOTSUPP;
 			goto end_io;
 		}
 
 		ret = q->make_request_fn(q, bio);
 	} while (ret);
+
+	return;
+
+end_io:
+	bio_endio(bio, err);
 }
 
 /*
@@ -1720,14 +1706,6 @@ static int __end_that_request_first(struct request *req, int error,
 	while ((bio = req->bio) != NULL) {
 		int nbytes;
 
-		/*
-		 * For an empty barrier request, the low level driver must
-		 * store a potential error location in ->sector. We pass
-		 * that back up in ->bi_sector.
-		 */
-		if (blk_empty_barrier(req))
-			bio->bi_sector = req->sector;
-
 		if (nr_bytes >= bio->bi_size) {
 			req->bio = bio->bi_next;
 			nbytes = bio->bi_size;
-- 
cgit v1.2.3


From f671620e7d895af221bdfeda751d54fa55ed9546 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 28 Nov 2008 13:32:04 +0900
Subject: block: make every barrier action optional

In all barrier sequences, the barrier write itself was always assumed
to be issued and thus didn't have a corresponding control flag.  This
patch adds QUEUE_ORDERED_DO_BAR and unifies action mask handling in
start_ordered() such that any barrier action can be skipped.

This patch doesn't introduce any visible behavior changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c    | 41 ++++++++++++++++++++++++-----------------
 include/linux/blkdev.h |  7 +++++--
 2 files changed, 29 insertions(+), 19 deletions(-)

(limited to 'block')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 43d479a1e664..1efabf829c53 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -158,19 +158,10 @@ static inline struct request *start_ordered(struct request_queue *q,
 	q->ordered = q->next_ordered;
 	q->ordseq |= QUEUE_ORDSEQ_STARTED;
 
-	/*
-	 * Prep proxy barrier request.
-	 */
+	/* stash away the original request */
 	elv_dequeue_request(q, rq);
 	q->orig_bar_rq = rq;
-	rq = &q->bar_rq;
-	blk_rq_init(q, rq);
-	if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
-		rq->cmd_flags |= REQ_RW;
-	if (q->ordered & QUEUE_ORDERED_DO_FUA)
-		rq->cmd_flags |= REQ_FUA;
-	init_request_from_bio(rq, q->orig_bar_rq->bio);
-	rq->end_io = bar_end_io;
+	rq = NULL;
 
 	/*
 	 * Queue ordered sequence.  As we stack them at the head, we
@@ -181,12 +172,28 @@ static inline struct request *start_ordered(struct request_queue *q,
 	 * there will be no data written between the pre and post flush.
 	 * Hence a single flush will suffice.
 	 */
-	if ((q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) && !blk_empty_barrier(rq))
+	if ((q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) &&
+	    !blk_empty_barrier(q->orig_bar_rq)) {
 		queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
-	else
+		rq = &q->post_flush_rq;
+	} else
 		q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH;
 
-	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
+	if (q->ordered & QUEUE_ORDERED_DO_BAR) {
+		rq = &q->bar_rq;
+
+		/* initialize proxy request and queue it */
+		blk_rq_init(q, rq);
+		if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
+			rq->cmd_flags |= REQ_RW;
+		if (q->ordered & QUEUE_ORDERED_DO_FUA)
+			rq->cmd_flags |= REQ_FUA;
+		init_request_from_bio(rq, q->orig_bar_rq->bio);
+		rq->end_io = bar_end_io;
+
+		elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
+	} else
+		q->ordseq |= QUEUE_ORDSEQ_BAR;
 
 	if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) {
 		queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH);
@@ -194,10 +201,10 @@ static inline struct request *start_ordered(struct request_queue *q,
 	} else
 		q->ordseq |= QUEUE_ORDSEQ_PREFLUSH;
 
-	if ((q->ordered & QUEUE_ORDERED_BY_TAG) || q->in_flight == 0)
-		q->ordseq |= QUEUE_ORDSEQ_DRAIN;
-	else
+	if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && q->in_flight)
 		rq = NULL;
+	else
+		q->ordseq |= QUEUE_ORDSEQ_DRAIN;
 
 	return rq;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5c92b4432399..b044267009ed 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -526,12 +526,14 @@ enum {
 	QUEUE_ORDERED_BY_DRAIN		= 0x01,
 	QUEUE_ORDERED_BY_TAG		= 0x02,
 	QUEUE_ORDERED_DO_PREFLUSH	= 0x10,
+	QUEUE_ORDERED_DO_BAR		= 0x20,
 	QUEUE_ORDERED_DO_POSTFLUSH	= 0x40,
 	QUEUE_ORDERED_DO_FUA		= 0x80,
 
 	QUEUE_ORDERED_NONE		= 0x00,
 
-	QUEUE_ORDERED_DRAIN		= QUEUE_ORDERED_BY_DRAIN,
+	QUEUE_ORDERED_DRAIN		= QUEUE_ORDERED_BY_DRAIN |
+					  QUEUE_ORDERED_DO_BAR,
 	QUEUE_ORDERED_DRAIN_FLUSH	= QUEUE_ORDERED_DRAIN |
 					  QUEUE_ORDERED_DO_PREFLUSH |
 					  QUEUE_ORDERED_DO_POSTFLUSH,
@@ -539,7 +541,8 @@ enum {
 					  QUEUE_ORDERED_DO_PREFLUSH |
 					  QUEUE_ORDERED_DO_FUA,
 
-	QUEUE_ORDERED_TAG		= QUEUE_ORDERED_BY_TAG,
+	QUEUE_ORDERED_TAG		= QUEUE_ORDERED_BY_TAG |
+					  QUEUE_ORDERED_DO_BAR,
 	QUEUE_ORDERED_TAG_FLUSH		= QUEUE_ORDERED_TAG |
 					  QUEUE_ORDERED_DO_PREFLUSH |
 					  QUEUE_ORDERED_DO_POSTFLUSH,
-- 
cgit v1.2.3


From 8f11b3e99a1136fcbb67316c3260f085299c0bff Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 28 Nov 2008 13:32:05 +0900
Subject: block: make barrier completion more robust

Barrier completion had the following assumptions.

* start_ordered() couldn't finish the whole sequence properly.  If all
  actions are to be skipped, q->ordseq is set correctly but the actual
  completion was never triggered thus hanging the barrier request.

* Drain completion in elv_complete_request() assumed that there's
  always at least one request in the queue when drain completes.

Both assumptions are true but these assumptions need to be removed to
improve empty barrier implementation.  This patch makes the following
changes.

* Make start_ordered() use blk_ordered_complete_seq() to mark skipped
  steps complete and notify __elv_next_request() that it should fetch
  the next request if the whole barrier has completed inside
  start_ordered().

* Make drain completion path in elv_complete_request() check whether
  the queue is empty.  Empty queue also indicates drain completion.

* While at it, convert 0/1 return from blk_do_ordered() to false/true.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c    | 45 +++++++++++++++++++++++++++------------------
 block/elevator.c       | 10 +++++++---
 include/linux/blkdev.h |  4 ++--
 3 files changed, 36 insertions(+), 23 deletions(-)

(limited to 'block')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 1efabf829c53..b03d88013e1e 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -88,7 +88,7 @@ unsigned blk_ordered_req_seq(struct request *rq)
 		return QUEUE_ORDSEQ_DONE;
 }
 
-void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
+bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
 {
 	struct request *rq;
 
@@ -99,7 +99,7 @@ void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
 	q->ordseq |= seq;
 
 	if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
-		return;
+		return false;
 
 	/*
 	 * Okay, sequence complete.
@@ -109,6 +109,8 @@ void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
 
 	if (__blk_end_request(rq, q->orderr, blk_rq_bytes(rq)))
 		BUG();
+
+	return true;
 }
 
 static void pre_flush_end_io(struct request *rq, int error)
@@ -151,9 +153,11 @@ static void queue_flush(struct request_queue *q, unsigned which)
 	elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
 }
 
-static inline struct request *start_ordered(struct request_queue *q,
-					    struct request *rq)
+static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 {
+	struct request *rq = *rqp;
+	unsigned skip = 0;
+
 	q->orderr = 0;
 	q->ordered = q->next_ordered;
 	q->ordseq |= QUEUE_ORDSEQ_STARTED;
@@ -177,7 +181,7 @@ static inline struct request *start_ordered(struct request_queue *q,
 		queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
 		rq = &q->post_flush_rq;
 	} else
-		q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH;
+		skip |= QUEUE_ORDSEQ_POSTFLUSH;
 
 	if (q->ordered & QUEUE_ORDERED_DO_BAR) {
 		rq = &q->bar_rq;
@@ -193,35 +197,40 @@ static inline struct request *start_ordered(struct request_queue *q,
 
 		elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
 	} else
-		q->ordseq |= QUEUE_ORDSEQ_BAR;
+		skip |= QUEUE_ORDSEQ_BAR;
 
 	if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) {
 		queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH);
 		rq = &q->pre_flush_rq;
 	} else
-		q->ordseq |= QUEUE_ORDSEQ_PREFLUSH;
+		skip |= QUEUE_ORDSEQ_PREFLUSH;
 
 	if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && q->in_flight)
 		rq = NULL;
 	else
-		q->ordseq |= QUEUE_ORDSEQ_DRAIN;
+		skip |= QUEUE_ORDSEQ_DRAIN;
+
+	*rqp = rq;
 
-	return rq;
+	/*
+	 * Complete skipped sequences.  If whole sequence is complete,
+	 * return false to tell elevator that this request is gone.
+	 */
+	return !blk_ordered_complete_seq(q, skip, 0);
 }
 
-int blk_do_ordered(struct request_queue *q, struct request **rqp)
+bool blk_do_ordered(struct request_queue *q, struct request **rqp)
 {
 	struct request *rq = *rqp;
 	const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
 
 	if (!q->ordseq) {
 		if (!is_barrier)
-			return 1;
+			return true;
 
-		if (q->next_ordered != QUEUE_ORDERED_NONE) {
-			*rqp = start_ordered(q, rq);
-			return 1;
-		} else {
+		if (q->next_ordered != QUEUE_ORDERED_NONE)
+			return start_ordered(q, rqp);
+		else {
 			/*
 			 * Queue ordering not supported.  Terminate
 			 * with prejudice.
@@ -231,7 +240,7 @@ int blk_do_ordered(struct request_queue *q, struct request **rqp)
 					      blk_rq_bytes(rq)))
 				BUG();
 			*rqp = NULL;
-			return 0;
+			return false;
 		}
 	}
 
@@ -242,7 +251,7 @@ int blk_do_ordered(struct request_queue *q, struct request **rqp)
 	/* Special requests are not subject to ordering rules. */
 	if (!blk_fs_request(rq) &&
 	    rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
-		return 1;
+		return true;
 
 	if (q->ordered & QUEUE_ORDERED_BY_TAG) {
 		/* Ordered by tag.  Blocking the next barrier is enough. */
@@ -255,7 +264,7 @@ int blk_do_ordered(struct request_queue *q, struct request **rqp)
 			*rqp = NULL;
 	}
 
-	return 1;
+	return true;
 }
 
 static void bio_end_empty_barrier(struct bio *bio, int err)
diff --git a/block/elevator.c b/block/elevator.c
index 86836dd179c0..261ffaaf47bd 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -944,10 +944,14 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 	 * drained for flush sequence.
 	 */
 	if (unlikely(q->ordseq)) {
-		struct request *first_rq = list_entry_rq(q->queue_head.next);
-		if (q->in_flight == 0 &&
+		struct request *next = NULL;
+
+		if (!list_empty(&q->queue_head))
+			next = list_entry_rq(q->queue_head.next);
+
+		if (!q->in_flight &&
 		    blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
-		    blk_ordered_req_seq(first_rq) > QUEUE_ORDSEQ_DRAIN) {
+		    (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) {
 			blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
 			blk_start_queueing(q);
 		}
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b044267009ed..3c7078e0129d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -866,10 +866,10 @@ extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *);
-extern int blk_do_ordered(struct request_queue *, struct request **);
+extern bool blk_do_ordered(struct request_queue *, struct request **);
 extern unsigned blk_ordered_cur_seq(struct request_queue *);
 extern unsigned blk_ordered_req_seq(struct request *);
-extern void blk_ordered_complete_seq(struct request_queue *, unsigned, int);
+extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int);
 
 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
-- 
cgit v1.2.3


From 58eea927d2de43dc6f03d1ba2c46e55854b31540 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 28 Nov 2008 13:32:06 +0900
Subject: block: simplify empty barrier implementation

Empty barrier required special handling in __elv_next_request() to
complete it without letting the low level driver see it.

With previous changes, barrier code is now flexible enough to skip the
BAR step using the same barrier sequence selection mechanism.  Drop
the special handling and mask off q->ordered from start_ordered().

Remove blk_empty_barrier() test which now has no user.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c    | 16 ++++++++++------
 block/elevator.c       |  8 --------
 include/linux/blkdev.h |  1 -
 3 files changed, 10 insertions(+), 15 deletions(-)

(limited to 'block')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index b03d88013e1e..c63044e9c4c0 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -162,6 +162,14 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 	q->ordered = q->next_ordered;
 	q->ordseq |= QUEUE_ORDSEQ_STARTED;
 
+	/*
+	 * For an empty barrier, there's no actual BAR request, which
+	 * in turn makes POSTFLUSH unnecessary.  Mask them off.
+	 */
+	if (!rq->hard_nr_sectors)
+		q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
+				QUEUE_ORDERED_DO_POSTFLUSH);
+
 	/* stash away the original request */
 	elv_dequeue_request(q, rq);
 	q->orig_bar_rq = rq;
@@ -171,13 +179,9 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 	 * Queue ordered sequence.  As we stack them at the head, we
 	 * need to queue in reverse order.  Note that we rely on that
 	 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
-	 * request gets inbetween ordered sequence. If this request is
-	 * an empty barrier, we don't need to do a postflush ever since
-	 * there will be no data written between the pre and post flush.
-	 * Hence a single flush will suffice.
+	 * request gets inbetween ordered sequence.
 	 */
-	if ((q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) &&
-	    !blk_empty_barrier(q->orig_bar_rq)) {
+	if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) {
 		queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
 		rq = &q->post_flush_rq;
 	} else
diff --git a/block/elevator.c b/block/elevator.c
index 261ffaaf47bd..ff60177a3bab 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -755,14 +755,6 @@ struct request *elv_next_request(struct request_queue *q)
 	int ret;
 
 	while ((rq = __elv_next_request(q)) != NULL) {
-		/*
-		 * Kill the empty barrier place holder, the driver must
-		 * not ever see it.
-		 */
-		if (blk_empty_barrier(rq)) {
-			__blk_end_request(rq, 0, blk_rq_bytes(rq));
-			continue;
-		}
 		if (!(rq->cmd_flags & REQ_STARTED)) {
 			/*
 			 * This is the first time the device driver
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3c7078e0129d..41bbadfd17f6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -596,7 +596,6 @@ enum {
 #define blk_fua_rq(rq)		((rq)->cmd_flags & REQ_FUA)
 #define blk_discard_rq(rq)	((rq)->cmd_flags & REQ_DISCARD)
 #define blk_bidi_rq(rq)		((rq)->next_rq != NULL)
-#define blk_empty_barrier(rq)	(blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors)
 /* rq->queuelist of dequeued request must be list_empty() */
 #define blk_queued_rq(rq)	(!list_empty(&(rq)->queuelist))
 
-- 
cgit v1.2.3


From a185eb4bc84155fff35b602ce99602c010de9634 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 28 Nov 2008 13:32:07 +0900
Subject: block: fix empty barrier on write-through w/ ordered tag

Empty barrier on write-through (or no cache) w/ ordered tag has no
command to execute, and without any command to execute the ordered tag
is never issued to the device and the ordering is never achieved.  Force
draining for such cases.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index c63044e9c4c0..8eba4e43bb0c 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -166,9 +166,21 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 	 * For an empty barrier, there's no actual BAR request, which
 	 * in turn makes POSTFLUSH unnecessary.  Mask them off.
 	 */
-	if (!rq->hard_nr_sectors)
+	if (!rq->hard_nr_sectors) {
 		q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
 				QUEUE_ORDERED_DO_POSTFLUSH);
+		/*
+		 * Empty barrier on a write-through device w/ ordered
+		 * tag has no command to issue and without any command
+		 * to issue, ordering by tag can't be used.  Drain
+		 * instead.
+		 */
+		if ((q->ordered & QUEUE_ORDERED_BY_TAG) &&
+		    !(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) {
+			q->ordered &= ~QUEUE_ORDERED_BY_TAG;
+			q->ordered |= QUEUE_ORDERED_BY_DRAIN;
+		}
+	}
 
 	/* stash away the original request */
 	elv_dequeue_request(q, rq);
-- 
cgit v1.2.3


From a31a97381cdf7dceb03b797a8faf9bc8a01c65d1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 17 Oct 2008 13:58:29 +0200
Subject: block: don't use plugging on SSD devices

We just want to hand the first bits of IO to the device as fast
as possible. Gains a few percent on the IOPS rate.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index b1fd4f5f07d3..a824e49c0d0a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -268,8 +268,7 @@ void __generic_unplug_device(struct request_queue *q)
 {
 	if (unlikely(blk_queue_stopped(q)))
 		return;
-
-	if (!blk_remove_plug(q))
+	if (!blk_remove_plug(q) && !blk_queue_nonrot(q))
 		return;
 
 	q->request_fn(q);
@@ -1241,11 +1240,11 @@ get_rq:
 	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
 	    bio_flagged(bio, BIO_CPU_AFFINE))
 		req->cpu = blk_cpu_to_group(smp_processor_id());
-	if (elv_queue_empty(q))
+	if (!blk_queue_nonrot(q) && elv_queue_empty(q))
 		blk_plug_device(q);
 	add_request(q, req);
 out:
-	if (sync)
+	if (sync || blk_queue_nonrot(q))
 		__generic_unplug_device(q);
 	spin_unlock_irq(q->queue_lock);
 	return 0;
-- 
cgit v1.2.3


From b374d18a4bfce705e4a99ae9f501b53e86ecb283 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 31 Oct 2008 10:05:07 +0100
Subject: block: get rid of elevator_t typedef

Just use struct elevator_queue everywhere instead.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/as-iosched.c       |  8 +++----
 block/cfq-iosched.c      |  6 +++---
 block/deadline-iosched.c |  6 +++---
 block/elevator.c         | 55 +++++++++++++++++++++++++-----------------------
 block/noop-iosched.c     |  2 +-
 drivers/block/nbd.c      |  2 +-
 include/linux/blkdev.h   |  3 +--
 include/linux/elevator.h |  8 +++----
 8 files changed, 46 insertions(+), 44 deletions(-)

(limited to 'block')

diff --git a/block/as-iosched.c b/block/as-iosched.c
index 802b5d0d8536..631f6f44460a 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -1339,7 +1339,7 @@ static int as_may_queue(struct request_queue *q, int rw)
 	return ret;
 }
 
-static void as_exit_queue(elevator_t *e)
+static void as_exit_queue(struct elevator_queue *e)
 {
 	struct as_data *ad = e->elevator_data;
 
@@ -1409,7 +1409,7 @@ as_var_store(unsigned long *var, const char *page, size_t count)
 	return count;
 }
 
-static ssize_t est_time_show(elevator_t *e, char *page)
+static ssize_t est_time_show(struct elevator_queue *e, char *page)
 {
 	struct as_data *ad = e->elevator_data;
 	int pos = 0;
@@ -1427,7 +1427,7 @@ static ssize_t est_time_show(elevator_t *e, char *page)
 }
 
 #define SHOW_FUNCTION(__FUNC, __VAR)				\
-static ssize_t __FUNC(elevator_t *e, char *page)		\
+static ssize_t __FUNC(struct elevator_queue *e, char *page)	\
 {								\
 	struct as_data *ad = e->elevator_data;			\
 	return as_var_show(jiffies_to_msecs((__VAR)), (page));	\
@@ -1440,7 +1440,7 @@ SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[REQ_ASYNC]);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)				\
-static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
 {									\
 	struct as_data *ad = e->elevator_data;				\
 	int ret = as_var_store(__PTR, (page), count);			\
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index a2bfec7d6b36..adaf93a9d19d 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2178,7 +2178,7 @@ static void cfq_put_async_queues(struct cfq_data *cfqd)
 		cfq_put_queue(cfqd->async_idle_cfqq);
 }
 
-static void cfq_exit_queue(elevator_t *e)
+static void cfq_exit_queue(struct elevator_queue *e)
 {
 	struct cfq_data *cfqd = e->elevator_data;
 	struct request_queue *q = cfqd->queue;
@@ -2288,7 +2288,7 @@ cfq_var_store(unsigned int *var, const char *page, size_t count)
 }
 
 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
-static ssize_t __FUNC(elevator_t *e, char *page)			\
+static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
 {									\
 	struct cfq_data *cfqd = e->elevator_data;			\
 	unsigned int __data = __VAR;					\
@@ -2308,7 +2308,7 @@ SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
-static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
 {									\
 	struct cfq_data *cfqd = e->elevator_data;			\
 	unsigned int __data;						\
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index fd311179f44c..c4d991d4adef 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -334,7 +334,7 @@ static int deadline_queue_empty(struct request_queue *q)
 		&& list_empty(&dd->fifo_list[READ]);
 }
 
-static void deadline_exit_queue(elevator_t *e)
+static void deadline_exit_queue(struct elevator_queue *e)
 {
 	struct deadline_data *dd = e->elevator_data;
 
@@ -387,7 +387,7 @@ deadline_var_store(int *var, const char *page, size_t count)
 }
 
 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
-static ssize_t __FUNC(elevator_t *e, char *page)			\
+static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
 {									\
 	struct deadline_data *dd = e->elevator_data;			\
 	int __data = __VAR;						\
@@ -403,7 +403,7 @@ SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
-static ssize_t __FUNC(elevator_t *e, const char *page, size_t count)	\
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
 {									\
 	struct deadline_data *dd = e->elevator_data;			\
 	int __data;							\
diff --git a/block/elevator.c b/block/elevator.c
index ff60177a3bab..98259eda0ef6 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -65,7 +65,7 @@ DEFINE_TRACE(block_rq_issue);
 static int elv_iosched_allow_merge(struct request *rq, struct bio *bio)
 {
 	struct request_queue *q = rq->q;
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (e->ops->elevator_allow_merge_fn)
 		return e->ops->elevator_allow_merge_fn(q, rq, bio);
@@ -208,13 +208,13 @@ __setup("elevator=", elevator_setup);
 
 static struct kobj_type elv_ktype;
 
-static elevator_t *elevator_alloc(struct request_queue *q,
+static struct elevator_queue *elevator_alloc(struct request_queue *q,
 				  struct elevator_type *e)
 {
-	elevator_t *eq;
+	struct elevator_queue *eq;
 	int i;
 
-	eq = kmalloc_node(sizeof(elevator_t), GFP_KERNEL | __GFP_ZERO, q->node);
+	eq = kmalloc_node(sizeof(*eq), GFP_KERNEL | __GFP_ZERO, q->node);
 	if (unlikely(!eq))
 		goto err;
 
@@ -240,8 +240,9 @@ err:
 
 static void elevator_release(struct kobject *kobj)
 {
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
+	struct elevator_queue *e;
 
+	e = container_of(kobj, struct elevator_queue, kobj);
 	elevator_put(e->elevator_type);
 	kfree(e->hash);
 	kfree(e);
@@ -297,7 +298,7 @@ int elevator_init(struct request_queue *q, char *name)
 }
 EXPORT_SYMBOL(elevator_init);
 
-void elevator_exit(elevator_t *e)
+void elevator_exit(struct elevator_queue *e)
 {
 	mutex_lock(&e->sysfs_lock);
 	if (e->ops->elevator_exit_fn)
@@ -311,7 +312,7 @@ EXPORT_SYMBOL(elevator_exit);
 
 static void elv_activate_rq(struct request_queue *q, struct request *rq)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (e->ops->elevator_activate_req_fn)
 		e->ops->elevator_activate_req_fn(q, rq);
@@ -319,7 +320,7 @@ static void elv_activate_rq(struct request_queue *q, struct request *rq)
 
 static void elv_deactivate_rq(struct request_queue *q, struct request *rq)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (e->ops->elevator_deactivate_req_fn)
 		e->ops->elevator_deactivate_req_fn(q, rq);
@@ -338,7 +339,7 @@ static void elv_rqhash_del(struct request_queue *q, struct request *rq)
 
 static void elv_rqhash_add(struct request_queue *q, struct request *rq)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	BUG_ON(ELV_ON_HASH(rq));
 	hlist_add_head(&rq->hash, &e->hash[ELV_HASH_FN(rq_hash_key(rq))]);
@@ -352,7 +353,7 @@ static void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
 
 static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 	struct hlist_head *hash_list = &e->hash[ELV_HASH_FN(offset)];
 	struct hlist_node *entry, *next;
 	struct request *rq;
@@ -494,7 +495,7 @@ EXPORT_SYMBOL(elv_dispatch_add_tail);
 
 int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 	struct request *__rq;
 	int ret;
 
@@ -529,7 +530,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
 
 void elv_merged_request(struct request_queue *q, struct request *rq, int type)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (e->ops->elevator_merged_fn)
 		e->ops->elevator_merged_fn(q, rq, type);
@@ -543,7 +544,7 @@ void elv_merged_request(struct request_queue *q, struct request *rq, int type)
 void elv_merge_requests(struct request_queue *q, struct request *rq,
 			     struct request *next)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (e->ops->elevator_merge_req_fn)
 		e->ops->elevator_merge_req_fn(q, rq, next);
@@ -846,7 +847,7 @@ void elv_dequeue_request(struct request_queue *q, struct request *rq)
 
 int elv_queue_empty(struct request_queue *q)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (!list_empty(&q->queue_head))
 		return 0;
@@ -860,7 +861,7 @@ EXPORT_SYMBOL(elv_queue_empty);
 
 struct request *elv_latter_request(struct request_queue *q, struct request *rq)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (e->ops->elevator_latter_req_fn)
 		return e->ops->elevator_latter_req_fn(q, rq);
@@ -869,7 +870,7 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq)
 
 struct request *elv_former_request(struct request_queue *q, struct request *rq)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (e->ops->elevator_former_req_fn)
 		return e->ops->elevator_former_req_fn(q, rq);
@@ -878,7 +879,7 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
 
 int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (e->ops->elevator_set_req_fn)
 		return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
@@ -889,7 +890,7 @@ int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
 
 void elv_put_request(struct request_queue *q, struct request *rq)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (e->ops->elevator_put_req_fn)
 		e->ops->elevator_put_req_fn(rq);
@@ -897,7 +898,7 @@ void elv_put_request(struct request_queue *q, struct request *rq)
 
 int elv_may_queue(struct request_queue *q, int rw)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	if (e->ops->elevator_may_queue_fn)
 		return e->ops->elevator_may_queue_fn(q, rw);
@@ -920,7 +921,7 @@ EXPORT_SYMBOL(elv_abort_queue);
 
 void elv_completed_request(struct request_queue *q, struct request *rq)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 
 	/*
 	 * request is released from the driver, io must be done
@@ -955,13 +956,14 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 static ssize_t
 elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 {
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
 	struct elv_fs_entry *entry = to_elv(attr);
+	struct elevator_queue *e;
 	ssize_t error;
 
 	if (!entry->show)
 		return -EIO;
 
+	e = container_of(kobj, struct elevator_queue, kobj);
 	mutex_lock(&e->sysfs_lock);
 	error = e->ops ? entry->show(e, page) : -ENOENT;
 	mutex_unlock(&e->sysfs_lock);
@@ -972,13 +974,14 @@ static ssize_t
 elv_attr_store(struct kobject *kobj, struct attribute *attr,
 	       const char *page, size_t length)
 {
-	elevator_t *e = container_of(kobj, elevator_t, kobj);
 	struct elv_fs_entry *entry = to_elv(attr);
+	struct elevator_queue *e;
 	ssize_t error;
 
 	if (!entry->store)
 		return -EIO;
 
+	e = container_of(kobj, struct elevator_queue, kobj);
 	mutex_lock(&e->sysfs_lock);
 	error = e->ops ? entry->store(e, page, length) : -ENOENT;
 	mutex_unlock(&e->sysfs_lock);
@@ -997,7 +1000,7 @@ static struct kobj_type elv_ktype = {
 
 int elv_register_queue(struct request_queue *q)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 	int error;
 
 	error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
@@ -1015,7 +1018,7 @@ int elv_register_queue(struct request_queue *q)
 	return error;
 }
 
-static void __elv_unregister_queue(elevator_t *e)
+static void __elv_unregister_queue(struct elevator_queue *e)
 {
 	kobject_uevent(&e->kobj, KOBJ_REMOVE);
 	kobject_del(&e->kobj);
@@ -1078,7 +1081,7 @@ EXPORT_SYMBOL_GPL(elv_unregister);
  */
 static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 {
-	elevator_t *old_elevator, *e;
+	struct elevator_queue *old_elevator, *e;
 	void *data;
 
 	/*
@@ -1184,7 +1187,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 
 ssize_t elv_iosched_show(struct request_queue *q, char *name)
 {
-	elevator_t *e = q->elevator;
+	struct elevator_queue *e = q->elevator;
 	struct elevator_type *elv = e->elevator_type;
 	struct elevator_type *__e;
 	int len = 0;
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index c23e02969650..3a0d369d08c7 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -76,7 +76,7 @@ static void *noop_init_queue(struct request_queue *q)
 	return nd;
 }
 
-static void noop_exit_queue(elevator_t *e)
+static void noop_exit_queue(struct elevator_queue *e)
 {
 	struct noop_data *nd = e->elevator_data;
 
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index d3a91cacee8c..0766ce6187a9 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -722,7 +722,7 @@ static int __init nbd_init(void)
 
 	for (i = 0; i < nbds_max; i++) {
 		struct gendisk *disk = alloc_disk(1 << part_shift);
-		elevator_t *old_e;
+		struct elevator_queue *old_e;
 		if (!disk)
 			goto out;
 		nbd_dev[i].disk = disk;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 41bbadfd17f6..7035cec583b6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -26,7 +26,6 @@ struct scsi_ioctl_command;
 
 struct request_queue;
 struct elevator_queue;
-typedef struct elevator_queue elevator_t;
 struct request_pm_state;
 struct blk_trace;
 struct request;
@@ -313,7 +312,7 @@ struct request_queue
 	 */
 	struct list_head	queue_head;
 	struct request		*last_merge;
-	elevator_t		*elevator;
+	struct elevator_queue	*elevator;
 
 	/*
 	 * the queue request freelist, one for reads and one for writes
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 92f6f634e3e6..7a204256b155 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -28,7 +28,7 @@ typedef void (elevator_activate_req_fn) (struct request_queue *, struct request
 typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *);
 
 typedef void *(elevator_init_fn) (struct request_queue *);
-typedef void (elevator_exit_fn) (elevator_t *);
+typedef void (elevator_exit_fn) (struct elevator_queue *);
 
 struct elevator_ops
 {
@@ -62,8 +62,8 @@ struct elevator_ops
 
 struct elv_fs_entry {
 	struct attribute attr;
-	ssize_t (*show)(elevator_t *, char *);
-	ssize_t (*store)(elevator_t *, const char *, size_t);
+	ssize_t (*show)(struct elevator_queue *, char *);
+	ssize_t (*store)(struct elevator_queue *, const char *, size_t);
 };
 
 /*
@@ -130,7 +130,7 @@ extern ssize_t elv_iosched_show(struct request_queue *, char *);
 extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t);
 
 extern int elevator_init(struct request_queue *, char *);
-extern void elevator_exit(elevator_t *);
+extern void elevator_exit(struct elevator_queue *);
 extern int elv_rq_merge_ok(struct request *, struct bio *);
 
 /*
-- 
cgit v1.2.3


From 30e0dc28bff9dc456cdfc2aae4aca78b8b1a1cec Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 20 Oct 2008 15:44:28 +0200
Subject: cfq-iosched: remove limit of dispatch depth of max 4 times quantum

This basically limits the hardware queue depth to 4*quantum at any
point in time, which is 16 with the default settings. As CFQ uses
other means to shrink the hardware queue when necessary in the first
place, there's really no need for this extra heuristic. Additionally,
it ends up hurting performance in some cases.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index adaf93a9d19d..ee8a90c7c46c 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1136,12 +1136,8 @@ static int cfq_dispatch_requests(struct request_queue *q, int force)
 		if (cfq_class_idle(cfqq))
 			max_dispatch = 1;
 
-		if (cfqq->dispatched >= max_dispatch) {
-			if (cfqd->busy_queues > 1)
-				break;
-			if (cfqq->dispatched >= 4 * max_dispatch)
-				break;
-		}
+		if (cfqq->dispatched >= max_dispatch && cfqd->busy_queues > 1)
+			break;
 
 		if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq))
 			break;
-- 
cgit v1.2.3


From a6f23657d3072bde6844055bbc2290e497f33fbc Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 24 Oct 2008 12:52:42 +0200
Subject: block: add one-hit cache for disk partition lookup

disk_map_sector_rcu() returns a partition from a sector offset,
which we use for IO statistics on a per-partition basis. The
lookup itself is an O(N) list lookup, where N is the number of
partitions. This actually hurts performance quite a bit, even
on the lower end partitions. On higher numbered partitions,
it can get pretty bad.

Solve this by adding a one-hit cache for partition lookup.
This makes the lookup O(1) for the case where we do most IO to
one partition. Even for mixed partition workloads, amortized cost
is pretty close to O(1) since the natural IO batching makes the
one-hit cache last for lots of IOs.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c         | 23 +++++++++++++++++++----
 include/linux/genhd.h |  1 +
 2 files changed, 20 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 2f7feda61e35..d84a7df1e2a0 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -181,6 +181,12 @@ void disk_part_iter_exit(struct disk_part_iter *piter)
 }
 EXPORT_SYMBOL_GPL(disk_part_iter_exit);
 
+static inline int sector_in_part(struct hd_struct *part, sector_t sector)
+{
+	return part->start_sect <= sector &&
+		sector < part->start_sect + part->nr_sects;
+}
+
 /**
  * disk_map_sector_rcu - map sector to partition
  * @disk: gendisk of interest
@@ -199,16 +205,22 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit);
 struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
 {
 	struct disk_part_tbl *ptbl;
+	struct hd_struct *part;
 	int i;
 
 	ptbl = rcu_dereference(disk->part_tbl);
 
+	part = rcu_dereference(ptbl->last_lookup);
+	if (part && sector_in_part(part, sector))
+		return part;
+
 	for (i = 1; i < ptbl->len; i++) {
-		struct hd_struct *part = rcu_dereference(ptbl->part[i]);
+		part = rcu_dereference(ptbl->part[i]);
 
-		if (part && part->start_sect <= sector &&
-		    sector < part->start_sect + part->nr_sects)
+		if (part && sector_in_part(part, sector)) {
+			rcu_assign_pointer(ptbl->last_lookup, part);
 			return part;
+		}
 	}
 	return &disk->part0;
 }
@@ -888,8 +900,11 @@ static void disk_replace_part_tbl(struct gendisk *disk,
 	struct disk_part_tbl *old_ptbl = disk->part_tbl;
 
 	rcu_assign_pointer(disk->part_tbl, new_ptbl);
-	if (old_ptbl)
+
+	if (old_ptbl) {
+		rcu_assign_pointer(old_ptbl->last_lookup, NULL);
 		call_rcu(&old_ptbl->rcu_head, disk_free_ptbl_rcu_cb);
+	}
 }
 
 /**
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 3df7742ce246..16948eaecae3 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -126,6 +126,7 @@ struct blk_scsi_cmd_filter {
 struct disk_part_tbl {
 	struct rcu_head rcu_head;
 	int len;
+	struct hd_struct *last_lookup;
 	struct hd_struct *part[];
 };
 
-- 
cgit v1.2.3


From 18af8b2ca34b831c32c6fa01e7ce33143c33bb63 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Thu, 4 Dec 2008 08:56:35 +0100
Subject: block: use min_not_zero in blk_queue_stack_limits

zero is invalid for max_phys_segments, max_hw_segments, and
max_segment_size. It's better to use min_not_zero instead of
min. min() works though (because the commit 0e435ac makes sure that
these values are set to the default values, non zero, if a queue is
initialized properly).

With this patch, blk_queue_stack_limits does almost the same thing
that dm's combine_restrictions_low() does. I think that it's easy to
remove dm's combine_restrictions_low.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-settings.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index afa55e14e278..59fd05d9f1d5 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -319,9 +319,9 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
 	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
 	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask);
 
-	t->max_phys_segments = min(t->max_phys_segments, b->max_phys_segments);
-	t->max_hw_segments = min(t->max_hw_segments, b->max_hw_segments);
-	t->max_segment_size = min(t->max_segment_size, b->max_segment_size);
+	t->max_phys_segments = min_not_zero(t->max_phys_segments, b->max_phys_segments);
+	t->max_hw_segments = min_not_zero(t->max_hw_segments, b->max_hw_segments);
+	t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size);
 	t->hardsect_size = max(t->hardsect_size, b->hardsect_size);
 	if (!t->queue_lock)
 		WARN_ON_ONCE(1);
-- 
cgit v1.2.3


From 3c18ce71af754cefae75103dbae28817e04b2db4 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Wed, 10 Dec 2008 15:47:33 +0100
Subject: block: make blk_softirq_init() static

Sparse asked whether these could be static.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-softirq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index e660d26ca656..ce0efc6b26dc 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -161,7 +161,7 @@ void blk_complete_request(struct request *req)
 }
 EXPORT_SYMBOL(blk_complete_request);
 
-__init int blk_softirq_init(void)
+static __init int blk_softirq_init(void)
 {
 	int i;
 
-- 
cgit v1.2.3


From b3a6ffe16b5cc48abe7db8d04882dc45280eb693 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 12 Dec 2008 09:51:16 +0100
Subject: Get rid of CONFIG_LSF

We have two separate config entries for large devices/files. One
is CONFIG_LBD that guards just the devices, the other is CONFIG_LSF
that handles large files. This doesn't make a lot of sense, you typically
want both or none. So get rid of CONFIG_LSF and change CONFIG_LBD wording
to indicate that it covers both.

Acked-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/Kconfig         | 23 +++++------------------
 fs/ext4/super.c       |  8 ++++----
 include/linux/types.h | 11 +++--------
 3 files changed, 12 insertions(+), 30 deletions(-)

(limited to 'block')

diff --git a/block/Kconfig b/block/Kconfig
index 290b219fad9c..ac0956f77785 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -24,21 +24,17 @@ menuconfig BLOCK
 if BLOCK
 
 config LBD
-	bool "Support for Large Block Devices"
+	bool "Support for large block devices and files"
 	depends on !64BIT
 	help
-	  Enable block devices of size 2TB and larger.
+	  Enable block devices or files of size 2TB and larger.
 
 	  This option is required to support the full capacity of large
 	  (2TB+) block devices, including RAID, disk, Network Block Device,
 	  Logical Volume Manager (LVM) and loopback.
-
-	  For example, RAID devices are frequently bigger than the capacity
-	  of the largest individual hard drive.
-
-	  This option is not required if you have individual disk drives
-	  which total 2TB+ and you are not aggregating the capacity into
-	  a large block device (e.g. using RAID or LVM).
+	
+	  This option also enables support for single files larger than
+	  2TB.
 
 	  If unsure, say N.
 
@@ -58,15 +54,6 @@ config BLK_DEV_IO_TRACE
 
 	  If unsure, say N.
 
-config LSF
-	bool "Support for Large Single Files"
-	depends on !64BIT
-	help
-	  Say Y here if you want to be able to handle very large files (2TB
-	  and larger), otherwise say N.
-
-	  If unsure, say Y.
-
 config BLK_DEV_BSG
 	bool "Block layer SG support v4 (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e4a241c65dbe..04158ad74dbb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1721,7 +1721,7 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
 	/* small i_blocks in vfs inode? */
 	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
 		/*
-		 * CONFIG_LSF is not enabled implies the inode
+		 * CONFIG_LBD is not enabled implies the inode
 		 * i_block represent total blocks in 512 bytes
 		 * 32 == size of vfs inode i_blocks * 8
 		 */
@@ -1764,7 +1764,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
 
 	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
 		/*
-		 * !has_huge_files or CONFIG_LSF is not enabled
+		 * !has_huge_files or CONFIG_LBD is not enabled
 		 * implies the inode i_block represent total blocks in
 		 * 512 bytes 32 == size of vfs inode i_blocks * 8
 		 */
@@ -2021,13 +2021,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (has_huge_files) {
 		/*
 		 * Large file size enabled file system can only be
-		 * mount if kernel is build with CONFIG_LSF
+		 * mount if kernel is build with CONFIG_LBD
 		 */
 		if (sizeof(root->i_blocks) < sizeof(u64) &&
 				!(sb->s_flags & MS_RDONLY)) {
 			printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
 					"files cannot be mounted read-write "
-					"without CONFIG_LSF.\n", sb->s_id);
+					"without CONFIG_LBD.\n", sb->s_id);
 			goto failed_mount;
 		}
 	}
diff --git a/include/linux/types.h b/include/linux/types.h
index 1d98330b1f2c..121f349cb7ec 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -135,19 +135,14 @@ typedef		__s64		int64_t;
  *
  * Linux always considers sectors to be 512 bytes long independently
  * of the devices real block size.
+ *
+ * blkcnt_t is the type of the inode's block count.
  */
 #ifdef CONFIG_LBD
 typedef u64 sector_t;
-#else
-typedef unsigned long sector_t;
-#endif
-
-/*
- * The type of the inode's block count.
- */
-#ifdef CONFIG_LSF
 typedef u64 blkcnt_t;
 #else
+typedef unsigned long sector_t;
 typedef unsigned long blkcnt_t;
 #endif
 
-- 
cgit v1.2.3


From 62c1fe9d9f0a676fce89185b1513f0e5f473c72c Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 15 Dec 2008 21:19:25 +0100
Subject: cfq-iosched: fix race between exiting queue and exiting task

Original patch from Nikanth Karthikesan <knikanth@suse.de>

When a queue exits the queue lock is taken and cfq_exit_queue() would free all
the cic's associated with the queue.

But when a task exits, cfq_exit_io_context() gets cic one by one and then
locks the associated queue to call __cfq_exit_single_io_context. It looks like
between getting a cic from the ioc and locking the queue, the queue might have
exited on another cpu.

Fix this by rechecking the cfq_io_context queue key inside the queue lock
again, and not calling into __cfq_exit_single_io_context() if somebody
beat us to it.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ee8a90c7c46c..e8525fa72823 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1314,7 +1314,15 @@ static void cfq_exit_single_io_context(struct io_context *ioc,
 		unsigned long flags;
 
 		spin_lock_irqsave(q->queue_lock, flags);
-		__cfq_exit_single_io_context(cfqd, cic);
+
+		/*
+		 * Ensure we get a fresh copy of the ->key to prevent
+		 * race between exiting task and queue
+		 */
+		smp_read_barrier_depends();
+		if (cic->key)
+			__cfq_exit_single_io_context(cfqd, cic);
+
 		spin_unlock_irqrestore(q->queue_lock, flags);
 	}
 }
-- 
cgit v1.2.3