From 1fa8455deb92e9ec7756df23030e73b2d28eeca7 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Sun, 10 Nov 2013 21:55:27 -0800
Subject: bcache: Fix dirty_data accounting

Dirty data accounting wasn't quite right - firstly, we were adding the key
we're inserting after it could have merged with another dirty key already in
the btree, and secondly we could sometimes pass the wrong offset to
bcache_dev_sectors_dirty_add() for dirty data we were overwriting - which is
important when tracking dirty data by stripe.

Signed-off-by: Kent Overstreet
Cc: linux-stable # >= v3.10
---
 drivers/md/bcache/btree.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index f42fc7ed9cd6..d1734d9d9c79 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1742,6 +1742,9 @@ static bool fix_overlapping_extents(struct btree *b,
 			if (bkey_cmp(insert, k) < 0) {
 				bch_cut_front(insert, k);
 			} else {
+				if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
+					old_offset = KEY_START(insert);
+
 				if (bkey_written(b, k) &&
 				    bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
 					/*
@@ -1803,6 +1806,10 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
 		if (fix_overlapping_extents(b, k, &iter, op))
 			return false;
 
+		if (KEY_DIRTY(k))
+			bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
+						     KEY_START(k), KEY_SIZE(k));
+
 		while (m != end(i) &&
 		       bkey_cmp(k, &START_KEY(m)) > 0)
 			prev = m, m = bkey_next(m);
@@ -1831,10 +1838,6 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op,
 insert:	shift_keys(b, m, k);
 copy:	bkey_copy(m, k);
 merged:
-	if (KEY_DIRTY(k))
-		bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
-					     KEY_START(k), KEY_SIZE(k));
-
 	bch_check_keys(b, "%u for %s", status, op_type(op));
 
 	if (b->level && !KEY_OFFSET(k))
-- 
cgit v1.2.3
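The subtlety behind this fix is easiest to see in isolation. Below is a
freestanding sketch (not bcache source; stripe_sectors_dirty and STRIPE_SIZE
are illustrative stand-ins) of per-stripe dirty accounting: the offset handed
to the accounting function has to be the offset of the sectors actually being
added or overwritten, or the wrong stripe's counter gets credited.

#include <stdint.h>
#include <stdio.h>

#define STRIPE_SIZE 64			/* sectors per stripe, assumed here */

static int stripe_sectors_dirty[16];	/* one counter per stripe */

static void sectors_dirty_add(uint64_t offset, int nr_sectors)
{
	while (nr_sectors) {
		uint64_t stripe = offset / STRIPE_SIZE;
		int in_stripe = STRIPE_SIZE - (offset % STRIPE_SIZE);
		int s = nr_sectors < in_stripe ? nr_sectors : in_stripe;

		stripe_sectors_dirty[stripe] += s;
		offset += s;
		nr_sectors -= s;
	}
}

int main(void)
{
	/* A write spanning stripes 0 and 1; passing a stale offset for
	 * overwritten data would credit the wrong counters. */
	sectors_dirty_add(60, 8);
	printf("stripe 0: %d, stripe 1: %d\n",
	       stripe_sectors_dirty[0], stripe_sectors_dirty[1]);
	return 0;
}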
From 7857d5d470ec53bae187d144c69065ad3c0ebc21 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Tue, 8 Oct 2013 15:50:46 -0700
Subject: bcache: Fix a journalling performance bug
---
 drivers/md/bcache/journal.c | 47 +++++++++++++++++++++++++--------------------
 drivers/md/bcache/journal.h |  3 ++-
 2 files changed, 28 insertions(+), 22 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 8435f81e5d85..7c9e6bf6aaba 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -554,32 +554,26 @@ static void journal_write_endio(struct bio *bio, int error)
 	struct journal_write *w = bio->bi_private;
 
 	cache_set_err_on(error, w->c, "journal io error");
-	closure_put(&w->c->journal.io.cl);
+	closure_put(&w->c->journal.io);
 }
 
 static void journal_write(struct closure *);
 
 static void journal_write_done(struct closure *cl)
 {
-	struct journal *j = container_of(cl, struct journal, io.cl);
-	struct cache_set *c = container_of(j, struct cache_set, journal);
-
+	struct journal *j = container_of(cl, struct journal, io);
 	struct journal_write *w = (j->cur == j->w) ?
 		&j->w[1] : &j->w[0];
 
 	__closure_wake_up(&w->wait);
-
-	if (c->journal_delay_ms)
-		closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms));
-
-	continue_at(cl, journal_write, system_wq);
+	continue_at_nobarrier(cl, journal_write, system_wq);
 }
 
 static void journal_write_unlocked(struct closure *cl)
 	__releases(c->journal.lock)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
+	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
 	struct cache *ca;
 	struct journal_write *w = c->journal.cur;
 	struct bkey *k = &c->journal.key;
@@ -660,7 +654,7 @@ static void journal_write_unlocked(struct closure *cl)
 
 static void journal_write(struct closure *cl)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
+	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
 
 	spin_lock(&c->journal.lock);
 	journal_write_unlocked(cl);
@@ -669,7 +663,10 @@ static void journal_write(struct closure *cl)
 static void __journal_try_write(struct cache_set *c, bool noflush)
 	__releases(c->journal.lock)
 {
-	struct closure *cl = &c->journal.io.cl;
+	struct closure *cl = &c->journal.io;
+	struct journal_write *w = c->journal.cur;
+
+	w->need_write = true;
 
 	if (!closure_trylock(cl, &c->cl))
 		spin_unlock(&c->journal.lock);
@@ -688,18 +685,24 @@ void bch_journal_meta(struct cache_set *c, struct closure *cl)
 
 	if (CACHE_SYNC(&c->sb)) {
 		spin_lock(&c->journal.lock);
-
 		w = c->journal.cur;
-		w->need_write = true;
 
 		if (cl)
 			BUG_ON(!closure_wait(&w->wait, cl));
 
-		closure_flush(&c->journal.io);
 		__journal_try_write(c, true);
 	}
 }
 
+static void journal_write_work(struct work_struct *work)
+{
+	struct cache_set *c = container_of(to_delayed_work(work),
+					   struct cache_set,
+					   journal.work);
+	spin_lock(&c->journal.lock);
+	journal_try_write(c);
+}
+
 /*
  * Entry point to the journalling code - bio_insert() and btree_invalidate()
  * pass bch_journal() a list of keys to be journalled, and then
@@ -739,7 +742,6 @@ void bch_journal(struct closure *cl)
 	}
 
 	w = c->journal.cur;
-	w->need_write = true;
 	b = __set_blocks(w->data, w->data->keys + n, c);
 
 	if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
@@ -755,8 +757,6 @@ void bch_journal(struct closure *cl)
 
 		BUG_ON(!closure_wait(&w->wait, cl));
 
-		closure_flush(&c->journal.io);
-
 		journal_try_write(c);
 		continue_at(cl, bch_journal, bcache_wq);
 	}
@@ -768,11 +768,15 @@ void bch_journal(struct closure *cl)
 	atomic_inc(op->journal);
 
 	if (op->flush_journal) {
-		closure_flush(&c->journal.io);
 		closure_wait(&w->wait, cl->parent);
+		journal_try_write(c);
+	} else if (!w->need_write) {
+		schedule_delayed_work(&c->journal.work,
+				      msecs_to_jiffies(c->journal_delay_ms));
+		spin_unlock(&c->journal.lock);
+	} else {
+		spin_unlock(&c->journal.lock);
 	}
-
-	journal_try_write(c);
 out:
 	bch_btree_insert_async(cl);
 }
@@ -790,6 +794,7 @@ int bch_journal_alloc(struct cache_set *c)
 
 	closure_init_unlocked(&j->io);
 	spin_lock_init(&j->lock);
+	INIT_DELAYED_WORK(&j->work, journal_write_work);
 
 	c->journal_delay_ms = 100;
 
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index 3d7851274b04..3ca93d3d566c 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -140,7 +140,8 @@ struct journal {
 	spinlock_t		lock;
 	/* used when waiting because the journal was full */
 	struct closure_waitlist	wait;
-	struct closure_with_timer io;
+	struct closure		io;
+	struct delayed_work	work;
 
 	/* Number of blocks free in the bucket(s) we're currently writing to */
 	unsigned		blocks_free;
-- 
cgit v1.2.3
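The pattern this patch moves to is the standard delayed-work batching idiom:
the first writer into an open journal entry arms a timer, later writers just
ride along, and the work item flushes the entry when it fires. A minimal
sketch under assumed names (my_journal, my_journal_add); the real code also
clears need_write once the entry is actually written out:

#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include <linux/jiffies.h>

struct my_journal {
	spinlock_t lock;
	bool need_write;		/* a flush is already pending */
	struct delayed_work work;	/* INIT_DELAYED_WORK() at setup */
};

static void my_journal_write_work(struct work_struct *work)
{
	struct my_journal *j = container_of(to_delayed_work(work),
					    struct my_journal, work);

	spin_lock(&j->lock);
	j->need_write = true;
	/* ...kick the actual journal write here, dropping the lock... */
	spin_unlock(&j->lock);
}

static void my_journal_add(struct my_journal *j, unsigned delay_ms)
{
	spin_lock(&j->lock);
	/* ...copy keys into the currently open journal entry... */
	if (!j->need_write)
		/* first writer arms the timer; later writers piggyback */
		schedule_delayed_work(&j->work, msecs_to_jiffies(delay_ms));
	spin_unlock(&j->lock);
}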
From dd9ec84da530d3a92e99ad1f52edae44533cc27f Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Thu, 24 Oct 2013 17:12:52 -0700
Subject: bcache: Fix a lockdep splat

bch_keybuf_del() takes a spinlock that can't be taken in interrupt context -
whoops. Fortunately, this code isn't enabled by default (you have to toggle a
sysfs thing).

Signed-off-by: Kent Overstreet
---
 drivers/md/bcache/movinggc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 1a3b4f4786c3..dd8a035c5ae1 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -113,7 +113,7 @@ static void write_moving(struct closure *cl)
 		bch_insert_data(&s->op.cl);
 	}
 
-	continue_at(cl, write_moving_finish, NULL);
+	continue_at(cl, write_moving_finish, bch_gc_wq);
}
 
 static void read_moving_submit(struct closure *cl)
-- 
cgit v1.2.3
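The bug class in miniature: completion handlers run in (soft)irq context, so
anything they call must not take a plain spinlock that is also held with
interrupts enabled elsewhere - that is exactly what lockdep flags. Passing a
workqueue to continue_at(), as the one-liner above does, defers the cleanup to
process context. A sketch with hypothetical types:

#include <linux/bio.h>
#include <linux/workqueue.h>

/* hypothetical per-io state */
struct my_io {
	struct work_struct finish_work;	/* runs in process context */
};

static void my_endio(struct bio *bio, int error)
{
	struct my_io *io = bio->bi_private;

	/* Taking a non-irq-safe spinlock right here would be the splat;
	 * instead, defer the lock-taking work to a workqueue. */
	schedule_work(&io->finish_work);
}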
From 49b1212dfacfe51f951442563d1617bb06aac575 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Wed, 24 Jul 2013 17:16:09 -0700
Subject: bcache: Use blkdev_issue_discard()

The old asynchronous discard code was really a relic from when all the
allocation code was asynchronous - now that allocation runs out of a dedicated
thread there's no point in keeping around all that complicated machinery.

Signed-off-by: Kent Overstreet
---
 drivers/md/bcache/alloc.c  | 114 +++++----------------------------------------
 drivers/md/bcache/bcache.h |  10 ----
 drivers/md/bcache/super.c  |   4 --
 3 files changed, 11 insertions(+), 117 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index e45f5575fd4d..e033b0203b68 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -63,13 +63,12 @@
 #include "bcache.h"
 #include "btree.h"
 
+#include <linux/blkdev.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/random.h>
 #include <trace/events/bcache.h>
 
-#define MAX_IN_FLIGHT_DISCARDS		8U
-
 /* Bucket heap / gen */
 
 uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
@@ -121,75 +120,6 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
 	mutex_unlock(&c->bucket_lock);
 }
 
-/* Discard/TRIM */
-
-struct discard {
-	struct list_head	list;
-	struct work_struct	work;
-	struct cache		*ca;
-	long			bucket;
-
-	struct bio		bio;
-	struct bio_vec		bv;
-};
-
-static void discard_finish(struct work_struct *w)
-{
-	struct discard *d = container_of(w, struct discard, work);
-	struct cache *ca = d->ca;
-	char buf[BDEVNAME_SIZE];
-
-	if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) {
-		pr_notice("discard error on %s, disabling",
-			 bdevname(ca->bdev, buf));
-		d->ca->discard = 0;
-	}
-
-	mutex_lock(&ca->set->bucket_lock);
-
-	fifo_push(&ca->free, d->bucket);
-	list_add(&d->list, &ca->discards);
-	atomic_dec(&ca->discards_in_flight);
-
-	mutex_unlock(&ca->set->bucket_lock);
-
-	closure_wake_up(&ca->set->bucket_wait);
-	wake_up_process(ca->alloc_thread);
-
-	closure_put(&ca->set->cl);
-}
-
-static void discard_endio(struct bio *bio, int error)
-{
-	struct discard *d = container_of(bio, struct discard, bio);
-	schedule_work(&d->work);
-}
-
-static void do_discard(struct cache *ca, long bucket)
-{
-	struct discard *d = list_first_entry(&ca->discards,
-					     struct discard, list);
-
-	list_del(&d->list);
-	d->bucket = bucket;
-
-	atomic_inc(&ca->discards_in_flight);
-	closure_get(&ca->set->cl);
-
-	bio_init(&d->bio);
-
-	d->bio.bi_sector	= bucket_to_sector(ca->set, d->bucket);
-	d->bio.bi_bdev		= ca->bdev;
-	d->bio.bi_rw		= REQ_WRITE|REQ_DISCARD;
-	d->bio.bi_max_vecs	= 1;
-	d->bio.bi_io_vec	= d->bio.bi_inline_vecs;
-	d->bio.bi_size		= bucket_bytes(ca);
-	d->bio.bi_end_io	= discard_endio;
-
-	bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
-
-	submit_bio(0, &d->bio);
-}
-
 /* Allocation */
 
 static inline bool can_inc_bucket_gen(struct bucket *b)
@@ -398,16 +328,18 @@ static int bch_allocator_thread(void *arg)
 		else
 			break;
 
-		allocator_wait(ca, (int) fifo_free(&ca->free) >
-			       atomic_read(&ca->discards_in_flight));
-
 		if (ca->discard) {
-			allocator_wait(ca, !list_empty(&ca->discards));
-			do_discard(ca, bucket);
-		} else {
-			fifo_push(&ca->free, bucket);
-			closure_wake_up(&ca->set->bucket_wait);
+			mutex_unlock(&ca->set->bucket_lock);
+			blkdev_issue_discard(ca->bdev,
+				bucket_to_sector(ca->set, bucket),
+				ca->sb.block_size, GFP_KERNEL, 0);
+			mutex_lock(&ca->set->bucket_lock);
 		}
+
+		allocator_wait(ca, !fifo_full(&ca->free));
+
+		fifo_push(&ca->free, bucket);
+		closure_wake_up(&ca->set->bucket_wait);
 	}
 
 	/*
@@ -556,22 +488,8 @@ int bch_cache_allocator_start(struct cache *ca)
 	return 0;
 }
 
-void bch_cache_allocator_exit(struct cache *ca)
-{
-	struct discard *d;
-
-	while (!list_empty(&ca->discards)) {
-		d = list_first_entry(&ca->discards, struct discard, list);
-		cancel_work_sync(&d->work);
-		list_del(&d->list);
-		kfree(d);
-	}
-}
-
 int bch_cache_allocator_init(struct cache *ca)
 {
-	unsigned i;
-
 	/*
 	 * Reserve:
 	 * Prio/gen writes first
@@ -589,15 +507,5 @@ int bch_cache_allocator_init(struct cache *ca)
 	ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
 		ca->watermark[WATERMARK_MOVINGGC];
 
-	for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) {
-		struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL);
-		if (!d)
-			return -ENOMEM;
-
-		d->ca = ca;
-		INIT_WORK(&d->work, discard_finish);
-		list_add(&d->list, &ca->discards);
-	}
-
 	return 0;
 }
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 0f12382aa35d..bf37474a6888 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -620,15 +620,6 @@ struct cache {
 
 	bool			discard; /* Get rid of? */
 
-	/*
-	 * We preallocate structs for issuing discards to buckets, and keep them
-	 * on this list when they're not in use; do_discard() issues discards
-	 * whenever there's work to do and is called by free_some_buckets() and
-	 * when a discard finishes.
-	 */
-	atomic_t		discards_in_flight;
-	struct list_head	discards;
-
 	struct journal_device	journal;
 
 	/* The rest of this all shows up in sysfs */
@@ -1222,7 +1213,6 @@ int bch_btree_cache_alloc(struct cache_set *);
 void bch_moving_init_cache_set(struct cache_set *);
 
 int bch_cache_allocator_start(struct cache *ca);
-void bch_cache_allocator_exit(struct cache *ca);
 int bch_cache_allocator_init(struct cache *ca);
 
 void bch_debug_exit(void);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 547c4c57b052..fd37342a7eb5 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1725,8 +1725,6 @@ void bch_cache_release(struct kobject *kobj)
 	if (ca->set)
 		ca->set->cache[ca->sb.nr_this_dev] = NULL;
 
-	bch_cache_allocator_exit(ca);
-
 	bio_split_pool_free(&ca->bio_split_hook);
 
 	free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
@@ -1758,8 +1756,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
 	__module_get(THIS_MODULE);
 	kobject_init(&ca->kobj, &bch_cache_ktype);
 
-	INIT_LIST_HEAD(&ca->discards);
-
 	bio_init(&ca->journal.bio);
 	ca->journal.bio.bi_max_vecs = 8;
 	ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;
-- 
cgit v1.2.3
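The pattern the patch adopts, reduced to its core: because the allocator now
runs in its own kthread, it can simply drop its locks, call
blkdev_issue_discard() synchronously (it blocks until the discard completes),
and retake the locks - no hand-rolled async bios or preallocated state needed.
A minimal sketch; my_cache and its locking are assumptions, not bcache types:

#include <linux/blkdev.h>
#include <linux/mutex.h>

struct my_cache {
	struct mutex lock;
	struct block_device *bdev;
};

static void discard_bucket(struct my_cache *ca, sector_t start, sector_t len)
{
	/* drop the lock first: blkdev_issue_discard() sleeps until done */
	mutex_unlock(&ca->lock);
	blkdev_issue_discard(ca->bdev, start, len, GFP_KERNEL, 0);
	mutex_lock(&ca->lock);
}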
From 77c320eb46e216c17aee5c943949229ccfed6904 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Thu, 11 Jul 2013 19:42:51 -0700
Subject: bcache: Add on error panic/unregister setting

Works kind of like the ext4 setting, to panic or remount read only on errors.

Signed-off-by: Kent Overstreet
---
 drivers/md/bcache/bcache.h  |  6 ++++++
 drivers/md/bcache/journal.c |  7 +++----
 drivers/md/bcache/super.c   |  6 +++++-
 drivers/md/bcache/sysfs.c   | 21 +++++++++++++++++++++
 4 files changed, 35 insertions(+), 5 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index bf37474a6888..578615604be5 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -843,8 +843,14 @@ struct cache_set {
 	atomic_long_t		cache_read_races;
 	atomic_long_t		writeback_keys_done;
 	atomic_long_t		writeback_keys_failed;
+
+	enum			{
+		ON_ERROR_UNREGISTER,
+		ON_ERROR_PANIC,
+	}			on_error;
 	unsigned		error_limit;
 	unsigned		error_decay;
+
 	unsigned short		journal_delay_ms;
 	unsigned		verify:1;
 	unsigned		key_merging_disabled:1;
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 7c9e6bf6aaba..9e8775872dba 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -305,10 +305,9 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
 	list_for_each_entry(i, list, list) {
 		BUG_ON(i->pin && atomic_read(i->pin) != 1);
 
-		if (n != i->j.seq)
-			pr_err(
-		"journal entries %llu-%llu missing! (replaying %llu-%llu)\n",
-		n, i->j.seq - 1, start, end);
+		cache_set_err_on(n != i->j.seq, s,
+"bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)",
+				 n, i->j.seq - 1, start, end);
 
 		for (k = i->j.start;
 		     k < end(&i->j);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index fd37342a7eb5..74f2e902d876 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1260,7 +1260,8 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
 {
 	va_list args;
 
-	if (test_bit(CACHE_SET_STOPPING, &c->flags))
+	if (c->on_error != ON_ERROR_PANIC &&
+	    test_bit(CACHE_SET_STOPPING, &c->flags))
 		return false;
 
 	/* XXX: we can be called from atomic context
@@ -1275,6 +1276,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
 
 	printk(", disabling caching\n");
 
+	if (c->on_error == ON_ERROR_PANIC)
+		panic("panic forced after error\n");
+
 	bch_cache_set_unregister(c);
 	return true;
 }
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 924dcfdae111..4211d82179fc 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -21,6 +21,12 @@ static const char * const cache_replacement_policies[] = {
 	NULL
 };
 
+static const char * const error_actions[] = {
+	"unregister",
+	"panic",
+	NULL
+};
+
 write_attribute(attach);
 write_attribute(detach);
 write_attribute(unregister);
@@ -90,6 +96,7 @@ rw_attribute(discard);
 rw_attribute(running);
 rw_attribute(label);
 rw_attribute(readahead);
+rw_attribute(errors);
 rw_attribute(io_error_limit);
 rw_attribute(io_error_halflife);
 rw_attribute(verify);
@@ -492,6 +499,10 @@ lock_root:
 	sysfs_print(writeback_keys_failed,
 		    atomic_long_read(&c->writeback_keys_failed));
 
+	if (attr == &sysfs_errors)
+		return bch_snprint_string_list(buf, PAGE_SIZE, error_actions,
+					       c->on_error);
+
 	/* See count_io_errors for why 88 */
 	sysfs_print(io_error_halflife,	c->error_decay * 88);
 	sysfs_print(io_error_limit,	c->error_limit >> IO_ERROR_SHIFT);
@@ -569,6 +580,15 @@ STORE(__bch_cache_set)
 	sysfs_strtoul(congested_write_threshold_us,
 		      c->congested_write_threshold_us);
 
+	if (attr == &sysfs_errors) {
+		ssize_t v = bch_read_string_list(buf, error_actions);
+
+		if (v < 0)
+			return v;
+
+		c->on_error = v;
+	}
+
 	if (attr == &sysfs_io_error_limit)
 		c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT;
 
@@ -620,6 +640,7 @@ static struct attribute *bch_cache_set_files[] = {
 	&sysfs_average_key_size,
 	&sysfs_dirty_data,
 
+	&sysfs_errors,
 	&sysfs_io_error_limit,
 	&sysfs_io_error_halflife,
 	&sysfs_congested,
-- 
cgit v1.2.3
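The sysfs plumbing above uses a common kernel idiom: a NULL-terminated string
table doubles as the enum's name list, for both show (print the current
choice) and store (parse the written string back to an index). A reduced
sketch; read_string_list() is a hypothetical stand-in for bcache's
bch_read_string_list():

#include <linux/string.h>
#include <linux/types.h>
#include <linux/errno.h>

static const char * const error_actions[] = {
	"unregister",
	"panic",
	NULL
};

static ssize_t read_string_list(const char *buf, const char * const list[])
{
	size_t i;

	for (i = 0; list[i]; i++)
		if (sysfs_streq(buf, list[i]))
			return i;	/* table index becomes the enum value */

	return -EINVAL;
}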
printk(", disabling caching\n"); + if (c->on_error == ON_ERROR_PANIC) + panic("panic forced after error\n"); + bch_cache_set_unregister(c); return true; } diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 924dcfdae111..4211d82179fc 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -21,6 +21,12 @@ static const char * const cache_replacement_policies[] = { NULL }; +static const char * const error_actions[] = { + "unregister", + "panic", + NULL +}; + write_attribute(attach); write_attribute(detach); write_attribute(unregister); @@ -90,6 +96,7 @@ rw_attribute(discard); rw_attribute(running); rw_attribute(label); rw_attribute(readahead); +rw_attribute(errors); rw_attribute(io_error_limit); rw_attribute(io_error_halflife); rw_attribute(verify); @@ -492,6 +499,10 @@ lock_root: sysfs_print(writeback_keys_failed, atomic_long_read(&c->writeback_keys_failed)); + if (attr == &sysfs_errors) + return bch_snprint_string_list(buf, PAGE_SIZE, error_actions, + c->on_error); + /* See count_io_errors for why 88 */ sysfs_print(io_error_halflife, c->error_decay * 88); sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); @@ -569,6 +580,15 @@ STORE(__bch_cache_set) sysfs_strtoul(congested_write_threshold_us, c->congested_write_threshold_us); + if (attr == &sysfs_errors) { + ssize_t v = bch_read_string_list(buf, error_actions); + + if (v < 0) + return v; + + c->on_error = v; + } + if (attr == &sysfs_io_error_limit) c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; @@ -620,6 +640,7 @@ static struct attribute *bch_cache_set_files[] = { &sysfs_average_key_size, &sysfs_dirty_data, + &sysfs_errors, &sysfs_io_error_limit, &sysfs_io_error_halflife, &sysfs_congested, -- cgit v1.2.3 From 2d679fc75678551485df62274edaed452becd16d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Aug 2013 02:13:15 -0700 Subject: bcache: Stripe size isn't necessarily a power of two Originally I got this right... except that the divides didn't use do_div(), which broke 32 bit kernels. When I went to fix that, I forgot that the raid stripe size usually isn't a power of two... 
From 8304ad4dc818ffd701c2f3e90683b5b8013f44e2 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Wed, 24 Jul 2013 17:20:00 -0700
Subject: bcache: Remove unnecessary check in should_split()

Checking i->seq was redundant, because since ages ago we always initialize
the new bset when advancing b->written

Signed-off-by: Kent Overstreet
---
 drivers/md/bcache/btree.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 3333d3723633..8a1c7e6bbbe3 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -353,7 +353,7 @@ static inline void rw_unlock(bool w, struct btree *b)
 		    _w == insert_lock(op, _b))				\
 			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
 		rw_unlock(_w, _b);					\
-		bch_cannibalize_unlock(c, &(op)->cl);		\
+		bch_cannibalize_unlock(c, &(op)->cl);			\
 	} while (_r == -EINTR);						\
 									\
 	_r;								\
@@ -363,8 +363,7 @@ static inline bool should_split(struct btree *b)
 {
 	struct bset *i = write_block(b);
 	return b->written >= btree_blocks(b) ||
-	       (i->seq == b->sets[0].data->seq &&
-		b->written + __set_blocks(i, i->keys + 15, b->c)
+	       (b->written + __set_blocks(i, i->keys + 15, b->c)
 		> btree_blocks(b));
 }
-- 
cgit v1.2.3
From d6fd3b11cea82346837957feab25b0be48aa424c Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Wed, 24 Jul 2013 17:20:19 -0700
Subject: bcache: Explicitly track btree node's parent

This is prep work for the reworked btree insertion code.

The way we set b->parent is ugly and hacky... the problem is, when
btree_split() or garbage collection splits or rewrites a btree node, the
parent changes for all its (potentially already cached) children.

I may change this later and add some code to look through the btree node
cache and find all our cached child nodes and change the parent pointer then...

Signed-off-by: Kent Overstreet
---
 drivers/md/bcache/btree.c | 14 ++++++++++----
 drivers/md/bcache/btree.h | 16 ++++++++++------
 2 files changed, 20 insertions(+), 10 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index d1734d9d9c79..87299baa8a1b 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -884,6 +884,7 @@ out:
 	lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
 
 	b->level	= level;
+	b->parent	= (void *) ~0UL;
 
 	mca_reinit(b);
 
@@ -1898,7 +1899,7 @@ out:
 
 static int btree_split(struct btree *b, struct btree_op *op)
 {
-	bool split, root = b == b->c->root;
+	bool split;
 	struct btree *n1, *n2 = NULL, *n3 = NULL;
 	uint64_t start_time = local_clock();
 
@@ -1920,7 +1921,7 @@ static int btree_split(struct btree *b, struct btree_op *op)
 		if (IS_ERR(n2))
 			goto err_free1;
 
-		if (root) {
+		if (!b->parent) {
 			n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl);
 			if (IS_ERR(n3))
 				goto err_free2;
@@ -1928,7 +1929,8 @@ static int btree_split(struct btree *b, struct btree_op *op)
 
 		bch_btree_insert_keys(n1, op);
 
-		/* Has to be a linear search because we don't have an auxiliary
+		/*
+		 * Has to be a linear search because we don't have an auxiliary
 		 * search tree yet
 		 */
 
@@ -1960,6 +1962,8 @@ static int btree_split(struct btree *b, struct btree_op *op)
 	bch_btree_node_write(n1, &op->cl);
 
 	if (n3) {
+		/* Depth increases, make a new root */
+
 		bkey_copy_key(&n3->key, &MAX_KEY);
 		bch_btree_insert_keys(n3, op);
 		bch_btree_node_write(n3, &op->cl);
@@ -1967,7 +1971,9 @@ static int btree_split(struct btree *b, struct btree_op *op)
 		closure_sync(&op->cl);
 		bch_btree_set_root(n3);
 		rw_unlock(true, n3);
-	} else if (root) {
+	} else if (!b->parent) {
+		/* Root filled up but didn't need to be split */
+
 		op->keys.top = op->keys.bottom;
 		closure_sync(&op->cl);
 		bch_btree_set_root(n1);
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 8a1c7e6bbbe3..6d2fb7550706 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -125,6 +125,7 @@ struct btree {
 	unsigned long		seq;
 	struct rw_semaphore	lock;
 	struct cache_set	*c;
+	struct btree		*parent;
 
 	unsigned long		flags;
 	uint16_t		written;	/* would be nice to kill */
@@ -327,12 +328,13 @@ static inline void rw_unlock(bool w, struct btree *b)
 ({									\
 	int _r, l = (b)->level - 1;					\
 	bool _w = l <= (op)->lock;					\
-	struct btree *_b = bch_btree_node_get((b)->c, key, l, op);	\
-	if (!IS_ERR(_b)) {						\
-		_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);		\
-		rw_unlock(_w, _b);					\
+	struct btree *_child = bch_btree_node_get((b)->c, key, l, op);	\
+	if (!IS_ERR(_child)) {						\
+		_child->parent = (b);					\
+		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
+		rw_unlock(_w, _child);					\
 	} else								\
-		_r = PTR_ERR(_b);					\
+		_r = PTR_ERR(_child);					\
 	_r;								\
 })
 
@@ -350,8 +352,10 @@ static inline void rw_unlock(bool w, struct btree *b)
 		bool _w = insert_lock(op, _b);				\
 		rw_lock(_w, _b, _b->level);				\
 		if (_b == (c)->root &&					\
-		    _w == insert_lock(op, _b))				\
+		    _w == insert_lock(op, _b)) {			\
+			_b->parent = NULL;				\
 			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
+		}							\
 		rw_unlock(_w, _b);					\
 		bch_cannibalize_unlock(c, &(op)->cl);			\
 	} while (_r == -EINTR);						\
-- 
cgit v1.2.3
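The descent pattern the macros above implement, in plain form: the parent
pointer is only trustworthy because it is re-stamped on every descent from
the root, right when the child is obtained under lock. A sketch with a
hypothetical node type:

struct node {
	/* valid only for the duration of the current locked descent;
	 * a concurrent split may change the real parent, so never cache
	 * this pointer across unlocks */
	struct node *parent;
};

static struct node *descend(struct node *parent, struct node *child)
{
	child->parent = parent;	/* stamp on the way down */
	return child;
}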
From 26c949f8062cb9221a28b2228104f1cc5b265097 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Tue, 10 Sep 2013 18:41:15 -0700
Subject: bcache: Add btree_insert_node()

The flow of control in the old btree insertion code was rather backwards;
we'd recurse down the btree (in btree_insert_recurse()), and then if we
needed to split, the keys to be inserted into the parent node would be
effectively returned up to btree_insert_recurse(), which would notice there
was more work to do and finish the insertion.

The main problem with this was that the full logic for btree insertion could
only be used by calling btree_insert_recurse; if you'd gotten to a btree leaf
some other way and had a key to insert, if it turned out that node needed to
be split you were SOL.

This inverts the flow of control so btree_insert_node() does _full_ btree
insertion, including splitting - and takes a (leaf) btree node to insert into
as a parameter.

This means we can now _correctly_ handle cache misses - for cache misses, we
need to insert a fake "check" key into the btree when we discover we have a
cache miss - while we still have the btree locked. Previously, if the btree
node was full inserting a cache miss would just fail.

Signed-off-by: Kent Overstreet
---
 drivers/md/bcache/bset.c  |  12 ++++
 drivers/md/bcache/bset.h  |   1 +
 drivers/md/bcache/btree.c | 158 +++++++++++++++++++++++++++-------------------
 3 files changed, 105 insertions(+), 66 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 22d1ae72c282..830eede86d56 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -73,6 +73,18 @@ struct bkey *bch_keylist_pop(struct keylist *l)
 	return l->top = k;
 }
 
+void bch_keylist_pop_front(struct keylist *l)
+{
+	struct bkey *next = bkey_next(l->bottom);
+	size_t bytes = ((void *) l->top) - ((void *) next);
+
+	memmove(l->bottom,
+		next,
+		bytes);
+
+	l->top = ((void *) l->bottom) + bytes;
+}
+
 /* Pointer validation */
 
 bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index ae115a253d73..a3627d0cd88b 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -267,6 +267,7 @@ static inline void bch_keylist_free(struct keylist *l)
 
 void bch_keylist_copy(struct keylist *, struct keylist *);
 struct bkey *bch_keylist_pop(struct keylist *);
+void bch_keylist_pop_front(struct keylist *);
 int bch_keylist_realloc(struct keylist *, int, struct cache_set *);
 
 void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *,
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 87299baa8a1b..c2722e075f18 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1849,15 +1849,43 @@ merged:
 	return true;
 }
 
-static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op)
+static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
+				  struct keylist *insert_keys)
 {
 	bool ret = false;
-	struct bkey *k;
 	unsigned oldsize = bch_count_data(b);
 
-	while ((k = bch_keylist_pop(&op->keys))) {
-		bkey_put(b->c, k, b->level);
-		ret |= btree_insert_key(b, op, k);
+	BUG_ON(!insert_lock(op, b));
+
+	while (!bch_keylist_empty(insert_keys)) {
+		struct bkey *k = insert_keys->bottom;
+
+		if (b->level ||
+		    bkey_cmp(k, &b->key) <= 0) {
+			bkey_put(b->c, k, b->level);
+
+			ret |= btree_insert_key(b, op, k);
+			bch_keylist_pop_front(insert_keys);
+		} else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
+#if 0
+			if (op->type == BTREE_REPLACE) {
+				bkey_put(b->c, k, b->level);
+				bch_keylist_pop_front(insert_keys);
+				op->insert_collision = true;
+				break;
+			}
+#endif
+			BKEY_PADDED(key) temp;
+			bkey_copy(&temp.key, insert_keys->bottom);
+
+			bch_cut_back(&b->key, &temp.key);
+			bch_cut_front(&b->key, insert_keys->bottom);
+
+			ret |= btree_insert_key(b, op, &temp.key);
+			break;
+		} else {
+			break;
+		}
 	}
 
 	BUG_ON(bch_count_data(b) < oldsize);
@@ -1897,7 +1925,9 @@ out:
 	return ret;
 }
 
-static int btree_split(struct btree *b, struct btree_op *op)
+static int btree_split(struct btree *b, struct btree_op *op,
+		       struct keylist *insert_keys,
+		       struct keylist *parent_keys)
 {
 	bool split;
 	struct btree *n1, *n2 = NULL, *n3 = NULL;
@@ -1927,7 +1957,7 @@ static int btree_split(struct btree *b, struct btree_op *op)
 			goto err_free2;
 	}
 
-	bch_btree_insert_keys(n1, op);
+	bch_btree_insert_keys(n1, op, insert_keys);
 
 	/*
 	 * Has to be a linear search because we don't have an auxiliary
 	 * search tree yet
 	 */
@@ -1949,23 +1979,23 @@ static int btree_split(struct btree *b, struct btree_op *op)
 
 		bkey_copy_key(&n2->key, &b->key);
 
-		bch_keylist_add(&op->keys, &n2->key);
+		bch_keylist_add(parent_keys, &n2->key);
 		bch_btree_node_write(n2, &op->cl);
 		rw_unlock(true, n2);
 	} else {
 		trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
 
-		bch_btree_insert_keys(n1, op);
+		bch_btree_insert_keys(n1, op, insert_keys);
 	}
 
-	bch_keylist_add(&op->keys, &n1->key);
+	bch_keylist_add(parent_keys, &n1->key);
 	bch_btree_node_write(n1, &op->cl);
 
 	if (n3) {
 		/* Depth increases, make a new root */
 
 		bkey_copy_key(&n3->key, &MAX_KEY);
-		bch_btree_insert_keys(n3, op);
+		bch_btree_insert_keys(n3, op, parent_keys);
 		bch_btree_node_write(n3, &op->cl);
 
 		closure_sync(&op->cl);
@@ -1974,22 +2004,22 @@ static int btree_split(struct btree *b, struct btree_op *op)
 	} else if (!b->parent) {
 		/* Root filled up but didn't need to be split */
 
-		op->keys.top = op->keys.bottom;
+		parent_keys->top = parent_keys->bottom;
 		closure_sync(&op->cl);
 		bch_btree_set_root(n1);
 	} else {
 		unsigned i;
 
-		bkey_copy(op->keys.top, &b->key);
-		bkey_copy_key(op->keys.top, &ZERO_KEY);
+		bkey_copy(parent_keys->top, &b->key);
+		bkey_copy_key(parent_keys->top, &ZERO_KEY);
 
 		for (i = 0; i < KEY_PTRS(&b->key); i++) {
 			uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1;
 
-			SET_PTR_GEN(op->keys.top, i, g);
+			SET_PTR_GEN(parent_keys->top, i, g);
 		}
 
-		bch_keylist_push(&op->keys);
+		bch_keylist_push(parent_keys);
 		closure_sync(&op->cl);
 		atomic_inc(&b->c->prio_blocked);
 	}
@@ -2018,69 +2048,65 @@ err:
 	return -ENOMEM;
 }
 
-static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
-				    struct keylist *stack_keys)
+static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
+				 struct keylist *insert_keys)
 {
-	if (b->level) {
-		int ret;
-		struct bkey *insert = op->keys.bottom;
-		struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert));
+	int ret = 0;
+	struct keylist split_keys;
 
-		if (!k) {
-			btree_bug(b, "no key to recurse on at level %i/%i",
-				  b->level, b->c->root->level);
+	bch_keylist_init(&split_keys);
 
-			op->keys.top = op->keys.bottom;
-			return -EIO;
-		}
+	BUG_ON(b->level);
 
-		if (bkey_cmp(insert, k) > 0) {
-			unsigned i;
+	do {
+		if (should_split(b)) {
+			if (current->bio_list) {
+				op->lock = b->c->root->level + 1;
+				ret = -EAGAIN;
+			} else if (op->lock <= b->c->root->level) {
+				op->lock = b->c->root->level + 1;
+				ret = -EINTR;
+			} else {
+				struct btree *parent = b->parent;
 
-			if (op->type == BTREE_REPLACE) {
-				__bkey_put(b->c, insert);
-				op->keys.top = op->keys.bottom;
-				op->insert_collision = true;
-				return 0;
+				ret = btree_split(b, op, insert_keys,
+						  &split_keys);
+				insert_keys = &split_keys;
+				b = parent;
 			}
+		} else {
+			BUG_ON(write_block(b) != b->sets[b->nsets].data);
 
-			for (i = 0; i < KEY_PTRS(insert); i++)
-				atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin);
-
-			bkey_copy(stack_keys->top, insert);
-
-			bch_cut_back(k, insert);
-			bch_cut_front(k, stack_keys->top);
-
-			bch_keylist_push(stack_keys);
+			if (bch_btree_insert_keys(b, op, insert_keys)) {
+				if (!b->level)
+					bch_btree_leaf_dirty(b, op);
+				else
+					bch_btree_node_write(b, &op->cl);
+			}
 		}
+	} while (!bch_keylist_empty(&split_keys));
 
-		ret = btree(insert_recurse, k, b, op, stack_keys);
-		if (ret)
-			return ret;
-	}
+	return ret;
+}
 
-	if (!bch_keylist_empty(&op->keys)) {
-		if (should_split(b)) {
-			if (op->lock <= b->c->root->level) {
-				BUG_ON(b->level);
-				op->lock = b->c->root->level + 1;
-				return -EINTR;
-			}
-			return btree_split(b, op);
-		}
+static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op)
+{
+	if (b->level) {
+		struct bkey *insert = op->keys.bottom;
+		struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert));
 
-		BUG_ON(write_block(b) != b->sets[b->nsets].data);
+		if (!k) {
+			btree_bug(b, "no key to recurse on at level %i/%i",
+				  b->level, b->c->root->level);
 
-		if (bch_btree_insert_keys(b, op)) {
-			if (!b->level)
-				bch_btree_leaf_dirty(b, op);
-			else
-				bch_btree_node_write(b, &op->cl);
+			op->keys.top = op->keys.bottom;
+			return -EIO;
 		}
-	}
 
-	return 0;
+		return btree(insert_recurse, k, b, op);
+	} else {
+		return bch_btree_insert_node(b, op, &op->keys);
+	}
 }
 
@@ -2106,7 +2132,7 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c)
 		op->lock = 0;
 	}
 
-	ret = btree_root(insert_recurse, c, op, &stack_keys);
+	ret = btree_root(insert_recurse, c, op);
 
 	if (ret == -EAGAIN) {
 		ret = 0;
-- 
cgit v1.2.3
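The control flow this commit introduces is worth seeing stripped of the
locking details: insert into a node; if it must split, the split emits keys
destined for the parent, and the loop ascends and inserts those, repeating
until no level overflows. A compile-only sketch with hypothetical stand-in
types and helpers (split(), insert_drain()), not the bcache functions:

#include <stdbool.h>

struct keylist { int nkeys; };
struct node { struct node *parent; bool full; };

/* hypothetical primitives: split() emits keys for the parent into 'out';
 * insert_drain() inserts and empties 'keys' */
extern void split(struct node *b, struct keylist *in, struct keylist *out);
extern void insert_drain(struct node *b, struct keylist *keys);

static void insert_node(struct node *b, struct keylist *keys)
{
	struct keylist split_keys = { 0 };

	do {
		if (b->full) {
			split(b, keys, &split_keys);	/* new child keys */
			keys = &split_keys;		/* ascend with them */
			b = b->parent;
		} else {
			insert_drain(b, keys);		/* fits: done here */
		}
	} while (split_keys.nkeys);	/* stop once nothing propagates up */
}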
From 403b6cdeb1a38d896ffcb1f99ddcfd4e343b5d69 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Wed, 24 Jul 2013 17:22:44 -0700
Subject: bcache: Insert multiple keys at a time

We'll often end up with a list of adjacent keys to insert - because
bch_data_insert() may have to fragment the data it writes.

Originally, to simplify things and avoid having to deal with corner cases
bch_btree_insert() would pass keys from this list one at a time to
btree_insert_recurse() - mainly because the list of keys might span leaf
nodes, so it was easier this way.

With the btree_insert_node() refactoring, it's now a lot easier to just pass
down the whole list and have btree_insert_recurse() iterate over leaf nodes
until it's done.

Signed-off-by: Kent Overstreet
---
 drivers/md/bcache/btree.c | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index c2722e075f18..60d06465d772 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1858,10 +1858,14 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
 	BUG_ON(!insert_lock(op, b));
 
 	while (!bch_keylist_empty(insert_keys)) {
+		struct bset *i = write_block(b);
 		struct bkey *k = insert_keys->bottom;
 
-		if (b->level ||
-		    bkey_cmp(k, &b->key) <= 0) {
+		if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c)
+		    > btree_blocks(b))
+			break;
+
+		if (bkey_cmp(k, &b->key) <= 0) {
 			bkey_put(b->c, k, b->level);
 
 			ret |= btree_insert_key(b, op, k);
@@ -1888,6 +1892,8 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
 		}
 	}
 
+	BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
+
 	BUG_ON(bch_count_data(b) < oldsize);
 	return ret;
 }
@@ -2073,6 +2079,8 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
 						  &split_keys);
 				insert_keys = &split_keys;
 				b = parent;
+				if (!ret)
+					ret = -EINTR;
 			}
 		} else {
 			BUG_ON(write_block(b) != b->sets[b->nsets].data);
@@ -2091,6 +2099,9 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
 
 static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op)
 {
+	if (bch_keylist_empty(&op->keys))
+		return 0;
+
 	if (b->level) {
 		struct bkey *insert = op->keys.bottom;
 		struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert));
@@ -2112,7 +2123,6 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op)
 int bch_btree_insert(struct btree_op *op, struct cache_set *c)
 {
 	int ret = 0;
-	struct keylist stack_keys;
 
 	/*
 	 * Don't want to block with the btree locked unless we have to,
@@ -2121,17 +2131,9 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c)
 	clear_closure_blocking(&op->cl);
 
 	BUG_ON(bch_keylist_empty(&op->keys));
-	bch_keylist_copy(&stack_keys, &op->keys);
-	bch_keylist_init(&op->keys);
-
-	while (!bch_keylist_empty(&stack_keys) ||
-	       !bch_keylist_empty(&op->keys)) {
-		if (bch_keylist_empty(&op->keys)) {
-			bch_keylist_add(&op->keys,
-					bch_keylist_pop(&stack_keys));
-			op->lock = 0;
-		}
 
+	while (!bch_keylist_empty(&op->keys)) {
+		op->lock = 0;
 		ret = btree_root(insert_recurse, c, op);
 
 		if (ret == -EAGAIN) {
@@ -2143,14 +2145,11 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c)
 			pr_err("error %i trying to insert key for %s",
 			       ret, op_type(op));
 
-			while ((k = bch_keylist_pop(&stack_keys) ?:
-				    bch_keylist_pop(&op->keys)))
+			while ((k = bch_keylist_pop(&op->keys)))
 				bkey_put(c, k, 0);
 		}
 	}
 
-	bch_keylist_free(&stack_keys);
-
 	if (op->journal)
 		atomic_dec_bug(op->journal);
 	op->journal = NULL;
-- 
cgit v1.2.3
From e7c590eb63509c5d5f48a390d23aa25f4417ac96 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Tue, 10 Sep 2013 18:39:16 -0700
Subject: bcache: Convert btree_insert_check_key() to btree_insert_node()

This was the main point of all this refactoring - now,
btree_insert_check_key() won't fail just because the leaf node happened to
be full.

Signed-off-by: Kent Overstreet
---
 drivers/md/bcache/bcache.h  |  8 -----
 drivers/md/bcache/btree.c   | 82 ++++++++++++++++++++++++++-------------------
 drivers/md/bcache/btree.h   |  6 ++--
 drivers/md/bcache/request.c | 55 +++++++++++++++---------------
 4 files changed, 79 insertions(+), 72 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 6e836f22f276..10ce0c825fce 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -1091,14 +1091,6 @@ do {									\
 	for (b = (ca)->buckets + (ca)->sb.first_bucket;			\
 	     b < (ca)->buckets + (ca)->sb.nbuckets; b++)
 
-static inline void __bkey_put(struct cache_set *c, struct bkey *k)
-{
-	unsigned i;
-
-	for (i = 0; i < KEY_PTRS(k); i++)
-		atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
-}
-
 static inline void cached_dev_put(struct cached_dev *dc)
 {
 	if (atomic_dec_and_test(&dc->count))
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 60d06465d772..08a85327431c 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -118,6 +118,15 @@ void bch_btree_op_init_stack(struct btree_op *op)
 
 /* Btree key manipulation */
 
+void __bkey_put(struct cache_set *c, struct bkey *k)
+{
+	unsigned i;
+
+	for (i = 0; i < KEY_PTRS(k); i++)
+		if (ptr_available(c, k, i))
+			atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
+}
+
 static void bkey_put(struct cache_set *c, struct bkey *k, int level)
 {
 	if ((level && KEY_OFFSET(k)) || !level)
@@ -1855,8 +1864,6 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
 	bool ret = false;
 	unsigned oldsize = bch_count_data(b);
 
-	BUG_ON(!insert_lock(op, b));
-
 	while (!bch_keylist_empty(insert_keys)) {
 		struct bset *i = write_block(b);
 		struct bkey *k = insert_keys->bottom;
@@ -1898,39 +1905,6 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
 	return ret;
 }
 
-bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
-				struct bio *bio)
-{
-	bool ret = false;
-	uint64_t btree_ptr = b->key.ptr[0];
-	unsigned long seq = b->seq;
-	BKEY_PADDED(k) tmp;
-
-	rw_unlock(false, b);
-	rw_lock(true, b, b->level);
-
-	if (b->key.ptr[0] != btree_ptr ||
-	    b->seq != seq + 1 ||
-	    should_split(b))
-		goto out;
-
-	op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio));
-
-	SET_KEY_PTRS(&op->replace, 1);
-	get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t));
-
-	SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV);
-
-	bkey_copy(&tmp.k, &op->replace);
-
-	BUG_ON(op->type != BTREE_INSERT);
-	BUG_ON(!btree_insert_key(b, op, &tmp.k));
-	ret = true;
-out:
-	downgrade_write(&b->lock);
-	return ret;
-}
-
 static int btree_split(struct btree *b, struct btree_op *op,
 		       struct keylist *insert_keys,
 		       struct keylist *parent_keys)
@@ -2097,6 +2071,44 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
 	return ret;
 }
 
+int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
+			       struct bkey *check_key)
+{
+	int ret = -EINTR;
+	uint64_t btree_ptr = b->key.ptr[0];
+	unsigned long seq = b->seq;
+	struct keylist insert;
+	bool upgrade = op->lock == -1;
+
+	bch_keylist_init(&insert);
+
+	if (upgrade) {
+		rw_unlock(false, b);
+		rw_lock(true, b, b->level);
+
+		if (b->key.ptr[0] != btree_ptr ||
+		    b->seq != seq + 1)
+			goto out;
+	}
+
+	SET_KEY_PTRS(check_key, 1);
+	get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));
+
+	SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);
+
+	bch_keylist_add(&insert, check_key);
+
+	BUG_ON(op->type != BTREE_INSERT);
+
+	ret = bch_btree_insert_node(b, op, &insert);
+
+	BUG_ON(!ret && !bch_keylist_empty(&insert));
+out:
+	if (upgrade)
+		downgrade_write(&b->lock);
+	return ret;
+}
+
 static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op)
 {
 	if (bch_keylist_empty(&op->keys))
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 6d2fb7550706..73bd62105e39 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -216,6 +216,8 @@ static inline struct bkey *bch_btree_iter_init(struct btree *b,
 	return __bch_btree_iter_init(b, iter, search, b->sets);
 }
 
+void __bkey_put(struct cache_set *c, struct bkey *k);
+
 /* Looping macros */
 
 #define for_each_cached_btree(b, c, iter)				\
@@ -380,8 +382,8 @@ struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *);
 struct btree *bch_btree_node_get(struct cache_set *, struct bkey *,
 				 int, struct btree_op *);
 
-bool bch_btree_insert_check_key(struct btree *, struct btree_op *,
-				struct bio *);
+int bch_btree_insert_check_key(struct btree *, struct btree_op *,
+			       struct bkey *);
 int bch_btree_insert(struct btree_op *, struct cache_set *);
 
 int bch_btree_search_recurse(struct btree *, struct btree_op *);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 2a7f0dd6abab..9ed334ca484d 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -869,35 +869,39 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 				 struct bio *bio, unsigned sectors)
 {
 	int ret = 0;
-	unsigned reada;
+	unsigned reada = 0;
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 	struct bio *miss;
 
-	miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
-	if (miss == bio)
-		s->op.lookup_done = true;
+	if (s->cache_miss || s->op.skip) {
+		miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+		if (miss == bio)
+			s->op.lookup_done = true;
+		goto out_submit;
+	}
 
-	miss->bi_end_io		= request_endio;
-	miss->bi_private	= &s->cl;
+	if (!(bio->bi_rw & REQ_RAHEAD) &&
+	    !(bio->bi_rw & REQ_META) &&
+	    s->op.c->gc_stats.in_use < CUTOFF_CACHE_READA)
+		reada = min_t(sector_t, dc->readahead >> 9,
+			      bdev_sectors(bio->bi_bdev) - bio_end_sector(bio));
 
-	if (s->cache_miss || s->op.skip)
-		goto out_submit;
+	s->cache_bio_sectors = min(sectors, bio_sectors(bio) + reada);
 
-	if (miss != bio ||
-	    (bio->bi_rw & REQ_RAHEAD) ||
-	    (bio->bi_rw & REQ_META) ||
-	    s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA)
-		reada = 0;
-	else {
-		reada = min(dc->readahead >> 9,
-			    sectors - bio_sectors(miss));
-
-		if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev))
-			reada = bdev_sectors(miss->bi_bdev) -
-				bio_end_sector(miss);
-	}
+	s->op.replace = KEY(s->op.inode, bio->bi_sector +
+			    s->cache_bio_sectors, s->cache_bio_sectors);
+
+	ret = bch_btree_insert_check_key(b, &s->op, &s->op.replace);
+	if (ret)
+		return ret;
+
+	miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
+	if (miss == bio)
+		s->op.lookup_done = true;
+	else
+		/* btree_search_recurse()'s btree iterator is no good anymore */
+		ret = -EINTR;
 
-	s->cache_bio_sectors = bio_sectors(miss) + reada;
 	s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT,
 			DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS),
 			dc->disk.bio_split);
@@ -912,11 +916,6 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 	s->op.cache_bio->bi_end_io	= request_endio;
 	s->op.cache_bio->bi_private	= &s->cl;
 
-	/* btree_search_recurse()'s btree iterator is no good anymore */
-	ret = -EINTR;
-	if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio))
-		goto out_put;
-
 	bch_bio_map(s->op.cache_bio, NULL);
 	if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO))
 		goto out_put;
@@ -931,6 +930,8 @@ out_put:
 	bio_put(s->op.cache_bio);
 	s->op.cache_bio = NULL;
 out_submit:
+	miss->bi_end_io		= request_endio;
+	miss->bi_private	= &s->cl;
 	closure_bio_submit(miss, &s->cl, s->d);
 	return ret;
 }
-- 
cgit v1.2.3
From 4f3d40147b8d0ce7055e241e1d263e0aa2b2b46d Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Tue, 10 Sep 2013 18:46:36 -0700
Subject: bcache: Add explicit keylist arg to btree_insert()

Some refactoring - better to explicitly pass stuff around instead of having
it all in the "big bag of state", struct btree_op. Going to prune struct
btree_op quite a bit over time.

Signed-off-by: Kent Overstreet
---
 drivers/md/bcache/btree.c     | 26 ++++++++++++++------------
 drivers/md/bcache/btree.h     |  2 +-
 drivers/md/bcache/journal.c   |  2 +-
 drivers/md/bcache/request.c   |  2 +-
 drivers/md/bcache/writeback.c |  2 +-
 5 files changed, 18 insertions(+), 16 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 08a85327431c..fc3cae5c94b2 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -2109,30 +2109,32 @@ out:
 	return ret;
 }
 
-static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op)
+static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
+				    struct keylist *keys)
 {
-	if (bch_keylist_empty(&op->keys))
+	if (bch_keylist_empty(keys))
 		return 0;
 
 	if (b->level) {
-		struct bkey *insert = op->keys.bottom;
-		struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert));
+		struct bkey *k;
 
+		k = bch_next_recurse_key(b, &START_KEY(keys->bottom));
 		if (!k) {
 			btree_bug(b, "no key to recurse on at level %i/%i",
 				  b->level, b->c->root->level);
 
-			op->keys.top = op->keys.bottom;
+			keys->top = keys->bottom;
 			return -EIO;
 		}
 
-		return btree(insert_recurse, k, b, op);
+		return btree(insert_recurse, k, b, op, keys);
 	} else {
-		return bch_btree_insert_node(b, op, &op->keys);
+		return bch_btree_insert_node(b, op, keys);
 	}
 }
 
-int bch_btree_insert(struct btree_op *op, struct cache_set *c)
+int bch_btree_insert(struct btree_op *op, struct cache_set *c,
+		     struct keylist *keys)
 {
 	int ret = 0;
 
@@ -2142,11 +2144,11 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c)
 	 */
 	clear_closure_blocking(&op->cl);
 
-	BUG_ON(bch_keylist_empty(&op->keys));
+	BUG_ON(bch_keylist_empty(keys));
 
-	while (!bch_keylist_empty(&op->keys)) {
+	while (!bch_keylist_empty(keys)) {
 		op->lock = 0;
-		ret = btree_root(insert_recurse, c, op);
+		ret = btree_root(insert_recurse, c, op, keys);
 
 		if (ret == -EAGAIN) {
 			ret = 0;
@@ -2157,7 +2159,7 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c)
 			pr_err("error %i trying to insert key for %s",
 			       ret, op_type(op));
 
-			while ((k = bch_keylist_pop(&op->keys)))
+			while ((k = bch_keylist_pop(keys)))
 				bkey_put(c, k, 0);
 		}
 	}
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index 73bd62105e39..967aacd20625 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -384,7 +384,7 @@ struct btree *bch_btree_node_get(struct cache_set *, struct bkey *,
 
 int bch_btree_insert_check_key(struct btree *, struct btree_op *,
 			       struct bkey *);
-int bch_btree_insert(struct btree_op *, struct cache_set *);
+int bch_btree_insert(struct btree_op *, struct cache_set *, struct keylist *);
 
 int bch_btree_search_recurse(struct btree *, struct btree_op *);
 
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 9e8775872dba..5abe5d5fc183 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -320,7 +320,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
 		op->journal = i->pin;
 		atomic_inc(op->journal);
 
-		ret = bch_btree_insert(op, s);
+		ret = bch_btree_insert(op, s, &op->keys);
 		if (ret)
 			goto err;
 
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 9ed334ca484d..7fd84ce9e835 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -607,7 +607,7 @@ void bch_btree_insert_async(struct closure *cl)
 	struct btree_op *op = container_of(cl, struct btree_op, cl);
 	struct search *s = container_of(op, struct search, op);
 
-	if (bch_btree_insert(op, op->c)) {
+	if (bch_btree_insert(op, op->c, &op->keys)) {
 		s->error		= -ENOMEM;
 		op->insert_data_done	= true;
 	}
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index b842fbfbf1db..8ffc8ec7231d 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -311,7 +311,7 @@ static void write_dirty_finish(struct closure *cl)
 		for (i = 0; i < KEY_PTRS(&w->key); i++)
 			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
 
-		bch_btree_insert(&op, dc->disk.c);
+		bch_btree_insert(&op, dc->disk.c, &op.keys);
 		closure_sync(&op.cl);
 
 		if (op.insert_collision)
-- 
cgit v1.2.3
From c2f95ae2ebbe1ab61b1d4437f5923fdf720d4d4d Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Wed, 24 Jul 2013 17:24:25 -0700
Subject: bcache: Clean up keylist code

More random refactoring.

Signed-off-by: Kent Overstreet
---
 drivers/md/bcache/bset.c    | 44 +++++++++++++++-----------------------------
 drivers/md/bcache/bset.h    | 35 ++++++++++++++++++++++++++---------
 drivers/md/bcache/btree.c   | 12 ++++++------
 drivers/md/bcache/journal.c | 14 ++++++++------
 drivers/md/bcache/request.c |  4 ++--
 5 files changed, 57 insertions(+), 52 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 830eede86d56..d0512e451dda 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -14,22 +14,12 @@
 
 /* Keylists */
 
-void bch_keylist_copy(struct keylist *dest, struct keylist *src)
-{
-	*dest = *src;
-
-	if (src->list == src->d) {
-		size_t n = (uint64_t *) src->top - src->d;
-		dest->top = (struct bkey *) &dest->d[n];
-		dest->list = dest->d;
-	}
-}
-
 int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c)
 {
-	unsigned oldsize = (uint64_t *) l->top - l->list;
-	unsigned newsize = oldsize + 2 + nptrs;
-	uint64_t *new;
+	size_t oldsize = bch_keylist_nkeys(l);
+	size_t newsize = oldsize + 2 + nptrs;
+	uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p;
+	uint64_t *new_keys;
 
 	/* The journalling code doesn't handle the case where the keys to insert
 	 * is bigger than an empty write: If we just return -ENOMEM here,
@@ -45,24 +35,23 @@ int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c)
 	    roundup_pow_of_two(oldsize) == newsize)
 		return 0;
 
-	new = krealloc(l->list == l->d ? NULL : l->list,
-		       sizeof(uint64_t) * newsize, GFP_NOIO);
+	new_keys = krealloc(old_keys, sizeof(uint64_t) * newsize, GFP_NOIO);
 
-	if (!new)
+	if (!new_keys)
 		return -ENOMEM;
 
-	if (l->list == l->d)
-		memcpy(new, l->list, sizeof(uint64_t) * KEYLIST_INLINE);
+	if (!old_keys)
+		memcpy(new_keys, l->inline_keys, sizeof(uint64_t) * oldsize);
 
-	l->list = new;
-	l->top = (struct bkey *) (&l->list[oldsize]);
+	l->keys_p = new_keys;
+	l->top_p = new_keys + oldsize;
 
 	return 0;
 }
 
 struct bkey *bch_keylist_pop(struct keylist *l)
 {
-	struct bkey *k = l->bottom;
+	struct bkey *k = l->keys;
 
 	if (k == l->top)
 		return NULL;
@@ -75,14 +64,11 @@ struct bkey *bch_keylist_pop(struct keylist *l)
 
 void bch_keylist_pop_front(struct keylist *l)
 {
-	struct bkey *next = bkey_next(l->bottom);
-	size_t bytes = ((void *) l->top) - ((void *) next);
-
-	memmove(l->bottom,
-		next,
-		bytes);
+	l->top_p -= bkey_u64s(l->keys);
 
-	l->top = ((void *) l->bottom) + bytes;
+	memmove(l->keys,
+		bkey_next(l->keys),
+		bch_keylist_bytes(l));
 }
 
 /* Pointer validation */
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index a3627d0cd88b..8a9305685b7e 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -227,20 +227,23 @@ static inline struct bkey *bkey_next(const struct bkey *k)
 /* Keylists */
 
 struct keylist {
-	struct bkey		*top;
 	union {
-		uint64_t		*list;
-		struct bkey		*bottom;
+		struct bkey	*keys;
+		uint64_t	*keys_p;
+	};
+	union {
+		struct bkey	*top;
+		uint64_t	*top_p;
 	};
 
 	/* Enough room for btree_split's keys without realloc */
 #define KEYLIST_INLINE		16
-	uint64_t		d[KEYLIST_INLINE];
+	uint64_t		inline_keys[KEYLIST_INLINE];
 };
 
 static inline void bch_keylist_init(struct keylist *l)
 {
-	l->top = (void *) (l->list = l->d);
+	l->top_p = l->keys_p = l->inline_keys;
 }
 
 static inline void bch_keylist_push(struct keylist *l)
@@ -256,16 +259,30 @@ static inline void bch_keylist_add(struct keylist *l, struct bkey *k)
 
 static inline bool bch_keylist_empty(struct keylist *l)
 {
-	return l->top == (void *) l->list;
+	return l->top == l->keys;
+}
+
+static inline void bch_keylist_reset(struct keylist *l)
+{
+	l->top = l->keys;
 }
 
 static inline void bch_keylist_free(struct keylist *l)
 {
-	if (l->list != l->d)
-		kfree(l->list);
+	if (l->keys_p != l->inline_keys)
+		kfree(l->keys_p);
+}
+
+static inline size_t bch_keylist_nkeys(struct keylist *l)
+{
+	return l->top_p - l->keys_p;
+}
+
+static inline size_t bch_keylist_bytes(struct keylist *l)
+{
+	return bch_keylist_nkeys(l) * sizeof(uint64_t);
 }
 
-void bch_keylist_copy(struct keylist *, struct keylist *);
 struct bkey *bch_keylist_pop(struct keylist *);
 void bch_keylist_pop_front(struct keylist *);
 int bch_keylist_realloc(struct keylist *, int, struct cache_set *);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index fc3cae5c94b2..f960607f1f25 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1866,7 +1866,7 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
 
 	while (!bch_keylist_empty(insert_keys)) {
 		struct bset *i = write_block(b);
-		struct bkey *k = insert_keys->bottom;
+		struct bkey *k = insert_keys->keys;
 
 		if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c)
 		    > btree_blocks(b))
@@ -1887,10 +1887,10 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
 			}
 #endif
 			BKEY_PADDED(key) temp;
-			bkey_copy(&temp.key, insert_keys->bottom);
+			bkey_copy(&temp.key, insert_keys->keys);
 
 			bch_cut_back(&b->key, &temp.key);
-			bch_cut_front(&b->key, insert_keys->bottom);
+			bch_cut_front(&b->key, insert_keys->keys);
 
 			ret |= btree_insert_key(b, op, &temp.key);
 			break;
@@ -1984,7 +1984,7 @@ static int btree_split(struct btree *b, struct btree_op *op,
 	} else if (!b->parent) {
 		/* Root filled up but didn't need to be split */
 
-		parent_keys->top = parent_keys->bottom;
+		bch_keylist_reset(parent_keys);
 		closure_sync(&op->cl);
 		bch_btree_set_root(n1);
 	} else {
@@ -2118,12 +2118,12 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op,
 	if (b->level) {
 		struct bkey *k;
 
-		k = bch_next_recurse_key(b, &START_KEY(keys->bottom));
+		k = bch_next_recurse_key(b, &START_KEY(keys->keys));
 		if (!k) {
 			btree_bug(b, "no key to recurse on at level %i/%i",
 				  b->level, b->c->root->level);
 
-			keys->top = keys->bottom;
+			bch_keylist_reset(keys);
 			return -EIO;
 		}
 
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 5abe5d5fc183..1bdefdb1fa71 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -713,7 +713,7 @@ void bch_journal(struct closure *cl)
 	struct btree_op *op = container_of(cl, struct btree_op, cl);
 	struct cache_set *c = op->c;
 	struct journal_write *w;
-	size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list;
+	size_t sectors, nkeys;
 
 	if (op->type != BTREE_INSERT ||
 	    !CACHE_SYNC(&c->sb))
@@ -741,10 +741,12 @@ void bch_journal(struct closure *cl)
 	}
 
 	w = c->journal.cur;
-	b = __set_blocks(w->data, w->data->keys + n, c);
+	nkeys = w->data->keys + bch_keylist_nkeys(&op->keys);
+	sectors = __set_blocks(w->data, nkeys, c) * c->sb.block_size;
 
-	if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
-	    b > c->journal.blocks_free) {
+	if (sectors > min_t(size_t,
+			    c->journal.blocks_free * c->sb.block_size,
+			    PAGE_SECTORS << JSET_BITS)) {
 		trace_bcache_journal_entry_full(c);
 
 		/*
@@ -760,8 +762,8 @@ void bch_journal(struct closure *cl)
 		continue_at(cl, bch_journal, bcache_wq);
 	}
 
-	memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t));
-	w->data->keys += n;
+	memcpy(end(w->data), op->keys.keys, bch_keylist_bytes(&op->keys));
+	w->data->keys += bch_keylist_nkeys(&op->keys);
 
 	op->journal = &fifo_back(&c->journal.pin);
 	atomic_inc(op->journal);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 7fd84ce9e835..a000e918b795 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -438,13 +438,13 @@ static void bch_insert_data_error(struct closure *cl)
 	 * from the keys we'll accomplish just that.
 	 */
 
-	struct bkey *src = op->keys.bottom, *dst = op->keys.bottom;
+	struct bkey *src = op->keys.keys, *dst = op->keys.keys;
 
 	while (src != op->keys.top) {
 		struct bkey *n = bkey_next(src);
 
 		SET_KEY_PTRS(src, 0);
-		bkey_copy(dst, src);
+		memmove(dst, src, bkey_bytes(src));
 
 		dst = bkey_next(dst);
 		src = n;
-- 
cgit v1.2.3
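The reworked keylist layout, reduced to its idea: one buffer with two cursors,
each visible both as a bkey pointer and as a raw u64 pointer via anonymous
unions, so size math is plain pointer subtraction on the u64 view while key
traversal uses the bkey view. A minimal standalone sketch with a stub bkey
(the real struct bkey is larger):

#include <stdint.h>
#include <stddef.h>

struct bkey { uint64_t header; uint64_t ptr[1]; };	/* stub */

struct keylist {
	union { struct bkey *keys; uint64_t *keys_p; };	/* start of list */
	union { struct bkey *top;  uint64_t *top_p;  };	/* one past end */
	uint64_t inline_keys[16];	/* avoids allocation for small lists */
};

static inline size_t keylist_nkeys(struct keylist *l)
{
	return l->top_p - l->keys_p;	/* length in u64 units */
}

static inline size_t keylist_bytes(struct keylist *l)
{
	return keylist_nkeys(l) * sizeof(uint64_t);
}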
insert_keys->keys); ret |= btree_insert_key(b, op, &temp.key); break; @@ -1984,7 +1984,7 @@ static int btree_split(struct btree *b, struct btree_op *op, } else if (!b->parent) { /* Root filled up but didn't need to be split */ - parent_keys->top = parent_keys->bottom; + bch_keylist_reset(parent_keys); closure_sync(&op->cl); bch_btree_set_root(n1); } else { @@ -2118,12 +2118,12 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, if (b->level) { struct bkey *k; - k = bch_next_recurse_key(b, &START_KEY(keys->bottom)); + k = bch_next_recurse_key(b, &START_KEY(keys->keys)); if (!k) { btree_bug(b, "no key to recurse on at level %i/%i", b->level, b->c->root->level); - keys->top = keys->bottom; + bch_keylist_reset(keys); return -EIO; } diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 5abe5d5fc183..1bdefdb1fa71 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -713,7 +713,7 @@ void bch_journal(struct closure *cl) struct btree_op *op = container_of(cl, struct btree_op, cl); struct cache_set *c = op->c; struct journal_write *w; - size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list; + size_t sectors, nkeys; if (op->type != BTREE_INSERT || !CACHE_SYNC(&c->sb)) @@ -741,10 +741,12 @@ void bch_journal(struct closure *cl) } w = c->journal.cur; - b = __set_blocks(w->data, w->data->keys + n, c); + nkeys = w->data->keys + bch_keylist_nkeys(&op->keys); + sectors = __set_blocks(w->data, nkeys, c) * c->sb.block_size; - if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS || - b > c->journal.blocks_free) { + if (sectors > min_t(size_t, + c->journal.blocks_free * c->sb.block_size, + PAGE_SECTORS << JSET_BITS)) { trace_bcache_journal_entry_full(c); /* @@ -760,8 +762,8 @@ void bch_journal(struct closure *cl) continue_at(cl, bch_journal, bcache_wq); } - memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t)); - w->data->keys += n; + memcpy(end(w->data), op->keys.keys, bch_keylist_bytes(&op->keys)); + w->data->keys += bch_keylist_nkeys(&op->keys); op->journal = &fifo_back(&c->journal.pin); atomic_inc(op->journal); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 7fd84ce9e835..a000e918b795 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -438,13 +438,13 @@ static void bch_insert_data_error(struct closure *cl) * from the keys we'll accomplish just that. */ - struct bkey *src = op->keys.bottom, *dst = op->keys.bottom; + struct bkey *src = op->keys.keys, *dst = op->keys.keys; while (src != op->keys.top) { struct bkey *n = bkey_next(src); SET_KEY_PTRS(src, 0); - bkey_copy(dst, src); + memmove(dst, src, bkey_bytes(src)); dst = bkey_next(dst); src = n; -- cgit v1.2.3 From 84f0db03ea1e024f2a9e6cfcf7ac0323e4f84d3a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:24:52 -0700 Subject: bcache: Refactor request_write() Try to improve some of the naming a bit to be more consistent, and also improve the flow of control in request_write() a bit. 
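In rough outline, the rewritten write path becomes a single top-to-bottom if/else chain instead of jumping between the old skip:/out: labels. A condensed sketch, with the REQ_FLUSH handling elided (all names as in the hunk below):

	if (s->op.bypass) {
		/* Invalidate the region; only actually submit a discard
		 * if the backing device supports it */
		s->op.cache_bio = s->orig_bio;
		bio_get(s->op.cache_bio);

		if (!(bio->bi_rw & REQ_DISCARD) ||
		    blk_queue_discard(bdev_get_queue(dc->bdev)))
			closure_bio_submit(bio, cl, s->d);
	} else if (s->writeback) {
		/* Writeback: dirty data goes only to the cache */
		bch_writeback_add(dc);
		s->op.cache_bio = bio;
	} else {
		/* Writethrough: clone the bio for the cache, submit the
		 * original to the backing device */
		s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO,
						   dc->disk.bio_split);
		closure_bio_submit(bio, cl, s->d);
	}

	closure_call(&s->op.cl, bch_insert_data, NULL, cl);
	continue_at(cl, cached_dev_write_complete, NULL);
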
Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.h | 2 +- drivers/md/bcache/request.c | 368 ++++++++++++++++++++++---------------------- 2 files changed, 183 insertions(+), 187 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 967aacd20625..ea0814b51574 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -260,7 +260,7 @@ struct btree_op { } type:8; unsigned csum:1; - unsigned skip:1; + unsigned bypass:1; unsigned flush_journal:1; unsigned insert_data_done:1; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index a000e918b795..dbc2ef6e7a35 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -25,8 +25,6 @@ struct kmem_cache *bch_search_cache; -static void check_should_skip(struct cached_dev *, struct search *); - /* Cgroup interface */ #ifdef CONFIG_CGROUP_BCACHE @@ -480,7 +478,7 @@ static void bch_insert_data_loop(struct closure *cl) struct search *s = container_of(op, struct search, op); struct bio *bio = op->cache_bio, *n; - if (op->skip) + if (op->bypass) return bio_invalidate(cl); if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { @@ -557,7 +555,7 @@ err: * we wait for buckets to be freed up, so just invalidate the * rest of the write. */ - op->skip = true; + op->bypass = true; return bio_invalidate(cl); } else { /* @@ -590,8 +588,8 @@ err: * It inserts the data in op->cache_bio; bi_sector is used for the key offset, * and op->inode is used for the key inode. * - * If op->skip is true, instead of inserting the data it invalidates the region - * of the cache represented by op->cache_bio and op->inode. + * If op->bypass is true, instead of inserting the data it invalidates the + * region of the cache represented by op->cache_bio and op->inode. */ void bch_insert_data(struct closure *cl) { @@ -717,7 +715,6 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d) s->orig_bio = bio; s->write = (bio->bi_rw & REQ_WRITE) != 0; s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; - s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; s->recoverable = 1; s->start_time = jiffies; do_bio_hook(s); @@ -757,6 +754,134 @@ static void cached_dev_bio_complete(struct closure *cl) cached_dev_put(dc); } +unsigned bch_get_congested(struct cache_set *c) +{ + int i; + long rand; + + if (!c->congested_read_threshold_us && + !c->congested_write_threshold_us) + return 0; + + i = (local_clock_us() - c->congested_last_us) / 1024; + if (i < 0) + return 0; + + i += atomic_read(&c->congested); + if (i >= 0) + return 0; + + i += CONGESTED_MAX; + + if (i > 0) + i = fract_exp_two(i, 6); + + rand = get_random_int(); + i -= bitmap_weight(&rand, BITS_PER_LONG); + + return i > 0 ? 
i : 1; +} + +static void add_sequential(struct task_struct *t) +{ + ewma_add(t->sequential_io_avg, + t->sequential_io, 8, 0); + + t->sequential_io = 0; +} + +static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) +{ + return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; +} + +static bool check_should_bypass(struct cached_dev *dc, struct search *s) +{ + struct cache_set *c = s->op.c; + struct bio *bio = &s->bio.bio; + unsigned mode = cache_mode(dc, bio); + unsigned sectors, congested = bch_get_congested(c); + + if (atomic_read(&dc->disk.detaching) || + c->gc_stats.in_use > CUTOFF_CACHE_ADD || + (bio->bi_rw & REQ_DISCARD)) + goto skip; + + if (mode == CACHE_MODE_NONE || + (mode == CACHE_MODE_WRITEAROUND && + (bio->bi_rw & REQ_WRITE))) + goto skip; + + if (bio->bi_sector & (c->sb.block_size - 1) || + bio_sectors(bio) & (c->sb.block_size - 1)) { + pr_debug("skipping unaligned io"); + goto skip; + } + + if (!congested && !dc->sequential_cutoff) + goto rescale; + + if (!congested && + mode == CACHE_MODE_WRITEBACK && + (bio->bi_rw & REQ_WRITE) && + (bio->bi_rw & REQ_SYNC)) + goto rescale; + + if (dc->sequential_merge) { + struct io *i; + + spin_lock(&dc->io_lock); + + hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) + if (i->last == bio->bi_sector && + time_before(jiffies, i->jiffies)) + goto found; + + i = list_first_entry(&dc->io_lru, struct io, lru); + + add_sequential(s->task); + i->sequential = 0; +found: + if (i->sequential + bio->bi_size > i->sequential) + i->sequential += bio->bi_size; + + i->last = bio_end_sector(bio); + i->jiffies = jiffies + msecs_to_jiffies(5000); + s->task->sequential_io = i->sequential; + + hlist_del(&i->hash); + hlist_add_head(&i->hash, iohash(dc, i->last)); + list_move_tail(&i->lru, &dc->io_lru); + + spin_unlock(&dc->io_lock); + } else { + s->task->sequential_io = bio->bi_size; + + add_sequential(s->task); + } + + sectors = max(s->task->sequential_io, + s->task->sequential_io_avg) >> 9; + + if (dc->sequential_cutoff && + sectors >= dc->sequential_cutoff >> 9) { + trace_bcache_bypass_sequential(s->orig_bio); + goto skip; + } + + if (congested && sectors >= congested) { + trace_bcache_bypass_congested(s->orig_bio); + goto skip; + } + +rescale: + bch_rescale_priorities(c, bio_sectors(bio)); + return false; +skip: + bch_mark_sectors_bypassed(s, bio_sectors(bio)); + return true; +} + /* Process reads */ static void cached_dev_read_complete(struct closure *cl) @@ -854,8 +979,8 @@ static void request_read_done_bh(struct closure *cl) struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); - trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip); + bch_mark_cache_accounting(s, !s->cache_miss, s->op.bypass); + trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.bypass); if (s->error) continue_at_nobarrier(cl, request_read_error, bcache_wq); @@ -873,7 +998,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); struct bio *miss; - if (s->cache_miss || s->op.skip) { + if (s->cache_miss || s->op.bypass) { miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); if (miss == bio) s->op.lookup_done = true; @@ -940,9 +1065,7 @@ static void request_read(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; - check_should_skip(dc, s); closure_call(&s->op.cl, btree_read_async, NULL, cl); - continue_at(cl, request_read_done_bh, 
NULL); } @@ -961,41 +1084,48 @@ static void request_write(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; struct bio *bio = &s->bio.bio; - struct bkey start, end; - start = KEY(dc->disk.id, bio->bi_sector, 0); - end = KEY(dc->disk.id, bio_end_sector(bio), 0); + struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0); + struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0); bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); - check_should_skip(dc, s); down_read_non_owner(&dc->writeback_lock); - if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { - s->op.skip = false; + /* + * We overlap with some dirty data undergoing background + * writeback, force this write to writeback + */ + s->op.bypass = false; s->writeback = true; } + /* + * Discards aren't _required_ to do anything, so skipping if + * check_overlapping returned true is ok + * + * But check_overlapping drops dirty keys for which io hasn't started, + * so we still want to call it. + */ if (bio->bi_rw & REQ_DISCARD) - goto skip; + s->op.bypass = true; if (should_writeback(dc, s->orig_bio, cache_mode(dc, bio), - s->op.skip)) { - s->op.skip = false; + s->op.bypass)) { + s->op.bypass = false; s->writeback = true; } - if (s->op.skip) - goto skip; - - trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); + trace_bcache_write(s->orig_bio, s->writeback, s->op.bypass); - if (!s->writeback) { - s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, - dc->disk.bio_split); + if (s->op.bypass) { + s->op.cache_bio = s->orig_bio; + bio_get(s->op.cache_bio); - closure_bio_submit(bio, cl, s->d); - } else { + if (!(bio->bi_rw & REQ_DISCARD) || + blk_queue_discard(bdev_get_queue(dc->bdev))) + closure_bio_submit(bio, cl, s->d); + } else if (s->writeback) { bch_writeback_add(dc); s->op.cache_bio = bio; @@ -1011,21 +1141,15 @@ static void request_write(struct cached_dev *dc, struct search *s) closure_bio_submit(flush, cl, s->d); } + } else { + s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, + dc->disk.bio_split); + + closure_bio_submit(bio, cl, s->d); } -out: + closure_call(&s->op.cl, bch_insert_data, NULL, cl); continue_at(cl, cached_dev_write_complete, NULL); -skip: - s->op.skip = true; - s->op.cache_bio = s->orig_bio; - bio_get(s->op.cache_bio); - - if ((bio->bi_rw & REQ_DISCARD) && - !blk_queue_discard(bdev_get_queue(dc->bdev))) - goto out; - - closure_bio_submit(bio, cl, s->d); - goto out; } static void request_nodata(struct cached_dev *dc, struct search *s) @@ -1033,14 +1157,10 @@ static void request_nodata(struct cached_dev *dc, struct search *s) struct closure *cl = &s->cl; struct bio *bio = &s->bio.bio; - if (bio->bi_rw & REQ_DISCARD) { - request_write(dc, s); - return; - } - if (s->op.flush_journal) bch_journal_meta(s->op.c, cl); + /* If it's a flush, we send the flush to the backing device too */ closure_bio_submit(bio, cl, s->d); continue_at(cl, cached_dev_bio_complete, NULL); @@ -1048,134 +1168,6 @@ static void request_nodata(struct cached_dev *dc, struct search *s) /* Cached devices - read & write stuff */ -unsigned bch_get_congested(struct cache_set *c) -{ - int i; - long rand; - - if (!c->congested_read_threshold_us && - !c->congested_write_threshold_us) - return 0; - - i = (local_clock_us() - c->congested_last_us) / 1024; - if (i < 0) - return 0; - - i += atomic_read(&c->congested); - if (i >= 0) - return 0; - - i += CONGESTED_MAX; - - if (i > 0) - i = fract_exp_two(i, 6); - - rand = get_random_int(); - i -= bitmap_weight(&rand, BITS_PER_LONG); - - return i > 0 ? 
i : 1; -} - -static void add_sequential(struct task_struct *t) -{ - ewma_add(t->sequential_io_avg, - t->sequential_io, 8, 0); - - t->sequential_io = 0; -} - -static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) -{ - return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; -} - -static void check_should_skip(struct cached_dev *dc, struct search *s) -{ - struct cache_set *c = s->op.c; - struct bio *bio = &s->bio.bio; - unsigned mode = cache_mode(dc, bio); - unsigned sectors, congested = bch_get_congested(c); - - if (atomic_read(&dc->disk.detaching) || - c->gc_stats.in_use > CUTOFF_CACHE_ADD || - (bio->bi_rw & REQ_DISCARD)) - goto skip; - - if (mode == CACHE_MODE_NONE || - (mode == CACHE_MODE_WRITEAROUND && - (bio->bi_rw & REQ_WRITE))) - goto skip; - - if (bio->bi_sector & (c->sb.block_size - 1) || - bio_sectors(bio) & (c->sb.block_size - 1)) { - pr_debug("skipping unaligned io"); - goto skip; - } - - if (!congested && !dc->sequential_cutoff) - goto rescale; - - if (!congested && - mode == CACHE_MODE_WRITEBACK && - (bio->bi_rw & REQ_WRITE) && - (bio->bi_rw & REQ_SYNC)) - goto rescale; - - if (dc->sequential_merge) { - struct io *i; - - spin_lock(&dc->io_lock); - - hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) - if (i->last == bio->bi_sector && - time_before(jiffies, i->jiffies)) - goto found; - - i = list_first_entry(&dc->io_lru, struct io, lru); - - add_sequential(s->task); - i->sequential = 0; -found: - if (i->sequential + bio->bi_size > i->sequential) - i->sequential += bio->bi_size; - - i->last = bio_end_sector(bio); - i->jiffies = jiffies + msecs_to_jiffies(5000); - s->task->sequential_io = i->sequential; - - hlist_del(&i->hash); - hlist_add_head(&i->hash, iohash(dc, i->last)); - list_move_tail(&i->lru, &dc->io_lru); - - spin_unlock(&dc->io_lock); - } else { - s->task->sequential_io = bio->bi_size; - - add_sequential(s->task); - } - - sectors = max(s->task->sequential_io, - s->task->sequential_io_avg) >> 9; - - if (dc->sequential_cutoff && - sectors >= dc->sequential_cutoff >> 9) { - trace_bcache_bypass_sequential(s->orig_bio); - goto skip; - } - - if (congested && sectors >= congested) { - trace_bcache_bypass_congested(s->orig_bio); - goto skip; - } - -rescale: - bch_rescale_priorities(c, bio_sectors(bio)); - return; -skip: - bch_mark_sectors_bypassed(s, bio_sectors(bio)); - s->op.skip = true; -} - static void cached_dev_make_request(struct request_queue *q, struct bio *bio) { struct search *s; @@ -1195,12 +1187,16 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio) s = search_alloc(bio, d); trace_bcache_request_start(s, bio); - if (!bio_has_data(bio)) + if (!bio->bi_size) request_nodata(dc, s); - else if (rw) - request_write(dc, s); - else - request_read(dc, s); + else { + s->op.bypass = check_should_bypass(dc, s); + + if (rw) + request_write(dc, s); + else + request_read(dc, s); + } } else { if ((bio->bi_rw & REQ_DISCARD) && !blk_queue_discard(bdev_get_queue(dc->bdev))) @@ -1298,21 +1294,21 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) trace_bcache_request_start(s, bio); - if (bio_has_data(bio) && !rw) { - closure_call(&s->op.cl, btree_read_async, NULL, cl); - } else if (bio_has_data(bio) || s->op.skip) { + if (!bio->bi_size) { + if (s->op.flush_journal) + bch_journal_meta(s->op.c, cl); + } else if (rw) { bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &KEY(d->id, bio->bi_sector, 0), &KEY(d->id, bio_end_sector(bio), 0)); + s->op.bypass = (bio->bi_rw & REQ_DISCARD) != 0; s->writeback = true; 
s->op.cache_bio = bio; closure_call(&s->op.cl, bch_insert_data, NULL, cl); } else { - /* No data - probably a cache flush */ - if (s->op.flush_journal) - bch_journal_meta(s->op.c, cl); + closure_call(&s->op.cl, btree_read_async, NULL, cl); } continue_at(cl, search_free, NULL); -- cgit v1.2.3 From cdd972b164be8fc69f6ee8533c5a07b621da74c7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 17:06:17 -0700 Subject: bcache: Refactor read request code a bit More refactoring, and renaming. Signed-off-by: Kent Overstreet --- drivers/md/bcache/request.c | 71 ++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 36 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index dbc2ef6e7a35..3b85f33ae4c7 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -884,7 +884,7 @@ skip: /* Process reads */ -static void cached_dev_read_complete(struct closure *cl) +static void cached_dev_cache_miss_done(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); @@ -902,9 +902,10 @@ static void cached_dev_read_complete(struct closure *cl) cached_dev_bio_complete(cl); } -static void request_read_error(struct closure *cl) +static void cached_dev_read_error(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); + struct bio *bio = &s->bio.bio; struct bio_vec *bv; int i; @@ -928,20 +929,20 @@ static void request_read_error(struct closure *cl) /* XXX: invalidate cache */ - closure_bio_submit(&s->bio.bio, &s->cl, s->d); + closure_bio_submit(bio, cl, s->d); } - continue_at(cl, cached_dev_read_complete, NULL); + continue_at(cl, cached_dev_cache_miss_done, NULL); } -static void request_read_done(struct closure *cl) +static void cached_dev_read_done(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); /* - * s->cache_bio != NULL implies that we had a cache miss; cache_bio now - * contains data ready to be inserted into the cache. + * We had a cache miss; cache_bio now contains data ready to be inserted + * into the cache. 
* * First, we copy the data we just read from cache_bio's bounce buffers * to the buffers the original bio pointed to: @@ -971,10 +972,10 @@ static void request_read_done(struct closure *cl) closure_call(&s->op.cl, bch_insert_data, NULL, cl); } - continue_at(cl, cached_dev_read_complete, NULL); + continue_at(cl, cached_dev_cache_miss_done, NULL); } -static void request_read_done_bh(struct closure *cl) +static void cached_dev_read_done_bh(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); @@ -983,11 +984,11 @@ static void request_read_done_bh(struct closure *cl) trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.bypass); if (s->error) - continue_at_nobarrier(cl, request_read_error, bcache_wq); + continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); else if (s->op.cache_bio || verify(dc, &s->bio.bio)) - continue_at_nobarrier(cl, request_read_done, bcache_wq); + continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); else - continue_at_nobarrier(cl, cached_dev_read_complete, NULL); + continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); } static int cached_dev_cache_miss(struct btree *b, struct search *s, @@ -996,7 +997,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, int ret = 0; unsigned reada = 0; struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - struct bio *miss; + struct bio *miss, *cache_bio; if (s->cache_miss || s->op.bypass) { miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); @@ -1027,33 +1028,31 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, /* btree_search_recurse()'s btree iterator is no good anymore */ ret = -EINTR; - s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT, + cache_bio = bio_alloc_bioset(GFP_NOWAIT, DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), dc->disk.bio_split); - - if (!s->op.cache_bio) + if (!cache_bio) goto out_submit; - s->op.cache_bio->bi_sector = miss->bi_sector; - s->op.cache_bio->bi_bdev = miss->bi_bdev; - s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; + cache_bio->bi_sector = miss->bi_sector; + cache_bio->bi_bdev = miss->bi_bdev; + cache_bio->bi_size = s->cache_bio_sectors << 9; - s->op.cache_bio->bi_end_io = request_endio; - s->op.cache_bio->bi_private = &s->cl; + cache_bio->bi_end_io = request_endio; + cache_bio->bi_private = &s->cl; - bch_bio_map(s->op.cache_bio, NULL); - if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) + bch_bio_map(cache_bio, NULL); + if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) goto out_put; - s->cache_miss = miss; - bio_get(s->op.cache_bio); - - closure_bio_submit(s->op.cache_bio, &s->cl, s->d); + s->cache_miss = miss; + s->op.cache_bio = cache_bio; + bio_get(cache_bio); + closure_bio_submit(cache_bio, &s->cl, s->d); return ret; out_put: - bio_put(s->op.cache_bio); - s->op.cache_bio = NULL; + bio_put(cache_bio); out_submit: miss->bi_end_io = request_endio; miss->bi_private = &s->cl; @@ -1061,12 +1060,12 @@ out_submit: return ret; } -static void request_read(struct cached_dev *dc, struct search *s) +static void cached_dev_read(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; closure_call(&s->op.cl, btree_read_async, NULL, cl); - continue_at(cl, request_read_done_bh, NULL); + continue_at(cl, cached_dev_read_done_bh, NULL); } /* Process writes */ @@ -1080,7 +1079,7 @@ static void cached_dev_write_complete(struct closure *cl) cached_dev_bio_complete(cl); } -static void request_write(struct 
cached_dev *dc, struct search *s) +static void cached_dev_write(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; struct bio *bio = &s->bio.bio; @@ -1152,7 +1151,7 @@ static void request_write(struct cached_dev *dc, struct search *s) continue_at(cl, cached_dev_write_complete, NULL); } -static void request_nodata(struct cached_dev *dc, struct search *s) +static void cached_dev_nodata(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; struct bio *bio = &s->bio.bio; @@ -1188,14 +1187,14 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio) trace_bcache_request_start(s, bio); if (!bio->bi_size) - request_nodata(dc, s); + cached_dev_nodata(dc, s); else { s->op.bypass = check_should_bypass(dc, s); if (rw) - request_write(dc, s); + cached_dev_write(dc, s); else - request_read(dc, s); + cached_dev_read(dc, s); } } else { if ((bio->bi_rw & REQ_DISCARD) && -- cgit v1.2.3 From a34a8bfd4e6358c646928320d37b0425c0762f8a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Oct 2013 17:07:04 -0700 Subject: bcache: Refactor journalling flow control Making things less asynchronous that don't need to be - bch_journal() only has to block when the journal or journal entry is full, which is emphatically not a fast path. So make it a normal function that just returns when it finishes, to make the code and control flow easier to follow. Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 3 - drivers/md/bcache/closure.h | 2 +- drivers/md/bcache/journal.c | 213 ++++++++++++++++++++----------------------- drivers/md/bcache/journal.h | 3 +- drivers/md/bcache/movinggc.c | 2 +- drivers/md/bcache/request.c | 160 ++++++++++++++++++++------------ drivers/md/bcache/request.h | 3 +- 7 files changed, 207 insertions(+), 179 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index f960607f1f25..777c01d67ef0 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2164,9 +2164,6 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c, } } - if (op->journal) - atomic_dec_bug(op->journal); - op->journal = NULL; return ret; } diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 00039924ea9d..ab011f03801f 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -642,7 +642,7 @@ do { \ #define continue_at_nobarrier(_cl, _fn, _wq) \ do { \ set_closure_fn(_cl, _fn, _wq); \ - closure_queue(cl); \ + closure_queue(_cl); \ return; \ } while (0) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 1bdefdb1fa71..940e89e0d706 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -318,7 +318,6 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, bch_keylist_push(&op->keys); op->journal = i->pin; - atomic_inc(op->journal); ret = bch_btree_insert(op, s, &op->keys); if (ret) @@ -357,48 +356,35 @@ static void btree_flush_write(struct cache_set *c) * Try to find the btree node with that references the oldest journal * entry, best is our current candidate and is locked if non NULL: */ - struct btree *b, *best = NULL; - unsigned iter; + struct btree *b, *best; + unsigned i; +retry: + best = NULL; + + for_each_cached_btree(b, c, i) + if (btree_current_write(b)->journal) { + if (!best) + best = b; + else if (journal_pin_cmp(c, + btree_current_write(best), + btree_current_write(b))) { + best = b; + } + } - for_each_cached_btree(b, c, iter) { - if (!down_write_trylock(&b->lock)) 
- continue; + b = best; + if (b) { + rw_lock(true, b, b->level); - if (!btree_node_dirty(b) || - !btree_current_write(b)->journal) { + if (!btree_current_write(b)->journal) { rw_unlock(true, b); - continue; + /* We raced */ + goto retry; } - if (!best) - best = b; - else if (journal_pin_cmp(c, - btree_current_write(best), - btree_current_write(b))) { - rw_unlock(true, best); - best = b; - } else - rw_unlock(true, b); + bch_btree_node_write(b, NULL); + rw_unlock(true, b); } - - if (best) - goto out; - - /* We can't find the best btree node, just pick the first */ - list_for_each_entry(b, &c->btree_cache, list) - if (!b->level && btree_node_dirty(b)) { - best = b; - rw_lock(true, best, best->level); - goto found; - } - -out: - if (!best) - return; -found: - if (btree_node_dirty(best)) - bch_btree_node_write(best, NULL); - rw_unlock(true, best); } #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) @@ -494,7 +480,7 @@ static void journal_reclaim(struct cache_set *c) do_journal_discard(ca); if (c->journal.blocks_free) - return; + goto out; /* * Allocate: @@ -520,7 +506,7 @@ static void journal_reclaim(struct cache_set *c) if (n) c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; - +out: if (!journal_full(&c->journal)) __closure_wake_up(&c->journal.wait); } @@ -659,7 +645,7 @@ static void journal_write(struct closure *cl) journal_write_unlocked(cl); } -static void __journal_try_write(struct cache_set *c, bool noflush) +static void journal_try_write(struct cache_set *c) __releases(c->journal.lock) { struct closure *cl = &c->journal.io; @@ -667,29 +653,59 @@ static void __journal_try_write(struct cache_set *c, bool noflush) w->need_write = true; - if (!closure_trylock(cl, &c->cl)) - spin_unlock(&c->journal.lock); - else if (noflush && journal_full(&c->journal)) { - spin_unlock(&c->journal.lock); - continue_at(cl, journal_write, system_wq); - } else + if (closure_trylock(cl, &c->cl)) journal_write_unlocked(cl); + else + spin_unlock(&c->journal.lock); } -#define journal_try_write(c) __journal_try_write(c, false) - -void bch_journal_meta(struct cache_set *c, struct closure *cl) +static struct journal_write *journal_wait_for_write(struct cache_set *c, + unsigned nkeys) { - struct journal_write *w; + size_t sectors; + struct closure cl; - if (CACHE_SYNC(&c->sb)) { - spin_lock(&c->journal.lock); - w = c->journal.cur; + closure_init_stack(&cl); + + spin_lock(&c->journal.lock); + + while (1) { + struct journal_write *w = c->journal.cur; + + sectors = __set_blocks(w->data, w->data->keys + nkeys, + c) * c->sb.block_size; + + if (sectors <= min_t(size_t, + c->journal.blocks_free * c->sb.block_size, + PAGE_SECTORS << JSET_BITS)) + return w; + + /* XXX: tracepoint */ + if (!journal_full(&c->journal)) { + trace_bcache_journal_entry_full(c); + + /* + * XXX: If we were inserting so many keys that they + * won't fit in an _empty_ journal write, we'll + * deadlock. For now, handle this in + * bch_keylist_realloc() - but something to think about. 
+ */ + BUG_ON(!w->data->keys); + + closure_wait(&w->wait, &cl); + journal_try_write(c); /* unlocks */ + } else { + trace_bcache_journal_full(c); + + closure_wait(&c->journal.wait, &cl); + journal_reclaim(c); + spin_unlock(&c->journal.lock); - if (cl) - BUG_ON(!closure_wait(&w->wait, cl)); + btree_flush_write(c); + } - __journal_try_write(c, true); + closure_sync(&cl); + spin_lock(&c->journal.lock); } } @@ -708,68 +724,26 @@ static void journal_write_work(struct work_struct *work) * bch_journal() hands those same keys off to btree_insert_async() */ -void bch_journal(struct closure *cl) +atomic_t *bch_journal(struct cache_set *c, + struct keylist *keys, + struct closure *parent) { - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct cache_set *c = op->c; struct journal_write *w; - size_t sectors, nkeys; - - if (op->type != BTREE_INSERT || - !CACHE_SYNC(&c->sb)) - goto out; - - /* - * If we're looping because we errored, might already be waiting on - * another journal write: - */ - while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING) - closure_sync(cl->parent); - - spin_lock(&c->journal.lock); - - if (journal_full(&c->journal)) { - trace_bcache_journal_full(c); - - closure_wait(&c->journal.wait, cl); - - journal_reclaim(c); - spin_unlock(&c->journal.lock); - - btree_flush_write(c); - continue_at(cl, bch_journal, bcache_wq); - } + atomic_t *ret; - w = c->journal.cur; - nkeys = w->data->keys + bch_keylist_nkeys(&op->keys); - sectors = __set_blocks(w->data, nkeys, c) * c->sb.block_size; + if (!CACHE_SYNC(&c->sb)) + return NULL; - if (sectors > min_t(size_t, - c->journal.blocks_free * c->sb.block_size, - PAGE_SECTORS << JSET_BITS)) { - trace_bcache_journal_entry_full(c); + w = journal_wait_for_write(c, bch_keylist_nkeys(keys)); - /* - * XXX: If we were inserting so many keys that they won't fit in - * an _empty_ journal write, we'll deadlock. For now, handle - * this in bch_keylist_realloc() - but something to think about. 
- */ - BUG_ON(!w->data->keys); + memcpy(end(w->data), keys->keys, bch_keylist_bytes(keys)); + w->data->keys += bch_keylist_nkeys(keys); - BUG_ON(!closure_wait(&w->wait, cl)); + ret = &fifo_back(&c->journal.pin); + atomic_inc(ret); - journal_try_write(c); - continue_at(cl, bch_journal, bcache_wq); - } - - memcpy(end(w->data), op->keys.keys, bch_keylist_bytes(&op->keys)); - w->data->keys += bch_keylist_nkeys(&op->keys); - - op->journal = &fifo_back(&c->journal.pin); - atomic_inc(op->journal); - - if (op->flush_journal) { - closure_wait(&w->wait, cl->parent); + if (parent) { + closure_wait(&w->wait, parent); journal_try_write(c); } else if (!w->need_write) { schedule_delayed_work(&c->journal.work, @@ -778,8 +752,21 @@ void bch_journal(struct closure *cl) } else { spin_unlock(&c->journal.lock); } -out: - bch_btree_insert_async(cl); + + + return ret; +} + +void bch_journal_meta(struct cache_set *c, struct closure *cl) +{ + struct keylist keys; + atomic_t *ref; + + bch_keylist_init(&keys); + + ref = bch_journal(c, &keys, cl); + if (ref) + atomic_dec_bug(ref); } void bch_journal_free(struct cache_set *c) diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index 3ca93d3d566c..7045e6fd2d5a 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -200,8 +200,9 @@ struct journal_device { struct closure; struct cache_set; struct btree_op; +struct keylist; -void bch_journal(struct closure *); +atomic_t *bch_journal(struct cache_set *, struct keylist *, struct closure *); void bch_journal_next(struct journal *); void bch_journal_mark(struct cache_set *, struct list_head *); void bch_journal_meta(struct cache_set *, struct closure *); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index dd8a035c5ae1..2c42377a65aa 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -110,7 +110,7 @@ static void write_moving(struct closure *cl) bkey_copy(&s->op.replace, &io->w->key); closure_init(&s->op.cl, cl); - bch_insert_data(&s->op.cl); + bch_data_insert(&s->op.cl); } continue_at(cl, write_moving_finish, bch_gc_wq); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 3b85f33ae4c7..1c3af44b097b 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -25,6 +25,8 @@ struct kmem_cache *bch_search_cache; +static void bch_data_insert_start(struct closure *); + /* Cgroup interface */ #ifdef CONFIG_CGROUP_BCACHE @@ -211,31 +213,42 @@ static void bio_csum(struct bio *bio, struct bkey *k) /* Insert data into cache */ -static void bio_invalidate(struct closure *cl) +static void bch_data_insert_keys(struct closure *cl) { struct btree_op *op = container_of(cl, struct btree_op, cl); - struct bio *bio = op->cache_bio; + struct search *s = container_of(op, struct search, op); - pr_debug("invalidating %i sectors from %llu", - bio_sectors(bio), (uint64_t) bio->bi_sector); + /* + * If we're looping, might already be waiting on + * another journal write - can't wait on more than one journal write at + * a time + * + * XXX: this looks wrong + */ +#if 0 + while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING) + closure_sync(&s->cl); +#endif - while (bio_sectors(bio)) { - unsigned len = min(bio_sectors(bio), 1U << 14); + if (s->write) + op->journal = bch_journal(op->c, &op->keys, + op->flush_journal + ? 
&s->cl : NULL); - if (bch_keylist_realloc(&op->keys, 0, op->c)) - goto out; + if (bch_btree_insert(op, op->c, &op->keys)) { + s->error = -ENOMEM; + op->insert_data_done = true; + } - bio->bi_sector += len; - bio->bi_size -= len << 9; + if (op->journal) + atomic_dec_bug(op->journal); + op->journal = NULL; - bch_keylist_add(&op->keys, - &KEY(op->inode, bio->bi_sector, len)); - } + if (!op->insert_data_done) + continue_at(cl, bch_data_insert_start, bcache_wq); - op->insert_data_done = true; - bio_put(bio); -out: - continue_at(cl, bch_journal, bcache_wq); + bch_keylist_free(&op->keys); + closure_return(cl); } struct open_bucket { @@ -423,7 +436,34 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, return true; } -static void bch_insert_data_error(struct closure *cl) +static void bch_data_invalidate(struct closure *cl) +{ + struct btree_op *op = container_of(cl, struct btree_op, cl); + struct bio *bio = op->cache_bio; + + pr_debug("invalidating %i sectors from %llu", + bio_sectors(bio), (uint64_t) bio->bi_sector); + + while (bio_sectors(bio)) { + unsigned len = min(bio_sectors(bio), 1U << 14); + + if (bch_keylist_realloc(&op->keys, 0, op->c)) + goto out; + + bio->bi_sector += len; + bio->bi_size -= len << 9; + + bch_keylist_add(&op->keys, &KEY(op->inode, + bio->bi_sector, len)); + } + + op->insert_data_done = true; + bio_put(bio); +out: + continue_at(cl, bch_data_insert_keys, bcache_wq); +} + +static void bch_data_insert_error(struct closure *cl) { struct btree_op *op = container_of(cl, struct btree_op, cl); @@ -450,10 +490,10 @@ static void bch_insert_data_error(struct closure *cl) op->keys.top = dst; - bch_journal(cl); + bch_data_insert_keys(cl); } -static void bch_insert_data_endio(struct bio *bio, int error) +static void bch_data_insert_endio(struct bio *bio, int error) { struct closure *cl = bio->bi_private; struct btree_op *op = container_of(cl, struct btree_op, cl); @@ -464,7 +504,7 @@ static void bch_insert_data_endio(struct bio *bio, int error) if (s->writeback) s->error = error; else if (s->write) - set_closure_fn(cl, bch_insert_data_error, bcache_wq); + set_closure_fn(cl, bch_data_insert_error, bcache_wq); else set_closure_fn(cl, NULL, NULL); } @@ -472,14 +512,14 @@ static void bch_insert_data_endio(struct bio *bio, int error) bch_bbio_endio(op->c, bio, error, "writing data to cache"); } -static void bch_insert_data_loop(struct closure *cl) +static void bch_data_insert_start(struct closure *cl) { struct btree_op *op = container_of(cl, struct btree_op, cl); struct search *s = container_of(op, struct search, op); struct bio *bio = op->cache_bio, *n; if (op->bypass) - return bio_invalidate(cl); + return bch_data_invalidate(cl); if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { set_gc_sectors(op->c); @@ -502,7 +542,7 @@ static void bch_insert_data_loop(struct closure *cl) if (bch_keylist_realloc(&op->keys, 1 + (op->csum ? 
1 : 0), op->c)) - continue_at(cl, bch_journal, bcache_wq); + continue_at(cl, bch_data_insert_keys, bcache_wq); k = op->keys.top; bkey_init(k); @@ -514,7 +554,7 @@ static void bch_insert_data_loop(struct closure *cl) n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); - n->bi_end_io = bch_insert_data_endio; + n->bi_end_io = bch_data_insert_endio; n->bi_private = cl; if (s->writeback) { @@ -537,7 +577,7 @@ static void bch_insert_data_loop(struct closure *cl) } while (n != bio); op->insert_data_done = true; - continue_at(cl, bch_journal, bcache_wq); + continue_at(cl, bch_data_insert_keys, bcache_wq); err: /* bch_alloc_sectors() blocks if s->writeback = true */ BUG_ON(s->writeback); @@ -556,7 +596,7 @@ err: * rest of the write. */ op->bypass = true; - return bio_invalidate(cl); + return bch_data_invalidate(cl); } else { /* * From a cache miss, we can just insert the keys for the data @@ -566,14 +606,14 @@ err: bio_put(bio); if (!bch_keylist_empty(&op->keys)) - continue_at(cl, bch_journal, bcache_wq); + continue_at(cl, bch_data_insert_keys, bcache_wq); else closure_return(cl); } } /** - * bch_insert_data - stick some data in the cache + * bch_data_insert - stick some data in the cache * * This is the starting point for any data to end up in a cache device; it could * be from a normal write, or a writeback write, or a write to a flash only @@ -591,30 +631,13 @@ err: * If op->bypass is true, instead of inserting the data it invalidates the * region of the cache represented by op->cache_bio and op->inode. */ -void bch_insert_data(struct closure *cl) +void bch_data_insert(struct closure *cl) { struct btree_op *op = container_of(cl, struct btree_op, cl); bch_keylist_init(&op->keys); bio_get(op->cache_bio); - bch_insert_data_loop(cl); -} - -void bch_btree_insert_async(struct closure *cl) -{ - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); - - if (bch_btree_insert(op, op->c, &op->keys)) { - s->error = -ENOMEM; - op->insert_data_done = true; - } - - if (op->insert_data_done) { - bch_keylist_free(&op->keys); - closure_return(cl); - } else - continue_at(cl, bch_insert_data_loop, bcache_wq); + bch_data_insert_start(cl); } /* Common code for the make_request functions */ @@ -969,7 +992,7 @@ static void cached_dev_read_done(struct closure *cl) if (s->op.cache_bio && !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) { s->op.type = BTREE_REPLACE; - closure_call(&s->op.cl, bch_insert_data, NULL, cl); + closure_call(&s->op.cl, bch_data_insert, NULL, cl); } continue_at(cl, cached_dev_cache_miss_done, NULL); @@ -1147,13 +1170,13 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) closure_bio_submit(bio, cl, s->d); } - closure_call(&s->op.cl, bch_insert_data, NULL, cl); + closure_call(&s->op.cl, bch_data_insert, NULL, cl); continue_at(cl, cached_dev_write_complete, NULL); } -static void cached_dev_nodata(struct cached_dev *dc, struct search *s) +static void cached_dev_nodata(struct closure *cl) { - struct closure *cl = &s->cl; + struct search *s = container_of(cl, struct search, cl); struct bio *bio = &s->bio.bio; if (s->op.flush_journal) @@ -1186,9 +1209,15 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio) s = search_alloc(bio, d); trace_bcache_request_start(s, bio); - if (!bio->bi_size) - cached_dev_nodata(dc, s); - else { + if (!bio->bi_size) { + /* + * can't call bch_journal_meta from under + * generic_make_request + */ + continue_at_nobarrier(&s->cl, + cached_dev_nodata, + bcache_wq); + 
} else { s->op.bypass = check_should_bypass(dc, s); if (rw) @@ -1275,6 +1304,16 @@ static int flash_dev_cache_miss(struct btree *b, struct search *s, return 0; } +static void flash_dev_nodata(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + + if (s->op.flush_journal) + bch_journal_meta(s->op.c, cl); + + continue_at(cl, search_free, NULL); +} + static void flash_dev_make_request(struct request_queue *q, struct bio *bio) { struct search *s; @@ -1294,8 +1333,13 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) trace_bcache_request_start(s, bio); if (!bio->bi_size) { - if (s->op.flush_journal) - bch_journal_meta(s->op.c, cl); + /* + * can't call bch_journal_meta from under + * generic_make_request + */ + continue_at_nobarrier(&s->cl, + flash_dev_nodata, + bcache_wq); } else if (rw) { bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &KEY(d->id, bio->bi_sector, 0), @@ -1305,7 +1349,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) s->writeback = true; s->op.cache_bio = bio; - closure_call(&s->op.cl, bch_insert_data, NULL, cl); + closure_call(&s->op.cl, bch_data_insert, NULL, cl); } else { closure_call(&s->op.cl, btree_read_async, NULL, cl); } diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 57dc4784f4f4..1f1b59d38db5 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -31,8 +31,7 @@ struct search { void bch_cache_read_endio(struct bio *, int); unsigned bch_get_congested(struct cache_set *); -void bch_insert_data(struct closure *cl); -void bch_btree_insert_async(struct closure *); +void bch_data_insert(struct closure *cl); void bch_cache_read_endio(struct bio *, int); void bch_open_buckets_free(struct cache_set *); -- cgit v1.2.3 From 0b93207abb40d3c42bb83eba1e1e7edc1da77810 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:26:51 -0700 Subject: bcache: Move keylist out of btree_op Slowly working on pruning struct btree_op - the aim is for it to only contain things that are actually necessary for traversing the btree. 
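With this change the insert keys live in struct search, as insert_keys, past the point that search_alloc() zeroes; code paths that insert keys without a struct search just build a keylist on the stack. A minimal sketch of the new calling convention, modelled on the write_dirty_finish() hunk below (all names as in the patch):

	struct btree_op op;
	struct keylist keys;

	bch_btree_op_init_stack(&op);
	bch_keylist_init(&keys);

	/* queue up the key(s) to insert, then do the insert */
	bch_keylist_add(&keys, &w->key);
	bch_btree_insert(&op, dc->disk.c, &keys);
	closure_sync(&op.cl);
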
Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 1 - drivers/md/bcache/btree.h | 4 ---- drivers/md/bcache/journal.c | 11 +++++++---- drivers/md/bcache/request.c | 37 +++++++++++++++++++++---------------- drivers/md/bcache/request.h | 4 +++- drivers/md/bcache/writeback.c | 7 +++++-- 6 files changed, 36 insertions(+), 28 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 777c01d67ef0..731cd8e3fe90 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -113,7 +113,6 @@ void bch_btree_op_init_stack(struct btree_op *op) memset(op, 0, sizeof(struct btree_op)); closure_init_stack(&op->cl); op->lock = -1; - bch_keylist_init(&op->keys); } /* Btree key manipulation */ diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index ea0814b51574..17b7a4e39c7e 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -267,10 +267,6 @@ struct btree_op { unsigned lookup_done:1; unsigned insert_collision:1; - /* Anything after this point won't get zeroed in do_bio_hook() */ - - /* Keys to be inserted */ - struct keylist keys; BKEY_PADDED(replace); }; diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 940e89e0d706..8866f8ee3a07 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -301,6 +301,9 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, list_entry(list->prev, struct journal_replay, list); uint64_t start = i->j.last_seq, end = i->j.seq, n = start; + struct keylist keylist; + + bch_keylist_init(&keylist); list_for_each_entry(i, list, list) { BUG_ON(i->pin && atomic_read(i->pin) != 1); @@ -314,16 +317,16 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, k = bkey_next(k)) { trace_bcache_journal_replay_key(k); - bkey_copy(op->keys.top, k); - bch_keylist_push(&op->keys); + bkey_copy(keylist.top, k); + bch_keylist_push(&keylist); op->journal = i->pin; - ret = bch_btree_insert(op, s, &op->keys); + ret = bch_btree_insert(op, s, &keylist); if (ret) goto err; - BUG_ON(!bch_keylist_empty(&op->keys)); + BUG_ON(!bch_keylist_empty(&keylist)); keys++; cond_resched(); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 1c3af44b097b..d85c7001df61 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -231,11 +231,11 @@ static void bch_data_insert_keys(struct closure *cl) #endif if (s->write) - op->journal = bch_journal(op->c, &op->keys, + op->journal = bch_journal(op->c, &s->insert_keys, op->flush_journal ? 
&s->cl : NULL); - if (bch_btree_insert(op, op->c, &op->keys)) { + if (bch_btree_insert(op, op->c, &s->insert_keys)) { s->error = -ENOMEM; op->insert_data_done = true; } @@ -247,7 +247,7 @@ static void bch_data_insert_keys(struct closure *cl) if (!op->insert_data_done) continue_at(cl, bch_data_insert_start, bcache_wq); - bch_keylist_free(&op->keys); + bch_keylist_free(&s->insert_keys); closure_return(cl); } @@ -439,6 +439,7 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, static void bch_data_invalidate(struct closure *cl) { struct btree_op *op = container_of(cl, struct btree_op, cl); + struct search *s = container_of(op, struct search, op); struct bio *bio = op->cache_bio; pr_debug("invalidating %i sectors from %llu", @@ -447,14 +448,14 @@ static void bch_data_invalidate(struct closure *cl) while (bio_sectors(bio)) { unsigned len = min(bio_sectors(bio), 1U << 14); - if (bch_keylist_realloc(&op->keys, 0, op->c)) + if (bch_keylist_realloc(&s->insert_keys, 0, op->c)) goto out; bio->bi_sector += len; bio->bi_size -= len << 9; - bch_keylist_add(&op->keys, &KEY(op->inode, - bio->bi_sector, len)); + bch_keylist_add(&s->insert_keys, + &KEY(op->inode, bio->bi_sector, len)); } op->insert_data_done = true; @@ -466,6 +467,7 @@ out: static void bch_data_insert_error(struct closure *cl) { struct btree_op *op = container_of(cl, struct btree_op, cl); + struct search *s = container_of(op, struct search, op); /* * Our data write just errored, which means we've got a bunch of keys to @@ -476,9 +478,9 @@ static void bch_data_insert_error(struct closure *cl) * from the keys we'll accomplish just that. */ - struct bkey *src = op->keys.keys, *dst = op->keys.keys; + struct bkey *src = s->insert_keys.keys, *dst = s->insert_keys.keys; - while (src != op->keys.top) { + while (src != s->insert_keys.top) { struct bkey *n = bkey_next(src); SET_KEY_PTRS(src, 0); @@ -488,7 +490,7 @@ static void bch_data_insert_error(struct closure *cl) src = n; } - op->keys.top = dst; + s->insert_keys.top = dst; bch_data_insert_keys(cl); } @@ -539,12 +541,12 @@ static void bch_data_insert_start(struct closure *cl) ? s->d->bio_split : op->c->bio_split; /* 1 for the device pointer and 1 for the chksum */ - if (bch_keylist_realloc(&op->keys, + if (bch_keylist_realloc(&s->insert_keys, 1 + (op->csum ? 
1 : 0), op->c)) continue_at(cl, bch_data_insert_keys, bcache_wq); - k = op->keys.top; + k = s->insert_keys.top; bkey_init(k); SET_KEY_INODE(k, op->inode); SET_KEY_OFFSET(k, bio->bi_sector); @@ -570,7 +572,7 @@ static void bch_data_insert_start(struct closure *cl) bio_csum(n, k); trace_bcache_cache_insert(k); - bch_keylist_push(&op->keys); + bch_keylist_push(&s->insert_keys); n->bi_rw |= REQ_WRITE; bch_submit_bbio(n, op->c, k, 0); @@ -605,7 +607,7 @@ err: op->insert_data_done = true; bio_put(bio); - if (!bch_keylist_empty(&op->keys)) + if (!bch_keylist_empty(&s->insert_keys)) continue_at(cl, bch_data_insert_keys, bcache_wq); else closure_return(cl); @@ -634,8 +636,9 @@ err: void bch_data_insert(struct closure *cl) { struct btree_op *op = container_of(cl, struct btree_op, cl); + struct search *s = container_of(op, struct search, op); - bch_keylist_init(&op->keys); + bch_keylist_init(&s->insert_keys); bio_get(op->cache_bio); bch_data_insert_start(cl); } @@ -724,9 +727,11 @@ static void search_free(struct closure *cl) static struct search *search_alloc(struct bio *bio, struct bcache_device *d) { + struct search *s; struct bio_vec *bv; - struct search *s = mempool_alloc(d->c->search, GFP_NOIO); - memset(s, 0, offsetof(struct search, op.keys)); + + s = mempool_alloc(d->c->search, GFP_NOIO); + memset(s, 0, offsetof(struct search, insert_keys)); __closure_init(&s->cl, NULL); diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 1f1b59d38db5..7d02ac5f936e 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -25,8 +25,10 @@ struct search { short error; unsigned long start_time; - /* Anything past op->keys won't get zeroed in do_bio_hook */ struct btree_op op; + + /* Anything past this point won't get zeroed in search_alloc() */ + struct keylist insert_keys; }; void bch_cache_read_endio(struct bio *, int); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 8ffc8ec7231d..51dc709c9bf7 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -300,18 +300,21 @@ static void write_dirty_finish(struct closure *cl) if (KEY_DIRTY(&w->key)) { unsigned i; struct btree_op op; + struct keylist keys; + bch_btree_op_init_stack(&op); + bch_keylist_init(&keys); op.type = BTREE_REPLACE; bkey_copy(&op.replace, &w->key); SET_KEY_DIRTY(&w->key, false); - bch_keylist_add(&op.keys, &w->key); + bch_keylist_add(&keys, &w->key); for (i = 0; i < KEY_PTRS(&w->key); i++) atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); - bch_btree_insert(&op, dc->disk.c, &op.keys); + bch_btree_insert(&op, dc->disk.c, &keys); closure_sync(&op.cl); if (op.insert_collision) -- cgit v1.2.3 From e8e1d4682c8cb06dbcb5ef7bb851bf9bcb889c84 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:27:07 -0700 Subject: bcache: Convert try_wait to wait_queue_head_t We never waited on c->try_wait asynchronously, so just use the standard primitives. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 4 +- drivers/md/bcache/btree.c | 150 ++++++++++++++++++--------------------------- drivers/md/bcache/btree.h | 10 ++- drivers/md/bcache/super.c | 10 +-- 4 files changed, 75 insertions(+), 99 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 10ce0c825fce..c1c44191afb1 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -735,8 +735,8 @@ struct cache_set { * basically a lock for this that we can wait on asynchronously. 
The * btree_root() macro releases the lock when it returns. */ - struct closure *try_harder; - struct closure_waitlist try_wait; + struct task_struct *try_harder; + wait_queue_head_t try_wait; uint64_t try_harder_start; /* diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 731cd8e3fe90..4d50f1e7006e 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -437,7 +437,7 @@ static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) set_btree_node_dirty(b); - if (op && op->journal) { + if (op->journal) { if (w->journal && journal_pin_cmp(b->c, w, op)) { atomic_dec_bug(w->journal); @@ -574,34 +574,35 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, return b; } -static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) +static int mca_reap(struct btree *b, unsigned min_order, bool flush) { + struct closure cl; + + closure_init_stack(&cl); lockdep_assert_held(&b->c->bucket_lock); if (!down_write_trylock(&b->lock)) return -ENOMEM; - if (b->page_order < min_order) { + BUG_ON(btree_node_dirty(b) && !b->sets[0].data); + + if (b->page_order < min_order || + (!flush && + (btree_node_dirty(b) || + atomic_read(&b->io.cl.remaining) != -1))) { rw_unlock(true, b); return -ENOMEM; } - BUG_ON(btree_node_dirty(b) && !b->sets[0].data); - - if (cl && btree_node_dirty(b)) - bch_btree_node_write(b, NULL); - - if (cl) - closure_wait_event_async(&b->io.wait, cl, - atomic_read(&b->io.cl.remaining) == -1); - - if (btree_node_dirty(b) || - !closure_is_unlocked(&b->io.cl) || - work_pending(&b->work.work)) { - rw_unlock(true, b); - return -EAGAIN; + if (btree_node_dirty(b)) { + bch_btree_node_write(b, &cl); + closure_sync(&cl); } + /* wait for any in flight btree write */ + closure_wait_event_sync(&b->io.wait, &cl, + atomic_read(&b->io.cl.remaining) == -1); + return 0; } @@ -641,7 +642,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, break; if (++i > 3 && - !mca_reap(b, NULL, 0)) { + !mca_reap(b, 0, false)) { mca_data_free(b); rw_unlock(true, b); freed++; @@ -660,7 +661,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, list_rotate_left(&c->btree_cache); if (!b->accessed && - !mca_reap(b, NULL, 0)) { + !mca_reap(b, 0, false)) { mca_bucket_free(b); mca_data_free(b); rw_unlock(true, b); @@ -783,52 +784,27 @@ out: return b; } -static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, - int level, struct closure *cl) +static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) { - int ret = -ENOMEM; - struct btree *i; + struct btree *b; trace_bcache_btree_cache_cannibalize(c); - if (!cl) - return ERR_PTR(-ENOMEM); - - /* - * Trying to free up some memory - i.e. reuse some btree nodes - may - * require initiating IO to flush the dirty part of the node. If we're - * running under generic_make_request(), that IO will never finish and - * we would deadlock. Returning -EAGAIN causes the cache lookup code to - * punt to workqueue and retry. 
- */ - if (current->bio_list) - return ERR_PTR(-EAGAIN); - - if (c->try_harder && c->try_harder != cl) { - closure_wait_event_async(&c->try_wait, cl, !c->try_harder); - return ERR_PTR(-EAGAIN); - } + if (!c->try_harder) { + c->try_harder = current; + c->try_harder_start = local_clock(); + } else if (c->try_harder != current) + return ERR_PTR(-ENOSPC); - c->try_harder = cl; - c->try_harder_start = local_clock(); -retry: - list_for_each_entry_reverse(i, &c->btree_cache, list) { - int r = mca_reap(i, cl, btree_order(k)); - if (!r) - return i; - if (r != -ENOMEM) - ret = r; - } + list_for_each_entry_reverse(b, &c->btree_cache, list) + if (!mca_reap(b, btree_order(k), false)) + return b; - if (ret == -EAGAIN && - closure_blocking(cl)) { - mutex_unlock(&c->bucket_lock); - closure_sync(cl); - mutex_lock(&c->bucket_lock); - goto retry; - } + list_for_each_entry_reverse(b, &c->btree_cache, list) + if (!mca_reap(b, btree_order(k), true)) + return b; - return ERR_PTR(ret); + return ERR_PTR(-ENOMEM); } /* @@ -839,18 +815,19 @@ retry: */ void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) { - if (c->try_harder == cl) { + if (c->try_harder == current) { bch_time_stats_update(&c->try_harder_time, c->try_harder_start); c->try_harder = NULL; - __closure_wake_up(&c->try_wait); + wake_up(&c->try_wait); } } -static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, - int level, struct closure *cl) +static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) { struct btree *b; + BUG_ON(current->bio_list); + lockdep_assert_held(&c->bucket_lock); if (mca_find(c, k)) @@ -860,14 +837,14 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, * the list. Check if there's any freed nodes there: */ list_for_each_entry(b, &c->btree_cache_freeable, list) - if (!mca_reap(b, NULL, btree_order(k))) + if (!mca_reap(b, btree_order(k), false)) goto out; /* We never free struct btree itself, just the memory that holds the on * disk node. Check the freed list before allocating a new one: */ list_for_each_entry(b, &c->btree_cache_freed, list) - if (!mca_reap(b, NULL, 0)) { + if (!mca_reap(b, 0, false)) { mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); if (!b->sets[0].data) goto err; @@ -901,7 +878,7 @@ err: if (b) rw_unlock(true, b); - b = mca_cannibalize(c, k, level, cl); + b = mca_cannibalize(c, k); if (!IS_ERR(b)) goto out; @@ -919,10 +896,9 @@ err: * level and op->lock. 
*/ struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, - int level, struct btree_op *op) + int level, bool write) { int i = 0; - bool write = level <= op->lock; struct btree *b; BUG_ON(level < 0); @@ -934,7 +910,7 @@ retry: return ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); - b = mca_alloc(c, k, level, &op->cl); + b = mca_alloc(c, k, level); mutex_unlock(&c->bucket_lock); if (!b) @@ -980,7 +956,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) struct btree *b; mutex_lock(&c->bucket_lock); - b = mca_alloc(c, k, level, NULL); + b = mca_alloc(c, k, level); mutex_unlock(&c->bucket_lock); if (!IS_ERR_OR_NULL(b)) { @@ -991,17 +967,12 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) /* Btree alloc */ -static void btree_node_free(struct btree *b, struct btree_op *op) +static void btree_node_free(struct btree *b) { unsigned i; trace_bcache_btree_node_free(b); - /* - * The BUG_ON() in btree_node_get() implies that we must have a write - * lock on parent to free or even invalidate a node - */ - BUG_ON(op->lock <= b->level); BUG_ON(b == b->c->root); if (btree_node_dirty(b)) @@ -1037,7 +1008,7 @@ retry: SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); - b = mca_alloc(c, &k.key, level, cl); + b = mca_alloc(c, &k.key, level); if (IS_ERR(b)) goto err_free; @@ -1173,8 +1144,7 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys, return stale; } -static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, - struct btree_op *op) +static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k) { /* * We block priorities from being written for the duration of garbage @@ -1191,7 +1161,7 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, memcpy(k->ptr, b->key.ptr, sizeof(uint64_t) * KEY_PTRS(&b->key)); - btree_node_free(n, op); + btree_node_free(n); up_write(&n->lock); } @@ -1211,8 +1181,8 @@ struct gc_merge_info { unsigned keys; }; -static void btree_gc_coalesce(struct btree *b, struct btree_op *op, - struct gc_stat *gc, struct gc_merge_info *r) +static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc, + struct gc_merge_info *r) { unsigned nodes = 0, keys = 0, blocks; int i; @@ -1228,7 +1198,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op, for (i = nodes - 1; i >= 0; --i) { if (r[i].b->written) - r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); + r[i].b = btree_gc_alloc(r[i].b, r[i].k); if (r[i].b->written) return; @@ -1292,7 +1262,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op, r[i - 1].keys = n2->keys; } - btree_node_free(r->b, op); + btree_node_free(r->b); up_write(&r->b->lock); trace_bcache_btree_gc_coalesce(nodes); @@ -1324,7 +1294,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, memset(r, 0, sizeof(r)); while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { - r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); + r->b = bch_btree_node_get(b->c, r->k, b->level - 1, true); if (IS_ERR(r->b)) { ret = PTR_ERR(r->b); @@ -1337,7 +1307,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, if (!b->written && (r->b->level || stale > 10 || b->c->gc_always_rewrite)) - r->b = btree_gc_alloc(r->b, r->k, op); + r->b = btree_gc_alloc(r->b, r->k); if (r->b->level) ret = btree_gc_recurse(r->b, op, writes, gc); @@ -1350,7 +1320,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, bkey_copy_key(&b->c->gc_done, r->k); if (!b->written) - btree_gc_coalesce(b, op, gc, r); 
+ btree_gc_coalesce(b, gc, r); if (r[GC_MERGE_NODES - 1].b) write(r[GC_MERGE_NODES - 1].b); @@ -1404,7 +1374,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, if (!IS_ERR_OR_NULL(n)) { closure_sync(&op->cl); bch_btree_set_root(b); - btree_node_free(n, op); + btree_node_free(n); rw_unlock(true, b); } @@ -2004,18 +1974,18 @@ static int btree_split(struct btree *b, struct btree_op *op, } rw_unlock(true, n1); - btree_node_free(b, op); + btree_node_free(b); bch_time_stats_update(&b->c->btree_split_time, start_time); return 0; err_free2: __bkey_put(n2->c, &n2->key); - btree_node_free(n2, op); + btree_node_free(n2); rw_unlock(true, n2); err_free1: __bkey_put(n1->c, &n1->key); - btree_node_free(n1, op); + btree_node_free(n1); rw_unlock(true, n1); err: if (n3 == ERR_PTR(-EAGAIN) || diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 17b7a4e39c7e..72794ab8e8e5 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -326,7 +326,7 @@ static inline void rw_unlock(bool w, struct btree *b) ({ \ int _r, l = (b)->level - 1; \ bool _w = l <= (op)->lock; \ - struct btree *_child = bch_btree_node_get((b)->c, key, l, op); \ + struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \ if (!IS_ERR(_child)) { \ _child->parent = (b); \ _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ @@ -356,6 +356,11 @@ static inline void rw_unlock(bool w, struct btree *b) } \ rw_unlock(_w, _b); \ bch_cannibalize_unlock(c, &(op)->cl); \ + if (_r == -ENOSPC) { \ + wait_event((c)->try_wait, \ + !(c)->try_harder); \ + _r = -EINTR; \ + } \ } while (_r == -EINTR); \ \ _r; \ @@ -375,8 +380,7 @@ void bch_btree_node_write(struct btree *, struct closure *); void bch_cannibalize_unlock(struct cache_set *, struct closure *); void bch_btree_set_root(struct btree *); struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); -struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, - int, struct btree_op *); +struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); int bch_btree_insert_check_key(struct btree *, struct btree_op *, struct bkey *); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index d3169c0652f8..9a164cd4058c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1436,12 +1436,14 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->sort_crit_factor = int_sqrt(c->btree_pages); - mutex_init(&c->bucket_lock); - mutex_init(&c->sort_lock); - spin_lock_init(&c->sort_time_lock); closure_init_unlocked(&c->sb_write); + mutex_init(&c->bucket_lock); + init_waitqueue_head(&c->try_wait); closure_init_unlocked(&c->uuid_write); + spin_lock_init(&c->sort_time_lock); + mutex_init(&c->sort_lock); spin_lock_init(&c->btree_read_time_lock); + bch_moving_init_cache_set(c); INIT_LIST_HEAD(&c->list); @@ -1529,7 +1531,7 @@ static void run_cache_set(struct cache_set *c) goto err; err = "error reading btree root"; - c->root = bch_btree_node_get(c, k, j->btree_level, &op); + c->root = bch_btree_node_get(c, k, j->btree_level, true); if (IS_ERR_OR_NULL(c->root)) goto err; -- cgit v1.2.3 From 35fcd848d72683141052aa9880542461577f2dbe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:29:09 -0700 Subject: bcache: Convert bucket_wait to wait_queue_head_t At one point we did do fancy asynchronous waiting stuff with bucket_wait, but that's all gone (and bucket_wait is used a lot less than it used to be). So use the standard primitives. 
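For readers new to the idiom: the conversion below is the stock wait-queue pattern, and the ordering is what makes it safe. The waiter queues itself and sets its task state *before* dropping the lock, so a wake_up() issued in the window between the unlock and schedule() cannot be lost. A minimal sketch of the waiter side, assuming a hypothetical try_pop_free_bucket() helper in place of the open-coded fifo checks in bch_bucket_alloc() below:

    /* Sketch only; the real logic is in bch_bucket_alloc() below. */
    static long alloc_bucket_blocking(struct cache *ca)
    {
            DEFINE_WAIT(w);
            long r;

            /* Caller holds ca->set->bucket_lock. */
            while (!try_pop_free_bucket(ca, &r)) {  /* hypothetical helper */
                    /*
                     * Queue ourselves on the waitlist and set our task
                     * state before releasing the lock; any wake_up()
                     * issued after the unlock then sees us queued, and
                     * schedule() returns promptly instead of sleeping
                     * forever.
                     */
                    prepare_to_wait(&ca->set->bucket_wait, &w,
                                    TASK_UNINTERRUPTIBLE);
                    mutex_unlock(&ca->set->bucket_lock);
                    schedule();
                    mutex_lock(&ca->set->bucket_lock);
            }

            finish_wait(&ca->set->bucket_wait, &w);
            return r;
    }

The waker side is then just the wake_up(&ca->set->bucket_wait) that the allocator thread issues after pushing a bucket onto the free fifo.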
Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 82 ++++++++++++++++++++++++++------------------- drivers/md/bcache/bcache.h | 8 ++--- drivers/md/bcache/btree.c | 25 ++++++-------- drivers/md/bcache/btree.h | 6 ++-- drivers/md/bcache/request.c | 9 ++--- drivers/md/bcache/super.c | 7 ++-- 6 files changed, 70 insertions(+), 67 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index e033b0203b68..1b64e662e81b 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -339,7 +339,7 @@ static int bch_allocator_thread(void *arg) allocator_wait(ca, !fifo_full(&ca->free)); fifo_push(&ca->free, bucket); - closure_wake_up(&ca->set->bucket_wait); + wake_up(&ca->set->bucket_wait); } /* @@ -365,16 +365,41 @@ static int bch_allocator_thread(void *arg) } } -long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl) +long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait) { - long r = -1; -again: + DEFINE_WAIT(w); + struct bucket *b; + long r; + + /* fastpath */ + if (fifo_used(&ca->free) > ca->watermark[watermark]) { + fifo_pop(&ca->free, r); + goto out; + } + + if (!wait) + return -1; + + while (1) { + if (fifo_used(&ca->free) > ca->watermark[watermark]) { + fifo_pop(&ca->free, r); + break; + } + + prepare_to_wait(&ca->set->bucket_wait, &w, + TASK_UNINTERRUPTIBLE); + + mutex_unlock(&ca->set->bucket_lock); + schedule(); + mutex_lock(&ca->set->bucket_lock); + } + + finish_wait(&ca->set->bucket_wait, &w); +out: wake_up_process(ca->alloc_thread); - if (fifo_used(&ca->free) > ca->watermark[watermark] && - fifo_pop(&ca->free, r)) { - struct bucket *b = ca->buckets + r; #ifdef CONFIG_BCACHE_EDEBUG + { size_t iter; long i; @@ -387,36 +412,23 @@ again: BUG_ON(i == r); fifo_for_each(i, &ca->unused, iter) BUG_ON(i == r); -#endif - BUG_ON(atomic_read(&b->pin) != 1); - - SET_GC_SECTORS_USED(b, ca->sb.bucket_size); - - if (watermark <= WATERMARK_METADATA) { - SET_GC_MARK(b, GC_MARK_METADATA); - b->prio = BTREE_PRIO; - } else { - SET_GC_MARK(b, GC_MARK_RECLAIMABLE); - b->prio = INITIAL_PRIO; - } - - return r; } +#endif + b = ca->buckets + r; - trace_bcache_alloc_fail(ca); + BUG_ON(atomic_read(&b->pin) != 1); - if (cl) { - closure_wait(&ca->set->bucket_wait, cl); + SET_GC_SECTORS_USED(b, ca->sb.bucket_size); - if (closure_blocking(cl)) { - mutex_unlock(&ca->set->bucket_lock); - closure_sync(cl); - mutex_lock(&ca->set->bucket_lock); - goto again; - } + if (watermark <= WATERMARK_METADATA) { + SET_GC_MARK(b, GC_MARK_METADATA); + b->prio = BTREE_PRIO; + } else { + SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + b->prio = INITIAL_PRIO; } - return -1; + return r; } void bch_bucket_free(struct cache_set *c, struct bkey *k) @@ -433,7 +445,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k) } int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, - struct bkey *k, int n, struct closure *cl) + struct bkey *k, int n, bool wait) { int i; @@ -446,7 +458,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, for (i = 0; i < n; i++) { struct cache *ca = c->cache_by_alloc[i]; - long b = bch_bucket_alloc(ca, watermark, cl); + long b = bch_bucket_alloc(ca, watermark, wait); if (b == -1) goto err; @@ -466,11 +478,11 @@ err: } int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, - struct bkey *k, int n, struct closure *cl) + struct bkey *k, int n, bool wait) { int ret; mutex_lock(&c->bucket_lock); - ret = __bch_bucket_alloc_set(c, watermark, k, n, cl); + ret = 
__bch_bucket_alloc_set(c, watermark, k, n, wait); mutex_unlock(&c->bucket_lock); return ret; } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index c1c44191afb1..d3520748bc27 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -750,7 +750,7 @@ struct cache_set { * written. */ atomic_t prio_blocked; - struct closure_waitlist bucket_wait; + wait_queue_head_t bucket_wait; /* * For any bio we don't skip we subtract the number of sectors from @@ -1162,13 +1162,13 @@ uint8_t bch_inc_gen(struct cache *, struct bucket *); void bch_rescale_priorities(struct cache_set *, int); bool bch_bucket_add_unused(struct cache *, struct bucket *); -long bch_bucket_alloc(struct cache *, unsigned, struct closure *); +long bch_bucket_alloc(struct cache *, unsigned, bool); void bch_bucket_free(struct cache_set *, struct bkey *); int __bch_bucket_alloc_set(struct cache_set *, unsigned, - struct bkey *, int, struct closure *); + struct bkey *, int, bool); int bch_bucket_alloc_set(struct cache_set *, unsigned, - struct bkey *, int, struct closure *); + struct bkey *, int, bool); __printf(2, 3) bool bch_cache_set_error(struct cache_set *, const char *, ...); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 4d50f1e7006e..935d90df397b 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -813,7 +813,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) * cannibalize_bucket() will take. This means every time we unlock the root of * the btree, we need to release this lock if we have it held. */ -void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) +void bch_cannibalize_unlock(struct cache_set *c) { if (c->try_harder == current) { bch_time_stats_update(&c->try_harder_time, c->try_harder_start); @@ -995,15 +995,14 @@ static void btree_node_free(struct btree *b) mutex_unlock(&b->c->bucket_lock); } -struct btree *bch_btree_node_alloc(struct cache_set *c, int level, - struct closure *cl) +struct btree *bch_btree_node_alloc(struct cache_set *c, int level) { BKEY_PADDED(key) k; struct btree *b = ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); retry: - if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) + if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) goto err; SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); @@ -1036,10 +1035,9 @@ err: return b; } -static struct btree *btree_node_alloc_replacement(struct btree *b, - struct closure *cl) +static struct btree *btree_node_alloc_replacement(struct btree *b) { - struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); + struct btree *n = bch_btree_node_alloc(b->c, b->level); if (!IS_ERR_OR_NULL(n)) bch_btree_sort_into(b, n); @@ -1152,7 +1150,7 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k) * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it * our closure. 
*/ - struct btree *n = btree_node_alloc_replacement(b, NULL); + struct btree *n = btree_node_alloc_replacement(b); if (!IS_ERR_OR_NULL(n)) { swap(b, n); @@ -1359,7 +1357,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); if (b->level || stale > 10) - n = btree_node_alloc_replacement(b, NULL); + n = btree_node_alloc_replacement(b); if (!IS_ERR_OR_NULL(n)) swap(b, n); @@ -1882,10 +1880,7 @@ static int btree_split(struct btree *b, struct btree_op *op, struct btree *n1, *n2 = NULL, *n3 = NULL; uint64_t start_time = local_clock(); - if (b->level) - set_closure_blocking(&op->cl); - - n1 = btree_node_alloc_replacement(b, &op->cl); + n1 = btree_node_alloc_replacement(b); if (IS_ERR(n1)) goto err; @@ -1896,12 +1891,12 @@ static int btree_split(struct btree *b, struct btree_op *op, trace_bcache_btree_node_split(b, n1->sets[0].data->keys); - n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); + n2 = bch_btree_node_alloc(b->c, b->level); if (IS_ERR(n2)) goto err_free1; if (!b->parent) { - n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); + n3 = bch_btree_node_alloc(b->c, b->level + 1); if (IS_ERR(n3)) goto err_free2; } diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 72794ab8e8e5..d691d954730e 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -355,7 +355,7 @@ static inline void rw_unlock(bool w, struct btree *b) _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ } \ rw_unlock(_w, _b); \ - bch_cannibalize_unlock(c, &(op)->cl); \ + bch_cannibalize_unlock(c); \ if (_r == -ENOSPC) { \ wait_event((c)->try_wait, \ !(c)->try_harder); \ @@ -377,9 +377,9 @@ static inline bool should_split(struct btree *b) void bch_btree_node_read(struct btree *); void bch_btree_node_write(struct btree *, struct closure *); -void bch_cannibalize_unlock(struct cache_set *, struct closure *); +void bch_cannibalize_unlock(struct cache_set *); void bch_btree_set_root(struct btree *); -struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); +struct btree *bch_btree_node_alloc(struct cache_set *, int); struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); int bch_btree_insert_check_key(struct btree *, struct btree_op *, diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index d85c7001df61..26d18f4bf4a0 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -350,14 +350,8 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, struct cache_set *c = s->op.c; struct open_bucket *b; BKEY_PADDED(key) alloc; - struct closure cl, *w = NULL; unsigned i; - if (s->writeback) { - closure_init_stack(&cl); - w = &cl; - } - /* * We might have to allocate a new bucket, which we can't do with a * spinlock held. 
So if we have to allocate, we drop the lock, allocate @@ -375,7 +369,8 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, spin_unlock(&c->data_bucket_lock); - if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w)) + if (bch_bucket_alloc_set(c, watermark, &alloc.key, + 1, s->writeback)) return false; spin_lock(&c->data_bucket_lock); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 9a164cd4058c..84398a82fbe3 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -427,7 +427,7 @@ static int __uuid_write(struct cache_set *c) lockdep_assert_held(&bch_register_lock); - if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, &cl)) + if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) return 1; SET_KEY_SIZE(&k.key, c->sb.bucket_size); @@ -565,7 +565,7 @@ void bch_prio_write(struct cache *ca) p->magic = pset_magic(ca); p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); - bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, &cl); + bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true); BUG_ON(bucket == -1); mutex_unlock(&ca->set->bucket_lock); @@ -1439,6 +1439,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) closure_init_unlocked(&c->sb_write); mutex_init(&c->bucket_lock); init_waitqueue_head(&c->try_wait); + init_waitqueue_head(&c->bucket_wait); closure_init_unlocked(&c->uuid_write); spin_lock_init(&c->sort_time_lock); mutex_init(&c->sort_lock); @@ -1608,7 +1609,7 @@ static void run_cache_set(struct cache_set *c) goto err_unlock_gc; err = "cannot allocate new btree root"; - c->root = bch_btree_node_alloc(c, 0, &op.cl); + c->root = bch_btree_node_alloc(c, 0); if (IS_ERR_OR_NULL(c->root)) goto err_unlock_gc; -- cgit v1.2.3 From 72a44517f3ca3725dc86081d105457df46448679 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Oct 2013 17:19:26 -0700 Subject: bcache: Convert gc to a kthread We needed a dedicated rescuer workqueue for gc anyways... and gc was conceptually a dedicated thread, just one that wasn't running all the time. Switch it to a dedicated thread to make the code a bit more straightforward. 
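The kthread shape this patch (and the writeback conversion after it) adopts is worth spelling out, because the ordering is subtle: the thread must mark itself TASK_INTERRUPTIBLE *before* its final check for a stop request, so that a racing wake_up_process() leaves it runnable rather than asleep with work pending. A minimal sketch, with do_gc_pass() as a hypothetical stand-in for bch_btree_gc() (bch_gc_thread() in the diff below has exactly this shape):

    static int gc_thread_fn(void *arg)
    {
            struct cache_set *c = arg;

            while (1) {
                    do_gc_pass(c);  /* hypothetical stand-in for bch_btree_gc() */

                    /*
                     * Set the task state before checking for stop: a
                     * concurrent wake_up_process() then either sees us
                     * still running or puts us straight back on the
                     * runqueue, so the wakeup is never lost.
                     */
                    set_current_state(TASK_INTERRUPTIBLE);
                    if (kthread_should_stop())
                            break;

                    try_to_freeze();
                    schedule();
            }

            return 0;
    }

Kicking the thread is then a bare wake_up_process(), which is all the new wake_up_gc() helper does.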
Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 6 +++--- drivers/md/bcache/bcache.h | 9 ++++---- drivers/md/bcache/btree.c | 50 ++++++++++++++++++++++++++++++-------------- drivers/md/bcache/btree.h | 10 +++++++-- drivers/md/bcache/movinggc.c | 35 +++++++++++++------------------ drivers/md/bcache/request.c | 2 +- drivers/md/bcache/super.c | 20 ++++++++---------- drivers/md/bcache/sysfs.c | 2 +- 8 files changed, 74 insertions(+), 60 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 1b64e662e81b..b9bd5866055d 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -210,7 +210,7 @@ static void invalidate_buckets_lru(struct cache *ca) * multiple times when it can't do anything */ ca->invalidate_needs_gc = 1; - bch_queue_gc(ca->set); + wake_up_gc(ca->set); return; } @@ -235,7 +235,7 @@ static void invalidate_buckets_fifo(struct cache *ca) if (++checked >= ca->sb.nbuckets) { ca->invalidate_needs_gc = 1; - bch_queue_gc(ca->set); + wake_up_gc(ca->set); return; } } @@ -260,7 +260,7 @@ static void invalidate_buckets_random(struct cache *ca) if (++checked >= ca->sb.nbuckets / 2) { ca->invalidate_needs_gc = 1; - bch_queue_gc(ca->set); + wake_up_gc(ca->set); return; } } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d3520748bc27..09410eb07d82 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -773,7 +773,7 @@ struct cache_set { struct gc_stat gc_stats; size_t nbuckets; - struct closure_with_waitlist gc; + struct task_struct *gc_thread; /* Where in the btree gc currently is */ struct bkey gc_done; @@ -786,11 +786,10 @@ struct cache_set { /* Counts how many sectors bio_insert has added to the cache */ atomic_t sectors_to_gc; - struct closure moving_gc; - struct closure_waitlist moving_gc_wait; + wait_queue_head_t moving_gc_wait; struct keybuf moving_gc_keys; /* Number of moving GC bios in flight */ - atomic_t in_flight; + struct semaphore moving_in_flight; struct btree *root; @@ -1176,7 +1175,7 @@ bool bch_cache_set_error(struct cache_set *, const char *, ...); void bch_prio_write(struct cache *); void bch_write_bdev_super(struct cached_dev *, struct closure *); -extern struct workqueue_struct *bcache_wq, *bch_gc_wq; +extern struct workqueue_struct *bcache_wq; extern const char * const bch_cache_modes[]; extern struct mutex bch_register_lock; extern struct list_head bch_cache_sets; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 935d90df397b..17bfd87fc8f4 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -28,7 +28,9 @@ #include #include +#include #include +#include #include #include #include @@ -105,7 +107,6 @@ static const char *op_type(struct btree_op *op) #define PTR_HASH(c, k) \ (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) -struct workqueue_struct *bch_gc_wq; static struct workqueue_struct *btree_io_wq; void bch_btree_op_init_stack(struct btree_op *op) @@ -732,12 +733,9 @@ int bch_btree_cache_alloc(struct cache_set *c) { unsigned i; - /* XXX: doesn't check for errors */ - - closure_init_unlocked(&c->gc); - for (i = 0; i < mca_reserve(c); i++) - mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); + if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL)) + return -ENOMEM; list_splice_init(&c->btree_cache, &c->btree_cache_freeable); @@ -1456,9 +1454,8 @@ size_t bch_btree_gc_finish(struct cache_set *c) return available; } -static void bch_btree_gc(struct closure *cl) +static void bch_btree_gc(struct cache_set *c) { - struct 
cache_set *c = container_of(cl, struct cache_set, gc.cl); int ret; unsigned long available; struct gc_stat stats; @@ -1483,7 +1480,7 @@ static void bch_btree_gc(struct closure *cl) if (ret) { pr_warn("gc failed!"); - continue_at(cl, bch_btree_gc, bch_gc_wq); + return; } /* Possibly wait for new UUIDs or whatever to hit disk */ @@ -1505,12 +1502,35 @@ static void bch_btree_gc(struct closure *cl) trace_bcache_gc_end(c); - continue_at(cl, bch_moving_gc, bch_gc_wq); + bch_moving_gc(c); +} + +static int bch_gc_thread(void *arg) +{ + struct cache_set *c = arg; + + while (1) { + bch_btree_gc(c); + + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) + break; + + try_to_freeze(); + schedule(); + } + + return 0; } -void bch_queue_gc(struct cache_set *c) +int bch_gc_thread_start(struct cache_set *c) { - closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); + c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc"); + if (IS_ERR(c->gc_thread)) + return PTR_ERR(c->gc_thread); + + set_task_state(c->gc_thread, TASK_INTERRUPTIBLE); + return 0; } /* Initial partial gc */ @@ -2480,14 +2500,12 @@ void bch_btree_exit(void) { if (btree_io_wq) destroy_workqueue(btree_io_wq); - if (bch_gc_wq) - destroy_workqueue(bch_gc_wq); } int __init bch_btree_init(void) { - if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || - !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) + btree_io_wq = create_singlethread_workqueue("bch_btree_io"); + if (!btree_io_wq) return -ENOMEM; return 0; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index d691d954730e..fa9641aaed39 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -388,12 +388,18 @@ int bch_btree_insert(struct btree_op *, struct cache_set *, struct keylist *); int bch_btree_search_recurse(struct btree *, struct btree_op *); -void bch_queue_gc(struct cache_set *); +int bch_gc_thread_start(struct cache_set *); size_t bch_btree_gc_finish(struct cache_set *); -void bch_moving_gc(struct closure *); +void bch_moving_gc(struct cache_set *); int bch_btree_check(struct cache_set *, struct btree_op *); uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); +static inline void wake_up_gc(struct cache_set *c) +{ + if (c->gc_thread) + wake_up_process(c->gc_thread); +} + void bch_keybuf_init(struct keybuf *); void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *, keybuf_pred_fn *); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 2c42377a65aa..6ba050456ec8 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -57,8 +57,7 @@ static void write_moving_finish(struct closure *cl) bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); - atomic_dec_bug(&io->s.op.c->in_flight); - closure_wake_up(&io->s.op.c->moving_gc_wait); + up(&io->s.op.c->moving_in_flight); closure_return_with_destructor(cl, moving_io_destructor); } @@ -113,7 +112,7 @@ static void write_moving(struct closure *cl) bch_data_insert(&s->op.cl); } - continue_at(cl, write_moving_finish, bch_gc_wq); + continue_at(cl, write_moving_finish, system_wq); } static void read_moving_submit(struct closure *cl) @@ -124,15 +123,17 @@ static void read_moving_submit(struct closure *cl) bch_submit_bbio(bio, s->op.c, &io->w->key, 0); - continue_at(cl, write_moving, bch_gc_wq); + continue_at(cl, write_moving, system_wq); } -static void read_moving(struct closure *cl) +static void read_moving(struct cache_set *c) { - struct cache_set *c = container_of(cl, 
struct cache_set, moving_gc); struct keybuf_key *w; struct moving_io *io; struct bio *bio; + struct closure cl; + + closure_init_stack(&cl); /* XXX: if we error, background writeback could stall indefinitely */ @@ -164,13 +165,8 @@ static void read_moving(struct closure *cl) trace_bcache_gc_copy(&w->key); - closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); - - if (atomic_inc_return(&c->in_flight) >= 64) { - closure_wait_event(&c->moving_gc_wait, cl, - atomic_read(&c->in_flight) < 64); - continue_at(cl, read_moving, bch_gc_wq); - } + down(&c->moving_in_flight); + closure_call(&io->s.cl, read_moving_submit, NULL, &cl); } if (0) { @@ -180,7 +176,7 @@ err: if (!IS_ERR_OR_NULL(w->private)) bch_keybuf_del(&c->moving_gc_keys, w); } - closure_return(cl); + closure_sync(&cl); } static bool bucket_cmp(struct bucket *l, struct bucket *r) @@ -193,15 +189,14 @@ static unsigned bucket_heap_top(struct cache *ca) return GC_SECTORS_USED(heap_peek(&ca->heap)); } -void bch_moving_gc(struct closure *cl) +void bch_moving_gc(struct cache_set *c) { - struct cache_set *c = container_of(cl, struct cache_set, gc.cl); struct cache *ca; struct bucket *b; unsigned i; if (!c->copy_gc_enabled) - closure_return(cl); + return; mutex_lock(&c->bucket_lock); @@ -242,13 +237,11 @@ void bch_moving_gc(struct closure *cl) c->moving_gc_keys.last_scanned = ZERO_KEY; - closure_init(&c->moving_gc, cl); - read_moving(&c->moving_gc); - - closure_return(cl); + read_moving(c); } void bch_moving_init_cache_set(struct cache_set *c) { bch_keybuf_init(&c->moving_gc_keys); + sema_init(&c->moving_in_flight, 64); } diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 26d18f4bf4a0..f779eb420d69 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -520,7 +520,7 @@ static void bch_data_insert_start(struct closure *cl) if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { set_gc_sectors(op->c); - bch_queue_gc(op->c); + wake_up_gc(op->c); } /* diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 84398a82fbe3..f89e2296bde1 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1342,6 +1342,9 @@ static void cache_set_flush(struct closure *cl) kobject_put(&c->internal); kobject_del(&c->kobj); + if (c->gc_thread) + kthread_stop(c->gc_thread); + if (!IS_ERR_OR_NULL(c->root)) list_add(&c->root->list, &c->btree_cache); @@ -1579,8 +1582,6 @@ static void run_cache_set(struct cache_set *c) bch_journal_replay(c, &journal, &op); } else { pr_notice("invalidating existing data"); - /* Don't want invalidate_buckets() to queue a gc yet */ - closure_lock(&c->gc, NULL); for_each_cache(ca, c, i) { unsigned j; @@ -1606,12 +1607,12 @@ static void run_cache_set(struct cache_set *c) err = "cannot allocate new UUID bucket"; if (__uuid_write(c)) - goto err_unlock_gc; + goto err; err = "cannot allocate new btree root"; c->root = bch_btree_node_alloc(c, 0); if (IS_ERR_OR_NULL(c->root)) - goto err_unlock_gc; + goto err; bkey_copy_key(&c->root->key, &MAX_KEY); bch_btree_node_write(c->root, &op.cl); @@ -1628,12 +1629,12 @@ static void run_cache_set(struct cache_set *c) bch_journal_next(&c->journal); bch_journal_meta(c, &op.cl); - - /* Unlock */ - closure_set_stopped(&c->gc.cl); - closure_put(&c->gc.cl); } + err = "error starting gc thread"; + if (bch_gc_thread_start(c)) + goto err; + closure_sync(&op.cl); c->sb.last_mount = get_seconds(); bcache_write_super(c); @@ -1644,9 +1645,6 @@ static void run_cache_set(struct cache_set *c) flash_devs_run(c); return; 
-err_unlock_gc: - closure_set_stopped(&c->gc.cl); - closure_put(&c->gc.cl); err: closure_sync(&op.cl); /* XXX: test this, it's broken */ diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index b3a66f17231d..ab286b9b5e40 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -566,7 +566,7 @@ STORE(__bch_cache_set) } if (attr == &sysfs_trigger_gc) - bch_queue_gc(c); + wake_up_gc(c); if (attr == &sysfs_prune_cache) { struct shrink_control sc; -- cgit v1.2.3 From 5e6926daac267dd99552ae613f041a9e88bbf258 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:50:06 -0700 Subject: bcache: Convert writeback to a kthread This simplifies the writeback flow control quite a bit - previously, it was conceptually two coroutines, refill_dirty() and read_dirty(). This makes the code quite a bit more straightforward. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 10 +- drivers/md/bcache/super.c | 3 +- drivers/md/bcache/writeback.c | 371 ++++++++++++++++++++---------------------- drivers/md/bcache/writeback.h | 25 ++- 4 files changed, 203 insertions(+), 206 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 09410eb07d82..674e2f42e778 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -509,7 +509,7 @@ struct cached_dev { /* Limit number of writeback bios in flight */ struct semaphore in_flight; - struct closure_with_timer writeback; + struct task_struct *writeback_thread; struct keybuf writeback_keys; @@ -1038,7 +1038,11 @@ static inline void bkey_init(struct bkey *k) #define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) #define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) -#define MAX_KEY KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0) + +#define MAX_KEY_INODE (~(~0 << 20)) +#define MAX_KEY_OFFSET (((uint64_t) ~0) >> 1) +#define MAX_KEY KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0) + #define ZERO_KEY KEY(0, 0, 0) /* @@ -1214,8 +1218,6 @@ int bch_cache_allocator_init(struct cache *ca); void bch_debug_exit(void); int bch_debug_init(struct kobject *); -void bch_writeback_exit(void); -int bch_writeback_init(void); void bch_request_exit(void); int bch_request_init(void); void bch_btree_exit(void); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f89e2296bde1..b79dd5a6679e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1029,6 +1029,7 @@ static void cached_dev_free(struct closure *cl) struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); cancel_delayed_work_sync(&dc->writeback_rate_update); + kthread_stop(dc->writeback_thread); mutex_lock(&bch_register_lock); @@ -2006,7 +2007,6 @@ static struct notifier_block reboot = { static void bcache_exit(void) { bch_debug_exit(); - bch_writeback_exit(); bch_request_exit(); bch_btree_exit(); if (bcache_kobj) @@ -2039,7 +2039,6 @@ static int __init bcache_init(void) sysfs_create_files(bcache_kobj, files) || bch_btree_init() || bch_request_init() || - bch_writeback_init() || bch_debug_init(bcache_kobj)) goto err; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 51dc709c9bf7..4392f3f38d62 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -11,18 +11,11 @@ #include "debug.h" #include "writeback.h" +#include +#include +#include #include -static struct workqueue_struct *dirty_wq; - -static void read_dirty(struct closure *); - -struct dirty_io { - struct closure cl; - struct cached_dev *dc; - struct bio bio; -}; - 
/* Rate limiting */ static void __update_writeback_rate(struct cached_dev *dc) @@ -72,9 +65,6 @@ out: dc->writeback_rate_derivative = derivative; dc->writeback_rate_change = change; dc->writeback_rate_target = target; - - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); } static void update_writeback_rate(struct work_struct *work) @@ -90,6 +80,9 @@ static void update_writeback_rate(struct work_struct *work) __update_writeback_rate(dc); up_read(&dc->writeback_lock); + + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); } static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) @@ -105,37 +98,11 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) return min_t(uint64_t, ret, HZ); } -/* Background writeback */ - -static bool dirty_pred(struct keybuf *buf, struct bkey *k) -{ - return KEY_DIRTY(k); -} - -static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) -{ - uint64_t stripe = KEY_START(k); - unsigned nr_sectors = KEY_SIZE(k); - struct cached_dev *dc = container_of(buf, struct cached_dev, - writeback_keys); - - if (!KEY_DIRTY(k)) - return false; - - do_div(stripe, dc->disk.stripe_size); - - while (1) { - if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) == - dc->disk.stripe_size) - return true; - - if (nr_sectors <= dc->disk.stripe_size) - return false; - - nr_sectors -= dc->disk.stripe_size; - stripe++; - } -} +struct dirty_io { + struct closure cl; + struct cached_dev *dc; + struct bio bio; +}; static void dirty_init(struct keybuf_key *w) { @@ -153,132 +120,6 @@ static void dirty_init(struct keybuf_key *w) bch_bio_map(bio, NULL); } -static void refill_dirty(struct closure *cl) -{ - struct cached_dev *dc = container_of(cl, struct cached_dev, - writeback.cl); - struct keybuf *buf = &dc->writeback_keys; - bool searched_from_start = false; - struct bkey end = MAX_KEY; - SET_KEY_INODE(&end, dc->disk.id); - - if (!atomic_read(&dc->disk.detaching) && - !dc->writeback_running) - closure_return(cl); - - down_write(&dc->writeback_lock); - - if (!atomic_read(&dc->has_dirty)) { - SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); - bch_write_bdev_super(dc, NULL); - - up_write(&dc->writeback_lock); - closure_return(cl); - } - - if (bkey_cmp(&buf->last_scanned, &end) >= 0) { - buf->last_scanned = KEY(dc->disk.id, 0, 0); - searched_from_start = true; - } - - if (dc->partial_stripes_expensive) { - uint64_t i; - - for (i = 0; i < dc->disk.nr_stripes; i++) - if (atomic_read(dc->disk.stripe_sectors_dirty + i) == - dc->disk.stripe_size) - goto full_stripes; - - goto normal_refill; -full_stripes: - searched_from_start = false; /* not searching entire btree */ - bch_refill_keybuf(dc->disk.c, buf, &end, - dirty_full_stripe_pred); - } else { -normal_refill: - bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); - } - - if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { - /* Searched the entire btree - delay awhile */ - - if (RB_EMPTY_ROOT(&buf->keys)) { - atomic_set(&dc->has_dirty, 0); - cached_dev_put(dc); - } - - if (!atomic_read(&dc->disk.detaching)) - closure_delay(&dc->writeback, dc->writeback_delay * HZ); - } - - up_write(&dc->writeback_lock); - - bch_ratelimit_reset(&dc->writeback_rate); - - /* Punt to workqueue only so we don't recurse and blow the stack */ - continue_at(cl, read_dirty, dirty_wq); -} - -void bch_writeback_queue(struct cached_dev *dc) -{ - if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) { - if 
(!atomic_read(&dc->disk.detaching)) - closure_delay(&dc->writeback, dc->writeback_delay * HZ); - - continue_at(&dc->writeback.cl, refill_dirty, dirty_wq); - } -} - -void bch_writeback_add(struct cached_dev *dc) -{ - if (!atomic_read(&dc->has_dirty) && - !atomic_xchg(&dc->has_dirty, 1)) { - atomic_inc(&dc->count); - - if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { - SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); - /* XXX: should do this synchronously */ - bch_write_bdev_super(dc, NULL); - } - - bch_writeback_queue(dc); - - if (dc->writeback_percent) - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - } -} - -void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, - uint64_t offset, int nr_sectors) -{ - struct bcache_device *d = c->devices[inode]; - unsigned stripe_offset; - uint64_t stripe = offset; - - if (!d) - return; - - do_div(stripe, d->stripe_size); - - stripe_offset = offset & (d->stripe_size - 1); - - while (nr_sectors) { - int s = min_t(unsigned, abs(nr_sectors), - d->stripe_size - stripe_offset); - - if (nr_sectors < 0) - s = -s; - - atomic_add(s, d->stripe_sectors_dirty + stripe); - nr_sectors -= s; - stripe_offset = 0; - stripe++; - } -} - -/* Background writeback - IO loop */ - static void dirty_io_destructor(struct closure *cl) { struct dirty_io *io = container_of(cl, struct dirty_io, cl); @@ -378,30 +219,33 @@ static void read_dirty_submit(struct closure *cl) continue_at(cl, write_dirty, system_wq); } -static void read_dirty(struct closure *cl) +static void read_dirty(struct cached_dev *dc) { - struct cached_dev *dc = container_of(cl, struct cached_dev, - writeback.cl); - unsigned delay = writeback_delay(dc, 0); + unsigned delay = 0; struct keybuf_key *w; struct dirty_io *io; + struct closure cl; + + closure_init_stack(&cl); /* * XXX: if we error, background writeback just spins. Should use some * mempools. 
*/ - while (1) { + while (!kthread_should_stop()) { + try_to_freeze(); + w = bch_keybuf_next(&dc->writeback_keys); if (!w) break; BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); - if (delay > 0 && - (KEY_START(&w->key) != dc->last_read || - jiffies_to_msecs(delay) > 50)) - delay = schedule_timeout_uninterruptible(delay); + if (KEY_START(&w->key) != dc->last_read || + jiffies_to_msecs(delay) > 50) + while (!kthread_should_stop() && delay) + delay = schedule_timeout_interruptible(delay); dc->last_read = KEY_OFFSET(&w->key); @@ -427,7 +271,7 @@ static void read_dirty(struct closure *cl) trace_bcache_writeback(&w->key); down(&dc->in_flight); - closure_call(&io->cl, read_dirty_submit, NULL, cl); + closure_call(&io->cl, read_dirty_submit, NULL, &cl); delay = writeback_delay(dc, KEY_SIZE(&w->key)); } @@ -443,7 +287,148 @@ err: * Wait for outstanding writeback IOs to finish (and keybuf slots to be * freed) before refilling again */ - continue_at(cl, refill_dirty, dirty_wq); + closure_sync(&cl); +} + +/* Scan for dirty data */ + +void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, + uint64_t offset, int nr_sectors) +{ + struct bcache_device *d = c->devices[inode]; + unsigned stripe_offset; + uint64_t stripe = offset; + + if (!d) + return; + + do_div(stripe, d->stripe_size); + + stripe_offset = offset & (d->stripe_size - 1); + + while (nr_sectors) { + int s = min_t(unsigned, abs(nr_sectors), + d->stripe_size - stripe_offset); + + if (nr_sectors < 0) + s = -s; + + atomic_add(s, d->stripe_sectors_dirty + stripe); + nr_sectors -= s; + stripe_offset = 0; + stripe++; + } +} + +static bool dirty_pred(struct keybuf *buf, struct bkey *k) +{ + return KEY_DIRTY(k); +} + +static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) +{ + uint64_t stripe = KEY_START(k); + unsigned nr_sectors = KEY_SIZE(k); + struct cached_dev *dc = container_of(buf, struct cached_dev, + writeback_keys); + + if (!KEY_DIRTY(k)) + return false; + + do_div(stripe, dc->disk.stripe_size); + + while (1) { + if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) == + dc->disk.stripe_size) + return true; + + if (nr_sectors <= dc->disk.stripe_size) + return false; + + nr_sectors -= dc->disk.stripe_size; + stripe++; + } +} + +static bool refill_dirty(struct cached_dev *dc) +{ + struct keybuf *buf = &dc->writeback_keys; + bool searched_from_start = false; + struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); + + if (bkey_cmp(&buf->last_scanned, &end) >= 0) { + buf->last_scanned = KEY(dc->disk.id, 0, 0); + searched_from_start = true; + } + + if (dc->partial_stripes_expensive) { + uint64_t i; + + for (i = 0; i < dc->disk.nr_stripes; i++) + if (atomic_read(dc->disk.stripe_sectors_dirty + i) == + dc->disk.stripe_size) + goto full_stripes; + + goto normal_refill; +full_stripes: + searched_from_start = false; /* not searching entire btree */ + bch_refill_keybuf(dc->disk.c, buf, &end, + dirty_full_stripe_pred); + } else { +normal_refill: + bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); + } + + return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start; +} + +static int bch_writeback_thread(void *arg) +{ + struct cached_dev *dc = arg; + bool searched_full_index; + + while (!kthread_should_stop()) { + down_write(&dc->writeback_lock); + if (!atomic_read(&dc->has_dirty) || + (!atomic_read(&dc->disk.detaching) && + !dc->writeback_running)) { + up_write(&dc->writeback_lock); + set_current_state(TASK_INTERRUPTIBLE); + + if (kthread_should_stop()) + return 0; + + try_to_freeze(); + schedule(); + continue; + } 
+
+ searched_full_index = refill_dirty(dc);
+
+ if (searched_full_index &&
+ RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
+ atomic_set(&dc->has_dirty, 0);
+ cached_dev_put(dc);
+ SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
+ bch_write_bdev_super(dc, NULL);
+ }
+
+ up_write(&dc->writeback_lock);
+
+ bch_ratelimit_reset(&dc->writeback_rate);
+ read_dirty(dc);
+
+ if (searched_full_index) {
+ unsigned delay = dc->writeback_delay * HZ;
+
+ while (delay &&
+ !kthread_should_stop() &&
+ !atomic_read(&dc->disk.detaching))
+ delay = schedule_timeout_interruptible(delay);
+ }
+ }
+
+ return 0;
}

/* Init */

@@ -483,12 +468,10 @@ void bch_sectors_dirty_init(struct cached_dev *dc)
 btree_root(sectors_dirty_init, dc->disk.c, &op, dc);
}

-void bch_cached_dev_writeback_init(struct cached_dev *dc)
+int bch_cached_dev_writeback_init(struct cached_dev *dc)
{
 sema_init(&dc->in_flight, 64);
- closure_init_unlocked(&dc->writeback);
 init_rwsem(&dc->writeback_lock);
-
 bch_keybuf_init(&dc->writeback_keys);

 dc->writeback_metadata = true;

@@ -502,22 +485,16 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 dc->writeback_rate_p_term_inverse = 64;
 dc->writeback_rate_d_smooth = 8;

+ dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
+ "bcache_writeback");
+ if (IS_ERR(dc->writeback_thread))
+ return PTR_ERR(dc->writeback_thread);
+
+ set_task_state(dc->writeback_thread, TASK_INTERRUPTIBLE);
+
 INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
 schedule_delayed_work(&dc->writeback_rate_update,
 dc->writeback_rate_update_seconds * HZ);
-}
-
-void bch_writeback_exit(void)
-{
- if (dirty_wq)
- destroy_workqueue(dirty_wq);
-}
-
-int __init bch_writeback_init(void)
-{
- dirty_wq = create_workqueue("bcache_writeback");
- if (!dirty_wq)
- return -ENOMEM;

 return 0;
}

diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 34961888b5a9..60516bfa6052 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -56,11 +56,30 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 in_use <= CUTOFF_WRITEBACK;
}

+static inline void bch_writeback_queue(struct cached_dev *dc)
+{
+ wake_up_process(dc->writeback_thread);
+}
+
+static inline void bch_writeback_add(struct cached_dev *dc)
+{
+ if (!atomic_read(&dc->has_dirty) &&
+ !atomic_xchg(&dc->has_dirty, 1)) {
+ atomic_inc(&dc->count);
+
+ if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
+ SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
+ /* XXX: should do this synchronously */
+ bch_write_bdev_super(dc, NULL);
+ }
+
+ bch_writeback_queue(dc);
+ }
+}
+
 void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);

-void bch_writeback_queue(struct cached_dev *);
-void bch_writeback_add(struct cached_dev *);

 void bch_sectors_dirty_init(struct cached_dev *dc);
-void bch_cached_dev_writeback_init(struct cached_dev *);
+int bch_cached_dev_writeback_init(struct cached_dev *);

 #endif
-- cgit v1.2.3


From 48dad8baf92fe8967d9e1358af1cfdda1d2d3298 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Tue, 10 Sep 2013 18:48:51 -0700
Subject: bcache: Add btree_map() functions

Lots of stuff has been open coding its own btree traversal - which is generally pretty simple code, but there are a few subtleties.

This adds two new functions, bch_btree_map_nodes() and bch_btree_map_keys(), which do the traversal for you.
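A sketch of the calling convention, modeled on the refill_keybuf_fn() and sectors_dirty_init_fn() conversions further down: the caller embeds a struct btree_op in a wrapper struct, and the callback recovers its private state with container_of(). The dirty-key counter itself is illustrative, not code from this patch:

    /* Hypothetical example: count the dirty keys belonging to one inode. */
    struct dirty_count {
            struct btree_op op;
            unsigned        inode;
            size_t          count;
    };

    static int count_dirty_fn(struct btree_op *op, struct btree *b,
                              struct bkey *k)
    {
            struct dirty_count *d = container_of(op, struct dirty_count, op);

            if (KEY_INODE(k) > d->inode)
                    return MAP_DONE;        /* walked past our inode; stop */

            if (KEY_DIRTY(k))
                    d->count++;

            return MAP_CONTINUE;            /* keep iterating leaf keys */
    }

    static size_t count_dirty(struct cache_set *c, unsigned inode)
    {
            struct dirty_count d;

            memset(&d, 0, sizeof(d));
            bch_btree_op_init_stack(&d.op);
            d.inode = inode;

            bch_btree_map_keys(&d.op, c, &KEY(inode, 0, 0),
                               count_dirty_fn, 0);
            return d.count;
    }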
Everything that's open coding btree traversal now (with the exception of garbage collection) is slowly going to be converted to these two functions; being able to write other code at a higher level of abstraction is a big improvement w.r.t. overall code quality. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 2 - drivers/md/bcache/bset.c | 33 ++++---- drivers/md/bcache/btree.c | 174 +++++++++++++++++++++++++++++------------- drivers/md/bcache/btree.h | 37 ++++++++- drivers/md/bcache/writeback.c | 37 ++++----- 5 files changed, 186 insertions(+), 97 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 674e2f42e778..20fe96c121d9 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -384,8 +384,6 @@ struct keybuf_key { void *private; }; -typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); - struct keybuf { struct bkey last_scanned; spinlock_t lock; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index d0512e451dda..14c2a23d3884 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -842,6 +842,13 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, /* Btree iterator */ +/* + * Returns true if l > r - unless l == r, in which case returns true if l is + * older than r. + * + * Necessary for btree_sort_fixup() - if there are multiple keys that compare + * equal in different sets, we have to process them newest to oldest. + */ static inline bool btree_iter_cmp(struct btree_iter_set l, struct btree_iter_set r) { @@ -1146,16 +1153,16 @@ out: /* Sysfs stuff */ struct bset_stats { + struct btree_op op; size_t nodes; size_t sets_written, sets_unwritten; size_t bytes_written, bytes_unwritten; size_t floats, failed; }; -static int bch_btree_bset_stats(struct btree *b, struct btree_op *op, - struct bset_stats *stats) +static int btree_bset_stats(struct btree_op *op, struct btree *b) { - struct bkey *k; + struct bset_stats *stats = container_of(op, struct bset_stats, op); unsigned i; stats->nodes++; @@ -1180,30 +1187,20 @@ static int bch_btree_bset_stats(struct btree *b, struct btree_op *op, } } - if (b->level) { - struct btree_iter iter; - - for_each_key_filter(b, k, &iter, bch_ptr_bad) { - int ret = btree(bset_stats, k, b, op, stats); - if (ret) - return ret; - } - } - - return 0; + return MAP_CONTINUE; } int bch_bset_print_stats(struct cache_set *c, char *buf) { - struct btree_op op; struct bset_stats t; int ret; - bch_btree_op_init_stack(&op); memset(&t, 0, sizeof(struct bset_stats)); + bch_btree_op_init_stack(&t.op); + t.op.c = c; - ret = btree_root(bset_stats, c, &op, &t); - if (ret) + ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats); + if (ret < 0) return ret; return snprintf(buf, PAGE_SIZE, diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 17bfd87fc8f4..cfbdcf349b7f 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2296,6 +2296,82 @@ int bch_btree_search_recurse(struct btree *b, struct btree_op *op) return ret; } +/* Map across nodes or keys */ + +static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, + btree_map_nodes_fn *fn, int flags) +{ + int ret = MAP_CONTINUE; + + if (b->level) { + struct bkey *k; + struct btree_iter iter; + + bch_btree_iter_init(b, &iter, from); + + while ((k = bch_btree_iter_next_filter(&iter, b, + bch_ptr_bad))) { + ret = btree(map_nodes_recurse, k, b, + op, from, fn, flags); + from = NULL; + + if (ret != 
MAP_CONTINUE) + return ret; + } + } + + if (!b->level || flags == MAP_ALL_NODES) + ret = fn(op, b); + + return ret; +} + +int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_nodes_fn *fn, int flags) +{ + int ret = btree_root(map_nodes_recurse, c, op, from, fn, flags); + if (closure_blocking(&op->cl)) + closure_sync(&op->cl); + return ret; +} + +static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, btree_map_keys_fn *fn, + int flags) +{ + int ret = MAP_CONTINUE; + struct bkey *k; + struct btree_iter iter; + + bch_btree_iter_init(b, &iter, from); + + while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) { + ret = !b->level + ? fn(op, b, k) + : btree(map_keys_recurse, k, b, op, from, fn, flags); + from = NULL; + + if (ret != MAP_CONTINUE) + return ret; + } + + if (!b->level && (flags & MAP_END_KEY)) + ret = fn(op, b, &KEY(KEY_INODE(&b->key), + KEY_OFFSET(&b->key), 0)); + + return ret; +} + +int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_keys_fn *fn, int flags) +{ + int ret = btree_root(map_keys_recurse, c, op, from, fn, flags); + if (closure_blocking(&op->cl)) + closure_sync(&op->cl); + return ret; +} + /* Keybuf code */ static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) @@ -2314,74 +2390,70 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); } -static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, - struct keybuf *buf, struct bkey *end, - keybuf_pred_fn *pred) -{ - struct btree_iter iter; - bch_btree_iter_init(b, &iter, &buf->last_scanned); - - while (!array_freelist_empty(&buf->freelist)) { - struct bkey *k = bch_btree_iter_next_filter(&iter, b, - bch_ptr_bad); - - if (!b->level) { - if (!k) { - buf->last_scanned = b->key; - break; - } +struct refill { + struct btree_op op; + struct keybuf *buf; + struct bkey *end; + keybuf_pred_fn *pred; +}; - buf->last_scanned = *k; - if (bkey_cmp(&buf->last_scanned, end) >= 0) - break; +static int refill_keybuf_fn(struct btree_op *op, struct btree *b, + struct bkey *k) +{ + struct refill *refill = container_of(op, struct refill, op); + struct keybuf *buf = refill->buf; + int ret = MAP_CONTINUE; - if (pred(buf, k)) { - struct keybuf_key *w; + if (bkey_cmp(k, refill->end) >= 0) { + ret = MAP_DONE; + goto out; + } - spin_lock(&buf->lock); + if (!KEY_SIZE(k)) /* end key */ + goto out; - w = array_alloc(&buf->freelist); + if (refill->pred(buf, k)) { + struct keybuf_key *w; - w->private = NULL; - bkey_copy(&w->key, k); + spin_lock(&buf->lock); - if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) - array_free(&buf->freelist, w); + w = array_alloc(&buf->freelist); + if (!w) { + spin_unlock(&buf->lock); + return MAP_DONE; + } - spin_unlock(&buf->lock); - } - } else { - if (!k) - break; + w->private = NULL; + bkey_copy(&w->key, k); - btree(refill_keybuf, k, b, op, buf, end, pred); - /* - * Might get an error here, but can't really do anything - * and it'll get logged elsewhere. Just read what we - * can. 
- */ + if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) + array_free(&buf->freelist, w); - if (bkey_cmp(&buf->last_scanned, end) >= 0) - break; + if (array_freelist_empty(&buf->freelist)) + ret = MAP_DONE; - cond_resched(); - } + spin_unlock(&buf->lock); } - - return 0; +out: + buf->last_scanned = *k; + return ret; } void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, struct bkey *end, keybuf_pred_fn *pred) { struct bkey start = buf->last_scanned; - struct btree_op op; - bch_btree_op_init_stack(&op); + struct refill refill; cond_resched(); - btree_root(refill_keybuf, c, &op, buf, end, pred); - closure_sync(&op.cl); + bch_btree_op_init_stack(&refill.op); + refill.buf = buf; + refill.end = end; + refill.pred = pred; + + bch_btree_map_keys(&refill.op, c, &buf->last_scanned, + refill_keybuf_fn, MAP_END_KEY); pr_debug("found %s keys from %llu:%llu to %llu:%llu", RB_EMPTY_ROOT(&buf->keys) ? "no" : @@ -2465,9 +2537,9 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf) } struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, - struct keybuf *buf, - struct bkey *end, - keybuf_pred_fn *pred) + struct keybuf *buf, + struct bkey *end, + keybuf_pred_fn *pred) { struct keybuf_key *ret; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index fa9641aaed39..cafdeb01e219 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -400,9 +400,42 @@ static inline void wake_up_gc(struct cache_set *c) wake_up_process(c->gc_thread); } +#define MAP_DONE 0 +#define MAP_CONTINUE 1 + +#define MAP_ALL_NODES 0 +#define MAP_LEAF_NODES 1 + +#define MAP_END_KEY 1 + +typedef int (btree_map_nodes_fn)(struct btree_op *, struct btree *); +int __bch_btree_map_nodes(struct btree_op *, struct cache_set *, + struct bkey *, btree_map_nodes_fn *, int); + +static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_nodes_fn *fn) +{ + return __bch_btree_map_nodes(op, c, from, fn, MAP_ALL_NODES); +} + +static inline int bch_btree_map_leaf_nodes(struct btree_op *op, + struct cache_set *c, + struct bkey *from, + btree_map_nodes_fn *fn) +{ + return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES); +} + +typedef int (btree_map_keys_fn)(struct btree_op *, struct btree *, + struct bkey *); +int bch_btree_map_keys(struct btree_op *, struct cache_set *, + struct bkey *, btree_map_keys_fn *, int); + +typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); + void bch_keybuf_init(struct keybuf *); -void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *, - keybuf_pred_fn *); +void bch_refill_keybuf(struct cache_set *, struct keybuf *, + struct bkey *, keybuf_pred_fn *); bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, struct bkey *); void bch_keybuf_del(struct keybuf *, struct keybuf_key *); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 4392f3f38d62..c68de9f12618 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -433,31 +433,17 @@ static int bch_writeback_thread(void *arg) /* Init */ -static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, - struct cached_dev *dc) +static int sectors_dirty_init_fn(struct btree_op *op, struct btree *b, + struct bkey *k) { - struct bkey *k; - struct btree_iter iter; - - bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0)); - while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) - if (!b->level) { - if (KEY_INODE(k) > dc->disk.id) - break; - - if (KEY_DIRTY(k)) - 
bcache_dev_sectors_dirty_add(b->c, dc->disk.id, - KEY_START(k), - KEY_SIZE(k)); - } else { - btree(sectors_dirty_init, k, b, op, dc); - if (KEY_INODE(k) > dc->disk.id) - break; - - cond_resched(); - } + if (KEY_INODE(k) > op->inode) + return MAP_DONE; - return 0; + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), + KEY_START(k), KEY_SIZE(k)); + + return MAP_CONTINUE; } void bch_sectors_dirty_init(struct cached_dev *dc) @@ -465,7 +451,10 @@ void bch_sectors_dirty_init(struct cached_dev *dc) struct btree_op op; bch_btree_op_init_stack(&op); - btree_root(sectors_dirty_init, dc->disk.c, &op, dc); + op.inode = dc->disk.id; + + bch_btree_map_keys(&op, dc->disk.c, &KEY(op.inode, 0, 0), + sectors_dirty_init_fn, 0); } int bch_cached_dev_writeback_init(struct cached_dev *dc) -- cgit v1.2.3 From df8e89701fb02cba6e09c5f46f002778b5b52dd2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:37:59 -0700 Subject: bcache: Move some stuff to btree.c With the new btree_map() functions, we don't need to export the stuff needed for traversing the btree anymore. Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 95 ++++++++++++++++++++++++++++++++++++++++++++- drivers/md/bcache/btree.h | 82 +------------------------------------- drivers/md/bcache/request.c | 16 +------- 3 files changed, 96 insertions(+), 97 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index cfbdcf349b7f..bfd60e6a2312 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -99,6 +99,13 @@ static const char *op_type(struct btree_op *op) return op_types[op->type]; } +enum { + BTREE_INSERT_STATUS_INSERT, + BTREE_INSERT_STATUS_BACK_MERGE, + BTREE_INSERT_STATUS_OVERWROTE, + BTREE_INSERT_STATUS_FRONT_MERGE, +}; + #define MAX_NEED_GC 64 #define MAX_SAVE_PRIO 72 @@ -116,6 +123,78 @@ void bch_btree_op_init_stack(struct btree_op *op) op->lock = -1; } +static inline bool should_split(struct btree *b) +{ + struct bset *i = write_block(b); + return b->written >= btree_blocks(b) || + (b->written + __set_blocks(i, i->keys + 15, b->c) + > btree_blocks(b)); +} + +#define insert_lock(s, b) ((b)->level <= (s)->lock) + +/* + * These macros are for recursing down the btree - they handle the details of + * locking and looking up nodes in the cache for you. They're best treated as + * mere syntax when reading code that uses them. + * + * op->lock determines whether we take a read or a write lock at a given depth. + * If you've got a read lock and find that you need a write lock (i.e. you're + * going to have to split), set op->lock and return -EINTR; btree_root() will + * call you again and you'll have the correct lock. + */ + +/** + * btree - recurse down the btree on a specified key + * @fn: function to call, which will be passed the child node + * @key: key to recurse on + * @b: parent btree node + * @op: pointer to struct btree_op + */ +#define btree(fn, key, b, op, ...) \ +({ \ + int _r, l = (b)->level - 1; \ + bool _w = l <= (op)->lock; \ + struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \ + if (!IS_ERR(_child)) { \ + _child->parent = (b); \ + _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ + rw_unlock(_w, _child); \ + } else \ + _r = PTR_ERR(_child); \ + _r; \ +}) + +/** + * btree_root - call a function on the root of the btree + * @fn: function to call, which will be passed the child node + * @c: cache set + * @op: pointer to struct btree_op + */ +#define btree_root(fn, c, op, ...) 
\ +({ \ + int _r = -EINTR; \ + do { \ + struct btree *_b = (c)->root; \ + bool _w = insert_lock(op, _b); \ + rw_lock(_w, _b, _b->level); \ + if (_b == (c)->root && \ + _w == insert_lock(op, _b)) { \ + _b->parent = NULL; \ + _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ + } \ + rw_unlock(_w, _b); \ + bch_cannibalize_unlock(c); \ + if (_r == -ENOSPC) { \ + wait_event((c)->try_wait, \ + !(c)->try_harder); \ + _r = -EINTR; \ + } \ + } while (_r == -EINTR); \ + \ + _r; \ +}) + /* Btree key manipulation */ void __bkey_put(struct cache_set *c, struct bkey *k) @@ -811,7 +890,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) * cannibalize_bucket() will take. This means every time we unlock the root of * the btree, we need to release this lock if we have it held. */ -void bch_cannibalize_unlock(struct cache_set *c) +static void bch_cannibalize_unlock(struct cache_set *c) { if (c->try_harder == current) { bch_time_stats_update(&c->try_harder_time, c->try_harder_start); @@ -2262,7 +2341,7 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, return 0; } -int bch_btree_search_recurse(struct btree *b, struct btree_op *op) +static int bch_btree_search_recurse(struct btree *b, struct btree_op *op) { struct search *s = container_of(op, struct search, op); struct bio *bio = &s->bio.bio; @@ -2296,6 +2375,18 @@ int bch_btree_search_recurse(struct btree *b, struct btree_op *op) return ret; } +void bch_btree_search_async(struct closure *cl) +{ + struct btree_op *op = container_of(cl, struct btree_op, cl); + + int ret = btree_root(search_recurse, op->c, op); + + if (ret == -EAGAIN) + continue_at(cl, bch_btree_search_async, bcache_wq); + + closure_return(cl); +} + /* Map across nodes or keys */ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index cafdeb01e219..1690f4731c1e 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -270,13 +270,6 @@ struct btree_op { BKEY_PADDED(replace); }; -enum { - BTREE_INSERT_STATUS_INSERT, - BTREE_INSERT_STATUS_BACK_MERGE, - BTREE_INSERT_STATUS_OVERWROTE, - BTREE_INSERT_STATUS_FRONT_MERGE, -}; - void bch_btree_op_init_stack(struct btree_op *); static inline void rw_lock(bool w, struct btree *b, int level) @@ -302,82 +295,9 @@ static inline void rw_unlock(bool w, struct btree *b) (w ? up_write : up_read)(&b->lock); } -#define insert_lock(s, b) ((b)->level <= (s)->lock) - -/* - * These macros are for recursing down the btree - they handle the details of - * locking and looking up nodes in the cache for you. They're best treated as - * mere syntax when reading code that uses them. - * - * op->lock determines whether we take a read or a write lock at a given depth. - * If you've got a read lock and find that you need a write lock (i.e. you're - * going to have to split), set op->lock and return -EINTR; btree_root() will - * call you again and you'll have the correct lock. - */ - -/** - * btree - recurse down the btree on a specified key - * @fn: function to call, which will be passed the child node - * @key: key to recurse on - * @b: parent btree node - * @op: pointer to struct btree_op - */ -#define btree(fn, key, b, op, ...) 
\ -({ \ - int _r, l = (b)->level - 1; \ - bool _w = l <= (op)->lock; \ - struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \ - if (!IS_ERR(_child)) { \ - _child->parent = (b); \ - _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ - rw_unlock(_w, _child); \ - } else \ - _r = PTR_ERR(_child); \ - _r; \ -}) - -/** - * btree_root - call a function on the root of the btree - * @fn: function to call, which will be passed the child node - * @c: cache set - * @op: pointer to struct btree_op - */ -#define btree_root(fn, c, op, ...) \ -({ \ - int _r = -EINTR; \ - do { \ - struct btree *_b = (c)->root; \ - bool _w = insert_lock(op, _b); \ - rw_lock(_w, _b, _b->level); \ - if (_b == (c)->root && \ - _w == insert_lock(op, _b)) { \ - _b->parent = NULL; \ - _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ - } \ - rw_unlock(_w, _b); \ - bch_cannibalize_unlock(c); \ - if (_r == -ENOSPC) { \ - wait_event((c)->try_wait, \ - !(c)->try_harder); \ - _r = -EINTR; \ - } \ - } while (_r == -EINTR); \ - \ - _r; \ -}) - -static inline bool should_split(struct btree *b) -{ - struct bset *i = write_block(b); - return b->written >= btree_blocks(b) || - (b->written + __set_blocks(i, i->keys + 15, b->c) - > btree_blocks(b)); -} - void bch_btree_node_read(struct btree *); void bch_btree_node_write(struct btree *, struct closure *); -void bch_cannibalize_unlock(struct cache_set *); void bch_btree_set_root(struct btree *); struct btree *bch_btree_node_alloc(struct cache_set *, int); struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); @@ -386,7 +306,7 @@ int bch_btree_insert_check_key(struct btree *, struct btree_op *, struct bkey *); int bch_btree_insert(struct btree_op *, struct cache_set *, struct keylist *); -int bch_btree_search_recurse(struct btree *, struct btree_op *); +void bch_btree_search_async(struct closure *); int bch_gc_thread_start(struct cache_set *); size_t bch_btree_gc_finish(struct cache_set *); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index f779eb420d69..a16872541038 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -754,18 +754,6 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d) return s; } -static void btree_read_async(struct closure *cl) -{ - struct btree_op *op = container_of(cl, struct btree_op, cl); - - int ret = btree_root(search_recurse, op->c, op); - - if (ret == -EAGAIN) - continue_at(cl, btree_read_async, bcache_wq); - - closure_return(cl); -} - /* Cached devices */ static void cached_dev_bio_complete(struct closure *cl) @@ -1087,7 +1075,7 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; - closure_call(&s->op.cl, btree_read_async, NULL, cl); + closure_call(&s->op.cl, bch_btree_search_async, NULL, cl); continue_at(cl, cached_dev_read_done_bh, NULL); } @@ -1351,7 +1339,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) closure_call(&s->op.cl, bch_data_insert, NULL, cl); } else { - closure_call(&s->op.cl, btree_read_async, NULL, cl); + closure_call(&s->op.cl, bch_btree_search_async, NULL, cl); } continue_at(cl, search_free, NULL); -- cgit v1.2.3 From 2c1953e201a05ddfb1ea53f23d81a492c6513028 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:41:08 -0700 Subject: bcache: Convert bch_btree_read_async() to bch_btree_map_keys() This is a fairly straightforward conversion, mostly reshuffling - op->lookup_done goes away, replaced by MAP_DONE/MAP_CONTINUE. 
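In before/after form the conversion is roughly this - a condensed sketch of the flash_dev_cache_miss() hunk further down, not a complete function:

	/* before: completion was signalled through a flag on btree_op */
	if (!bio->bi_size)
		s->op.lookup_done = true;
	return 0;

	/* after: the callback's return value drives the traversal */
	if (!bio->bi_size)
		return MAP_DONE;
	return MAP_CONTINUE;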
And the code for handling cache hits and misses wasn't really btree code, so it gets moved to request.c. Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 133 -------------------------------------- drivers/md/bcache/btree.h | 3 - drivers/md/bcache/journal.c | 1 - drivers/md/bcache/request.c | 154 +++++++++++++++++++++++++++++++++++--------- drivers/md/bcache/request.h | 2 - 5 files changed, 125 insertions(+), 168 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index bfd60e6a2312..3949673cb1b0 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -23,7 +23,6 @@ #include "bcache.h" #include "btree.h" #include "debug.h" -#include "request.h" #include "writeback.h" #include @@ -2255,138 +2254,6 @@ void bch_btree_set_root(struct btree *b) closure_sync(&cl); } -/* Cache lookup */ - -static int submit_partial_cache_miss(struct btree *b, struct btree_op *op, - struct bkey *k) -{ - struct search *s = container_of(op, struct search, op); - struct bio *bio = &s->bio.bio; - int ret = 0; - - while (!ret && - !op->lookup_done) { - unsigned sectors = INT_MAX; - - if (KEY_INODE(k) == op->inode) { - if (KEY_START(k) <= bio->bi_sector) - break; - - sectors = min_t(uint64_t, sectors, - KEY_START(k) - bio->bi_sector); - } - - ret = s->d->cache_miss(b, s, bio, sectors); - } - - return ret; -} - -/* - * Read from a single key, handling the initial cache miss if the key starts in - * the middle of the bio - */ -static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, - struct bkey *k) -{ - struct search *s = container_of(op, struct search, op); - struct bio *bio = &s->bio.bio; - unsigned ptr; - struct bio *n; - - int ret = submit_partial_cache_miss(b, op, k); - if (ret || op->lookup_done) - return ret; - - /* XXX: figure out best pointer - for multiple cache devices */ - ptr = 0; - - PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; - - while (!op->lookup_done && - KEY_INODE(k) == op->inode && - bio->bi_sector < KEY_OFFSET(k)) { - struct bkey *bio_key; - sector_t sector = PTR_OFFSET(k, ptr) + - (bio->bi_sector - KEY_START(k)); - unsigned sectors = min_t(uint64_t, INT_MAX, - KEY_OFFSET(k) - bio->bi_sector); - - n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); - if (n == bio) - op->lookup_done = true; - - bio_key = &container_of(n, struct bbio, bio)->key; - - /* - * The bucket we're reading from might be reused while our bio - * is in flight, and we could then end up reading the wrong - * data. - * - * We guard against this by checking (in cache_read_endio()) if - * the pointer is stale again; if so, we treat it as an error - * and reread from the backing device (but we don't pass that - * error up anywhere). 
- */ - - bch_bkey_copy_single_ptr(bio_key, k, ptr); - SET_PTR_OFFSET(bio_key, 0, sector); - - n->bi_end_io = bch_cache_read_endio; - n->bi_private = &s->cl; - - __bch_submit_bbio(n, b->c); - } - - return 0; -} - -static int bch_btree_search_recurse(struct btree *b, struct btree_op *op) -{ - struct search *s = container_of(op, struct search, op); - struct bio *bio = &s->bio.bio; - - int ret = 0; - struct bkey *k; - struct btree_iter iter; - bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); - - do { - k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); - if (!k) { - /* - * b->key would be exactly what we want, except that - * pointers to btree nodes have nonzero size - we - * wouldn't go far enough - */ - - ret = submit_partial_cache_miss(b, op, - &KEY(KEY_INODE(&b->key), - KEY_OFFSET(&b->key), 0)); - break; - } - - ret = b->level - ? btree(search_recurse, k, b, op) - : submit_partial_cache_hit(b, op, k); - } while (!ret && - !op->lookup_done); - - return ret; -} - -void bch_btree_search_async(struct closure *cl) -{ - struct btree_op *op = container_of(cl, struct btree_op, cl); - - int ret = btree_root(search_recurse, op->c, op); - - if (ret == -EAGAIN) - continue_at(cl, bch_btree_search_async, bcache_wq); - - closure_return(cl); -} - /* Map across nodes or keys */ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 1690f4731c1e..60dadd722ace 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -264,7 +264,6 @@ struct btree_op { unsigned flush_journal:1; unsigned insert_data_done:1; - unsigned lookup_done:1; unsigned insert_collision:1; BKEY_PADDED(replace); @@ -306,8 +305,6 @@ int bch_btree_insert_check_key(struct btree *, struct btree_op *, struct bkey *); int bch_btree_insert(struct btree_op *, struct cache_set *, struct keylist *); -void bch_btree_search_async(struct closure *); - int bch_gc_thread_start(struct cache_set *); size_t bch_btree_gc_finish(struct cache_set *); void bch_moving_gc(struct cache_set *); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 8866f8ee3a07..6f4daf031410 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -7,7 +7,6 @@ #include "bcache.h" #include "btree.h" #include "debug.h" -#include "request.h" #include diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index a16872541038..854743e85e76 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -638,24 +638,9 @@ void bch_data_insert(struct closure *cl) bch_data_insert_start(cl); } -/* Common code for the make_request functions */ +/* Cache lookup */ -static void request_endio(struct bio *bio, int error) -{ - struct closure *cl = bio->bi_private; - - if (error) { - struct search *s = container_of(cl, struct search, cl); - s->error = error; - /* Only cache read errors are recoverable */ - s->recoverable = false; - } - - bio_put(bio); - closure_put(cl); -} - -void bch_cache_read_endio(struct bio *bio, int error) +static void bch_cache_read_endio(struct bio *bio, int error) { struct bbio *b = container_of(bio, struct bbio, bio); struct closure *cl = bio->bi_private; @@ -678,6 +663,120 @@ void bch_cache_read_endio(struct bio *bio, int error) bch_bbio_endio(s->op.c, bio, error, "reading from cache"); } +static int submit_partial_cache_miss(struct btree *b, struct search *s, + struct bkey *k) +{ + struct bio *bio = &s->bio.bio; + int ret = MAP_CONTINUE; + + do { + unsigned sectors = 
INT_MAX; + + if (KEY_INODE(k) == s->op.inode) { + if (KEY_START(k) <= bio->bi_sector) + break; + + sectors = min_t(uint64_t, sectors, + KEY_START(k) - bio->bi_sector); + } + + ret = s->d->cache_miss(b, s, bio, sectors); + } while (ret == MAP_CONTINUE); + + return ret; +} + +/* + * Read from a single key, handling the initial cache miss if the key starts in + * the middle of the bio + */ +static int submit_partial_cache_hit(struct btree_op *op, struct btree *b, + struct bkey *k) +{ + struct search *s = container_of(op, struct search, op); + struct bio *bio = &s->bio.bio; + unsigned ptr; + struct bio *n; + + int ret = submit_partial_cache_miss(b, s, k); + if (ret != MAP_CONTINUE || !KEY_SIZE(k)) + return ret; + + /* XXX: figure out best pointer - for multiple cache devices */ + ptr = 0; + + PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; + + while (ret == MAP_CONTINUE && + KEY_INODE(k) == op->inode && + bio->bi_sector < KEY_OFFSET(k)) { + struct bkey *bio_key; + sector_t sector = PTR_OFFSET(k, ptr) + + (bio->bi_sector - KEY_START(k)); + unsigned sectors = min_t(uint64_t, INT_MAX, + KEY_OFFSET(k) - bio->bi_sector); + + n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); + if (n == bio) + ret = MAP_DONE; + + bio_key = &container_of(n, struct bbio, bio)->key; + + /* + * The bucket we're reading from might be reused while our bio + * is in flight, and we could then end up reading the wrong + * data. + * + * We guard against this by checking (in cache_read_endio()) if + * the pointer is stale again; if so, we treat it as an error + * and reread from the backing device (but we don't pass that + * error up anywhere). + */ + + bch_bkey_copy_single_ptr(bio_key, k, ptr); + SET_PTR_OFFSET(bio_key, 0, sector); + + n->bi_end_io = bch_cache_read_endio; + n->bi_private = &s->cl; + + __bch_submit_bbio(n, b->c); + } + + return ret; +} + +static void cache_lookup(struct closure *cl) +{ + struct btree_op *op = container_of(cl, struct btree_op, cl); + struct search *s = container_of(op, struct search, op); + struct bio *bio = &s->bio.bio; + + int ret = bch_btree_map_keys(op, op->c, + &KEY(op->inode, bio->bi_sector, 0), + submit_partial_cache_hit, 1); + if (ret == -EAGAIN) + continue_at(cl, cache_lookup, bcache_wq); + + closure_return(cl); +} + +/* Common code for the make_request functions */ + +static void request_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + + if (error) { + struct search *s = container_of(cl, struct search, cl); + s->error = error; + /* Only cache read errors are recoverable */ + s->recoverable = false; + } + + bio_put(bio); + closure_put(cl); +} + static void bio_complete(struct search *s) { if (s->orig_bio) { @@ -1005,15 +1104,14 @@ static void cached_dev_read_done_bh(struct closure *cl) static int cached_dev_cache_miss(struct btree *b, struct search *s, struct bio *bio, unsigned sectors) { - int ret = 0; + int ret = MAP_CONTINUE; unsigned reada = 0; struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); struct bio *miss, *cache_bio; if (s->cache_miss || s->op.bypass) { miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); - if (miss == bio) - s->op.lookup_done = true; + ret = miss == bio ? 
MAP_DONE : MAP_CONTINUE; goto out_submit; } @@ -1033,11 +1131,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, return ret; miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); - if (miss == bio) - s->op.lookup_done = true; - else - /* btree_search_recurse()'s btree iterator is no good anymore */ - ret = -EINTR; + + /* btree_search_recurse()'s btree iterator is no good anymore */ + ret = miss == bio ? MAP_DONE : -EINTR; cache_bio = bio_alloc_bioset(GFP_NOWAIT, DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), @@ -1075,7 +1171,7 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; - closure_call(&s->op.cl, bch_btree_search_async, NULL, cl); + closure_call(&s->op.cl, cache_lookup, NULL, cl); continue_at(cl, cached_dev_read_done_bh, NULL); } @@ -1287,9 +1383,9 @@ static int flash_dev_cache_miss(struct btree *b, struct search *s, bio_advance(bio, min(sectors << 9, bio->bi_size)); if (!bio->bi_size) - s->op.lookup_done = true; + return MAP_DONE; - return 0; + return MAP_CONTINUE; } static void flash_dev_nodata(struct closure *cl) @@ -1339,7 +1435,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) closure_call(&s->op.cl, bch_data_insert, NULL, cl); } else { - closure_call(&s->op.cl, bch_btree_search_async, NULL, cl); + closure_call(&s->op.cl, cache_lookup, NULL, cl); } continue_at(cl, search_free, NULL); diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 7d02ac5f936e..b0b4b0b5b7e9 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -31,10 +31,8 @@ struct search { struct keylist insert_keys; }; -void bch_cache_read_endio(struct bio *, int); unsigned bch_get_congested(struct cache_set *); void bch_data_insert(struct closure *cl); -void bch_cache_read_endio(struct bio *, int); void bch_open_buckets_free(struct cache_set *); int bch_open_buckets_alloc(struct cache_set *); -- cgit v1.2.3 From cc23196631fbcd1bc3eafedbb712413fdbf946a3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:41:13 -0700 Subject: bcache: Clean up cache_lookup_fn There was some looping in submit_partial_cache_hit() and submit_partial_cache_miss() that isn't needed anymore - originally, we wouldn't necessarily process the full hit or miss all at once because when splitting the bio, we took into account the restrictions of the device we were sending it to. But device bio size restrictions are now handled elsewhere, with a wrapper around generic_make_request() - so that looping has been unnecessary for a while now and we can now do quite a bit of cleanup. And if we trim the key we're reading from to match the subset we're actually reading, we don't have to explicitly calculate bi_sector anymore. Neat.
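For reference, the callback contract this series converges on - a minimal illustrative sketch, not code from the patch; example_fn and example_inode are made-up names, and cache_lookup_fn() in the diff below is the real implementation:

	static int example_fn(struct btree_op *op, struct btree *b,
			      struct bkey *k)
	{
		if (KEY_INODE(k) > example_inode)
			return MAP_DONE;	/* stop the traversal */

		/* ... process the extent key @k ... */

		return MAP_CONTINUE;		/* deliver the next key */
	}

	ret = bch_btree_map_keys(&op, c, &KEY(example_inode, 0, 0),
				 example_fn, MAP_END_KEY);

Passing MAP_END_KEY additionally hands the callback a zero-size key at the end of each btree node, which is how a miss extending past a node's last extent gets handled.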
Signed-off-by: Kent Overstreet --- drivers/md/bcache/request.c | 108 +++++++++++++++++++------------------------- 1 file changed, 46 insertions(+), 62 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 854743e85e76..de3fc76ffcfc 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -663,86 +663,70 @@ static void bch_cache_read_endio(struct bio *bio, int error) bch_bbio_endio(s->op.c, bio, error, "reading from cache"); } -static int submit_partial_cache_miss(struct btree *b, struct search *s, - struct bkey *k) -{ - struct bio *bio = &s->bio.bio; - int ret = MAP_CONTINUE; - - do { - unsigned sectors = INT_MAX; - - if (KEY_INODE(k) == s->op.inode) { - if (KEY_START(k) <= bio->bi_sector) - break; - - sectors = min_t(uint64_t, sectors, - KEY_START(k) - bio->bi_sector); - } - - ret = s->d->cache_miss(b, s, bio, sectors); - } while (ret == MAP_CONTINUE); - - return ret; -} - /* * Read from a single key, handling the initial cache miss if the key starts in * the middle of the bio */ -static int submit_partial_cache_hit(struct btree_op *op, struct btree *b, - struct bkey *k) +static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) { struct search *s = container_of(op, struct search, op); - struct bio *bio = &s->bio.bio; + struct bio *n, *bio = &s->bio.bio; + struct bkey *bio_key; unsigned ptr; - struct bio *n; - int ret = submit_partial_cache_miss(b, s, k); - if (ret != MAP_CONTINUE || !KEY_SIZE(k)) - return ret; + if (bkey_cmp(k, &KEY(op->inode, bio->bi_sector, 0)) <= 0) + return MAP_CONTINUE; + + if (KEY_INODE(k) != s->op.inode || + KEY_START(k) > bio->bi_sector) { + unsigned bio_sectors = bio_sectors(bio); + unsigned sectors = KEY_INODE(k) == s->op.inode + ? min_t(uint64_t, INT_MAX, + KEY_START(k) - bio->bi_sector) + : INT_MAX; + + int ret = s->d->cache_miss(b, s, bio, sectors); + if (ret != MAP_CONTINUE) + return ret; + + /* if this was a complete miss we shouldn't get here */ + BUG_ON(bio_sectors <= sectors); + } + + if (!KEY_SIZE(k)) + return MAP_CONTINUE; /* XXX: figure out best pointer - for multiple cache devices */ ptr = 0; PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; - while (ret == MAP_CONTINUE && - KEY_INODE(k) == op->inode && - bio->bi_sector < KEY_OFFSET(k)) { - struct bkey *bio_key; - sector_t sector = PTR_OFFSET(k, ptr) + - (bio->bi_sector - KEY_START(k)); - unsigned sectors = min_t(uint64_t, INT_MAX, - KEY_OFFSET(k) - bio->bi_sector); - - n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); - if (n == bio) - ret = MAP_DONE; - - bio_key = &container_of(n, struct bbio, bio)->key; + n = bch_bio_split(bio, min_t(uint64_t, INT_MAX, + KEY_OFFSET(k) - bio->bi_sector), + GFP_NOIO, s->d->bio_split); - /* - * The bucket we're reading from might be reused while our bio - * is in flight, and we could then end up reading the wrong - * data. - * - * We guard against this by checking (in cache_read_endio()) if - * the pointer is stale again; if so, we treat it as an error - * and reread from the backing device (but we don't pass that - * error up anywhere). 
- */ + bio_key = &container_of(n, struct bbio, bio)->key; + bch_bkey_copy_single_ptr(bio_key, k, ptr); - bch_bkey_copy_single_ptr(bio_key, k, ptr); - SET_PTR_OFFSET(bio_key, 0, sector); + bch_cut_front(&KEY(s->op.inode, n->bi_sector, 0), bio_key); + bch_cut_back(&KEY(s->op.inode, bio_end_sector(n), 0), bio_key); - n->bi_end_io = bch_cache_read_endio; - n->bi_private = &s->cl; + n->bi_end_io = bch_cache_read_endio; + n->bi_private = &s->cl; - __bch_submit_bbio(n, b->c); - } + /* + * The bucket we're reading from might be reused while our bio + * is in flight, and we could then end up reading the wrong + * data. + * + * We guard against this by checking (in cache_read_endio()) if + * the pointer is stale again; if so, we treat it as an error + * and reread from the backing device (but we don't pass that + * error up anywhere). + */ - return ret; + __bch_submit_bbio(n, b->c); + return n == bio ? MAP_DONE : MAP_CONTINUE; } static void cache_lookup(struct closure *cl) @@ -753,7 +737,7 @@ static void cache_lookup(struct closure *cl) int ret = bch_btree_map_keys(op, op->c, &KEY(op->inode, bio->bi_sector, 0), - submit_partial_cache_hit, 1); + cache_lookup_fn, MAP_END_KEY); if (ret == -EAGAIN) continue_at(cl, cache_lookup, bcache_wq); -- cgit v1.2.3 From c18536a72ddd7fe30d63e6c1500b5c930ac14594 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:44:17 -0700 Subject: bcache: Prune struct btree_op Eventual goal is for struct btree_op to contain only what is necessary for traversing the btree. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bset.c | 1 - drivers/md/bcache/btree.c | 32 ++++---- drivers/md/bcache/btree.h | 21 +---- drivers/md/bcache/journal.c | 32 ++++---- drivers/md/bcache/journal.h | 9 +-- drivers/md/bcache/movinggc.c | 18 ++--- drivers/md/bcache/request.c | 177 +++++++++++++++++++++--------------------- drivers/md/bcache/request.h | 14 ++++ drivers/md/bcache/stats.c | 8 +- drivers/md/bcache/super.c | 21 +++-- drivers/md/bcache/writeback.c | 17 ++-- 11 files changed, 179 insertions(+), 171 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 14c2a23d3884..fae5b7b3f5ab 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1197,7 +1197,6 @@ int bch_bset_print_stats(struct cache_set *c, char *buf) memset(&t, 0, sizeof(struct bset_stats)); bch_btree_op_init_stack(&t.op); - t.op.c = c; ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats); if (ret < 0) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 3949673cb1b0..5cb59c313dc3 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -503,7 +503,7 @@ static void btree_node_write_work(struct work_struct *w) rw_unlock(true, b); } -static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) +static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) { struct bset *i = b->sets[b->nsets].data; struct btree_write *w = btree_current_write(b); @@ -516,15 +516,15 @@ static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) set_btree_node_dirty(b); - if (op->journal) { + if (journal_ref) { if (w->journal && - journal_pin_cmp(b->c, w, op)) { + journal_pin_cmp(b->c, w->journal, journal_ref)) { atomic_dec_bug(w->journal); w->journal = NULL; } if (!w->journal) { - w->journal = op->journal; + w->journal = journal_ref; atomic_inc(w->journal); } } @@ -1663,13 +1663,16 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, return 0; } -int 
bch_btree_check(struct cache_set *c, struct btree_op *op) +int bch_btree_check(struct cache_set *c) { int ret = -ENOMEM; unsigned i; unsigned long *seen[MAX_CACHES_PER_SET]; + struct btree_op op; memset(seen, 0, sizeof(seen)); + bch_btree_op_init_stack(&op); + op.lock = SHRT_MAX; for (i = 0; c->cache[i]; i++) { size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); @@ -1681,7 +1684,7 @@ int bch_btree_check(struct cache_set *c, struct btree_op *op) memset(seen[i], 0xFF, n); } - ret = btree_root(check_recurse, c, op, seen); + ret = btree_root(check_recurse, c, &op, seen); err: for (i = 0; i < MAX_CACHES_PER_SET; i++) kfree(seen[i]); @@ -2091,7 +2094,8 @@ err: } static int bch_btree_insert_node(struct btree *b, struct btree_op *op, - struct keylist *insert_keys) + struct keylist *insert_keys, + atomic_t *journal_ref) { int ret = 0; struct keylist split_keys; @@ -2123,7 +2127,7 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, if (bch_btree_insert_keys(b, op, insert_keys)) { if (!b->level) - bch_btree_leaf_dirty(b, op); + bch_btree_leaf_dirty(b, journal_ref); else bch_btree_node_write(b, &op->cl); } @@ -2162,7 +2166,7 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, BUG_ON(op->type != BTREE_INSERT); - ret = bch_btree_insert_node(b, op, &insert); + ret = bch_btree_insert_node(b, op, &insert, NULL); BUG_ON(!ret && !bch_keylist_empty(&insert)); out: @@ -2172,7 +2176,7 @@ out: } static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, - struct keylist *keys) + struct keylist *keys, atomic_t *journal_ref) { if (bch_keylist_empty(keys)) return 0; @@ -2189,14 +2193,14 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, return -EIO; } - return btree(insert_recurse, k, b, op, keys); + return btree(insert_recurse, k, b, op, keys, journal_ref); } else { - return bch_btree_insert_node(b, op, keys); + return bch_btree_insert_node(b, op, keys, journal_ref); } } int bch_btree_insert(struct btree_op *op, struct cache_set *c, - struct keylist *keys) + struct keylist *keys, atomic_t *journal_ref) { int ret = 0; @@ -2210,7 +2214,7 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c, while (!bch_keylist_empty(keys)) { op->lock = 0; - ret = btree_root(insert_recurse, c, op, keys); + ret = btree_root(insert_recurse, c, op, keys, journal_ref); if (ret == -EAGAIN) { ret = 0; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 60dadd722ace..3f820b67150c 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -238,17 +238,6 @@ void __bkey_put(struct cache_set *c, struct bkey *k); struct btree_op { struct closure cl; - struct cache_set *c; - - /* Journal entry we have a refcount on */ - atomic_t *journal; - - /* Bio to be inserted into the cache */ - struct bio *cache_bio; - - unsigned inode; - - uint16_t write_prio; /* Btree level at which we start taking write locks */ short lock; @@ -259,11 +248,6 @@ struct btree_op { BTREE_REPLACE } type:8; - unsigned csum:1; - unsigned bypass:1; - unsigned flush_journal:1; - - unsigned insert_data_done:1; unsigned insert_collision:1; BKEY_PADDED(replace); @@ -303,12 +287,13 @@ struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); int bch_btree_insert_check_key(struct btree *, struct btree_op *, struct bkey *); -int bch_btree_insert(struct btree_op *, struct cache_set *, struct keylist *); +int bch_btree_insert(struct btree_op *, struct cache_set *, + struct keylist *, atomic_t *); int bch_gc_thread_start(struct 
cache_set *); size_t bch_btree_gc_finish(struct cache_set *); void bch_moving_gc(struct cache_set *); -int bch_btree_check(struct cache_set *, struct btree_op *); +int bch_btree_check(struct cache_set *); uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); static inline void wake_up_gc(struct cache_set *c) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 6f4daf031410..725c8eb9a62a 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -30,17 +30,20 @@ static void journal_read_endio(struct bio *bio, int error) } static int journal_read_bucket(struct cache *ca, struct list_head *list, - struct btree_op *op, unsigned bucket_index) + unsigned bucket_index) { struct journal_device *ja = &ca->journal; struct bio *bio = &ja->bio; struct journal_replay *i; struct jset *j, *data = ca->set->journal.w[0].data; + struct closure cl; unsigned len, left, offset = 0; int ret = 0; sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); + closure_init_stack(&cl); + pr_debug("reading %llu", (uint64_t) bucket); while (offset < ca->sb.bucket_size) { @@ -54,11 +57,11 @@ reread: left = ca->sb.bucket_size - offset; bio->bi_size = len << 9; bio->bi_end_io = journal_read_endio; - bio->bi_private = &op->cl; + bio->bi_private = &cl; bch_bio_map(bio, data); - closure_bio_submit(bio, &op->cl, ca); - closure_sync(&op->cl); + closure_bio_submit(bio, &cl, ca); + closure_sync(&cl); /* This function could be simpler now since we no longer write * journal entries that overlap bucket boundaries; this means @@ -128,12 +131,11 @@ next_set: return ret; } -int bch_journal_read(struct cache_set *c, struct list_head *list, - struct btree_op *op) +int bch_journal_read(struct cache_set *c, struct list_head *list) { #define read_bucket(b) \ ({ \ - int ret = journal_read_bucket(ca, list, op, b); \ + int ret = journal_read_bucket(ca, list, b); \ __set_bit(b, bitmap); \ if (ret < 0) \ return ret; \ @@ -291,8 +293,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) } } -int bch_journal_replay(struct cache_set *s, struct list_head *list, - struct btree_op *op) +int bch_journal_replay(struct cache_set *s, struct list_head *list) { int ret = 0, keys = 0, entries = 0; struct bkey *k; @@ -301,8 +302,11 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, uint64_t start = i->j.last_seq, end = i->j.seq, n = start; struct keylist keylist; + struct btree_op op; bch_keylist_init(&keylist); + bch_btree_op_init_stack(&op); + op.lock = SHRT_MAX; list_for_each_entry(i, list, list) { BUG_ON(i->pin && atomic_read(i->pin) != 1); @@ -319,9 +323,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, bkey_copy(keylist.top, k); bch_keylist_push(&keylist); - op->journal = i->pin; - - ret = bch_btree_insert(op, s, &keylist); + ret = bch_btree_insert(&op, s, &keylist, i->pin); if (ret) goto err; @@ -346,7 +348,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, kfree(i); } err: - closure_sync(&op->cl); + closure_sync(&op.cl); return ret; } @@ -368,8 +370,8 @@ retry: if (!best) best = b; else if (journal_pin_cmp(c, - btree_current_write(best), - btree_current_write(b))) { + btree_current_write(best)->journal, + btree_current_write(b)->journal)) { best = b; } } diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index 7045e6fd2d5a..5e9edb9ef376 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -189,8 +189,7 @@ struct journal_device { }; #define 
journal_pin_cmp(c, l, r) \ - (fifo_idx(&(c)->journal.pin, (l)->journal) > \ - fifo_idx(&(c)->journal.pin, (r)->journal)) + (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) #define JOURNAL_PIN 20000 @@ -206,10 +205,8 @@ atomic_t *bch_journal(struct cache_set *, struct keylist *, struct closure *); void bch_journal_next(struct journal *); void bch_journal_mark(struct cache_set *, struct list_head *); void bch_journal_meta(struct cache_set *, struct closure *); -int bch_journal_read(struct cache_set *, struct list_head *, - struct btree_op *); -int bch_journal_replay(struct cache_set *, struct list_head *, - struct btree_op *); +int bch_journal_read(struct cache_set *, struct list_head *); +int bch_journal_replay(struct cache_set *, struct list_head *); void bch_journal_free(struct cache_set *); int bch_journal_alloc(struct cache_set *); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 6ba050456ec8..80e30d77221e 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -55,9 +55,9 @@ static void write_moving_finish(struct closure *cl) if (io->s.op.insert_collision) trace_bcache_gc_copy_collision(&io->w->key); - bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); + bch_keybuf_del(&io->s.c->moving_gc_keys, io->w); - up(&io->s.op.c->moving_in_flight); + up(&io->s.c->moving_in_flight); closure_return_with_destructor(cl, moving_io_destructor); } @@ -70,7 +70,7 @@ static void read_moving_endio(struct bio *bio, int error) if (error) io->s.error = error; - bch_bbio_endio(io->s.op.c, bio, error, "reading data to move"); + bch_bbio_endio(io->s.c, bio, error, "reading data to move"); } static void moving_init(struct moving_io *io) @@ -99,11 +99,11 @@ static void write_moving(struct closure *cl) io->bio.bio.bi_sector = KEY_START(&io->w->key); s->op.lock = -1; - s->op.write_prio = 1; - s->op.cache_bio = &io->bio.bio; + s->write_prio = 1; + s->cache_bio = &io->bio.bio; s->writeback = KEY_DIRTY(&io->w->key); - s->op.csum = KEY_CSUM(&io->w->key); + s->csum = KEY_CSUM(&io->w->key); s->op.type = BTREE_REPLACE; bkey_copy(&s->op.replace, &io->w->key); @@ -121,7 +121,7 @@ static void read_moving_submit(struct closure *cl) struct moving_io *io = container_of(s, struct moving_io, s); struct bio *bio = &io->bio.bio; - bch_submit_bbio(bio, s->op.c, &io->w->key, 0); + bch_submit_bbio(bio, s->c, &io->w->key, 0); continue_at(cl, write_moving, system_wq); } @@ -151,8 +151,8 @@ static void read_moving(struct cache_set *c) w->private = io; io->w = w; - io->s.op.inode = KEY_INODE(&w->key); - io->s.op.c = c; + io->s.inode = KEY_INODE(&w->key); + io->s.c = c; moving_init(io); bio = &io->bio.bio; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index de3fc76ffcfc..818e2e39e71f 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -217,6 +217,7 @@ static void bch_data_insert_keys(struct closure *cl) { struct btree_op *op = container_of(cl, struct btree_op, cl); struct search *s = container_of(op, struct search, op); + atomic_t *journal_ref = NULL; /* * If we're looping, might already be waiting on @@ -231,20 +232,19 @@ static void bch_data_insert_keys(struct closure *cl) #endif if (s->write) - op->journal = bch_journal(op->c, &s->insert_keys, - op->flush_journal + journal_ref = bch_journal(s->c, &s->insert_keys, + s->flush_journal ? 
&s->cl : NULL); - if (bch_btree_insert(op, op->c, &s->insert_keys)) { + if (bch_btree_insert(op, s->c, &s->insert_keys, journal_ref)) { s->error = -ENOMEM; - op->insert_data_done = true; + s->insert_data_done = true; } - if (op->journal) - atomic_dec_bug(op->journal); - op->journal = NULL; + if (journal_ref) + atomic_dec_bug(journal_ref); - if (!op->insert_data_done) + if (!s->insert_data_done) continue_at(cl, bch_data_insert_start, bcache_wq); bch_keylist_free(&s->insert_keys); @@ -347,7 +347,7 @@ found: static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, struct search *s) { - struct cache_set *c = s->op.c; + struct cache_set *c = s->c; struct open_bucket *b; BKEY_PADDED(key) alloc; unsigned i; @@ -363,7 +363,7 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, spin_lock(&c->data_bucket_lock); while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) { - unsigned watermark = s->op.write_prio + unsigned watermark = s->write_prio ? WATERMARK_MOVINGGC : WATERMARK_NONE; @@ -435,7 +435,7 @@ static void bch_data_invalidate(struct closure *cl) { struct btree_op *op = container_of(cl, struct btree_op, cl); struct search *s = container_of(op, struct search, op); - struct bio *bio = op->cache_bio; + struct bio *bio = s->cache_bio; pr_debug("invalidating %i sectors from %llu", bio_sectors(bio), (uint64_t) bio->bi_sector); @@ -443,17 +443,17 @@ static void bch_data_invalidate(struct closure *cl) while (bio_sectors(bio)) { unsigned len = min(bio_sectors(bio), 1U << 14); - if (bch_keylist_realloc(&s->insert_keys, 0, op->c)) + if (bch_keylist_realloc(&s->insert_keys, 0, s->c)) goto out; bio->bi_sector += len; bio->bi_size -= len << 9; bch_keylist_add(&s->insert_keys, - &KEY(op->inode, bio->bi_sector, len)); + &KEY(s->inode, bio->bi_sector, len)); } - op->insert_data_done = true; + s->insert_data_done = true; bio_put(bio); out: continue_at(cl, bch_data_insert_keys, bcache_wq); @@ -506,21 +506,21 @@ static void bch_data_insert_endio(struct bio *bio, int error) set_closure_fn(cl, NULL, NULL); } - bch_bbio_endio(op->c, bio, error, "writing data to cache"); + bch_bbio_endio(s->c, bio, error, "writing data to cache"); } static void bch_data_insert_start(struct closure *cl) { struct btree_op *op = container_of(cl, struct btree_op, cl); struct search *s = container_of(op, struct search, op); - struct bio *bio = op->cache_bio, *n; + struct bio *bio = s->cache_bio, *n; - if (op->bypass) + if (s->bypass) return bch_data_invalidate(cl); - if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { - set_gc_sectors(op->c); - wake_up_gc(op->c); + if (atomic_sub_return(bio_sectors(bio), &s->c->sectors_to_gc) < 0) { + set_gc_sectors(s->c); + wake_up_gc(s->c); } /* @@ -533,17 +533,17 @@ static void bch_data_insert_start(struct closure *cl) unsigned i; struct bkey *k; struct bio_set *split = s->d - ? s->d->bio_split : op->c->bio_split; + ? s->d->bio_split : s->c->bio_split; /* 1 for the device pointer and 1 for the chksum */ if (bch_keylist_realloc(&s->insert_keys, - 1 + (op->csum ? 1 : 0), - op->c)) + 1 + (s->csum ? 
1 : 0), + s->c)) continue_at(cl, bch_data_insert_keys, bcache_wq); k = s->insert_keys.top; bkey_init(k); - SET_KEY_INODE(k, op->inode); + SET_KEY_INODE(k, s->inode); SET_KEY_OFFSET(k, bio->bi_sector); if (!bch_alloc_sectors(k, bio_sectors(bio), s)) @@ -558,11 +558,11 @@ static void bch_data_insert_start(struct closure *cl) SET_KEY_DIRTY(k, true); for (i = 0; i < KEY_PTRS(k); i++) - SET_GC_MARK(PTR_BUCKET(op->c, k, i), + SET_GC_MARK(PTR_BUCKET(s->c, k, i), GC_MARK_DIRTY); } - SET_KEY_CSUM(k, op->csum); + SET_KEY_CSUM(k, s->csum); if (KEY_CSUM(k)) bio_csum(n, k); @@ -570,10 +570,10 @@ static void bch_data_insert_start(struct closure *cl) bch_keylist_push(&s->insert_keys); n->bi_rw |= REQ_WRITE; - bch_submit_bbio(n, op->c, k, 0); + bch_submit_bbio(n, s->c, k, 0); } while (n != bio); - op->insert_data_done = true; + s->insert_data_done = true; continue_at(cl, bch_data_insert_keys, bcache_wq); err: /* bch_alloc_sectors() blocks if s->writeback = true */ @@ -592,14 +592,14 @@ err: * we wait for buckets to be freed up, so just invalidate the * rest of the write. */ - op->bypass = true; + s->bypass = true; return bch_data_invalidate(cl); } else { /* * From a cache miss, we can just insert the keys for the data * we have written or bail out if we didn't do anything. */ - op->insert_data_done = true; + s->insert_data_done = true; bio_put(bio); if (!bch_keylist_empty(&s->insert_keys)) @@ -622,11 +622,11 @@ err: * data is written it calls bch_journal, and after the keys have been added to * the next journal write they're inserted into the btree. * - * It inserts the data in op->cache_bio; bi_sector is used for the key offset, + * It inserts the data in s->cache_bio; bi_sector is used for the key offset, * and op->inode is used for the key inode. * - * If op->bypass is true, instead of inserting the data it invalidates the - * region of the cache represented by op->cache_bio and op->inode. + * If s->bypass is true, instead of inserting the data it invalidates the + * region of the cache represented by s->cache_bio and op->inode. */ void bch_data_insert(struct closure *cl) { @@ -634,7 +634,7 @@ void bch_data_insert(struct closure *cl) struct search *s = container_of(op, struct search, op); bch_keylist_init(&s->insert_keys); - bio_get(op->cache_bio); + bio_get(s->cache_bio); bch_data_insert_start(cl); } @@ -655,12 +655,12 @@ static void bch_cache_read_endio(struct bio *bio, int error) if (error) s->error = error; - else if (ptr_stale(s->op.c, &b->key, 0)) { - atomic_long_inc(&s->op.c->cache_read_races); + else if (ptr_stale(s->c, &b->key, 0)) { + atomic_long_inc(&s->c->cache_read_races); s->error = -EINTR; } - bch_bbio_endio(s->op.c, bio, error, "reading from cache"); + bch_bbio_endio(s->c, bio, error, "reading from cache"); } /* @@ -674,13 +674,13 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) struct bkey *bio_key; unsigned ptr; - if (bkey_cmp(k, &KEY(op->inode, bio->bi_sector, 0)) <= 0) + if (bkey_cmp(k, &KEY(s->inode, bio->bi_sector, 0)) <= 0) return MAP_CONTINUE; - if (KEY_INODE(k) != s->op.inode || + if (KEY_INODE(k) != s->inode || KEY_START(k) > bio->bi_sector) { unsigned bio_sectors = bio_sectors(bio); - unsigned sectors = KEY_INODE(k) == s->op.inode + unsigned sectors = KEY_INODE(k) == s->inode ? 
min_t(uint64_t, INT_MAX, KEY_START(k) - bio->bi_sector) : INT_MAX; @@ -708,8 +708,8 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) bio_key = &container_of(n, struct bbio, bio)->key; bch_bkey_copy_single_ptr(bio_key, k, ptr); - bch_cut_front(&KEY(s->op.inode, n->bi_sector, 0), bio_key); - bch_cut_back(&KEY(s->op.inode, bio_end_sector(n), 0), bio_key); + bch_cut_front(&KEY(s->inode, n->bi_sector, 0), bio_key); + bch_cut_back(&KEY(s->inode, bio_end_sector(n), 0), bio_key); n->bi_end_io = bch_cache_read_endio; n->bi_private = &s->cl; @@ -735,8 +735,8 @@ static void cache_lookup(struct closure *cl) struct search *s = container_of(op, struct search, op); struct bio *bio = &s->bio.bio; - int ret = bch_btree_map_keys(op, op->c, - &KEY(op->inode, bio->bi_sector, 0), + int ret = bch_btree_map_keys(op, s->c, + &KEY(s->inode, bio->bi_sector, 0), cache_lookup_fn, MAP_END_KEY); if (ret == -EAGAIN) continue_at(cl, cache_lookup, bcache_wq); @@ -793,8 +793,8 @@ static void search_free(struct closure *cl) struct search *s = container_of(cl, struct search, cl); bio_complete(s); - if (s->op.cache_bio) - bio_put(s->op.cache_bio); + if (s->cache_bio) + bio_put(s->cache_bio); if (s->unaligned_bvec) mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); @@ -813,14 +813,14 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d) __closure_init(&s->cl, NULL); - s->op.inode = d->id; - s->op.c = d->c; + s->inode = d->id; + s->c = d->c; s->d = d; s->op.lock = -1; s->task = current; s->orig_bio = bio; s->write = (bio->bi_rw & REQ_WRITE) != 0; - s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; + s->flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; s->recoverable = 1; s->start_time = jiffies; do_bio_hook(s); @@ -891,7 +891,7 @@ static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) static bool check_should_bypass(struct cached_dev *dc, struct search *s) { - struct cache_set *c = s->op.c; + struct cache_set *c = s->c; struct bio *bio = &s->bio.bio; unsigned mode = cache_mode(dc, bio); unsigned sectors, congested = bch_get_congested(c); @@ -985,11 +985,11 @@ static void cached_dev_cache_miss_done(struct closure *cl) if (s->op.insert_collision) bch_mark_cache_miss_collision(s); - if (s->op.cache_bio) { + if (s->cache_bio) { int i; struct bio_vec *bv; - __bio_for_each_segment(bv, s->op.cache_bio, i, 0) + bio_for_each_segment_all(bv, s->cache_bio, i) __free_page(bv->bv_page); } @@ -1042,14 +1042,15 @@ static void cached_dev_read_done(struct closure *cl) * to the buffers the original bio pointed to: */ - if (s->op.cache_bio) { - bio_reset(s->op.cache_bio); - s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; - s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; - s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; - bch_bio_map(s->op.cache_bio, NULL); + if (s->cache_bio) { + bio_reset(s->cache_bio); + s->cache_bio->bi_sector = + s->cache_miss->bi_sector; + s->cache_bio->bi_bdev = s->cache_miss->bi_bdev; + s->cache_bio->bi_size = s->cache_bio_sectors << 9; + bch_bio_map(s->cache_bio, NULL); - bio_copy_data(s->cache_miss, s->op.cache_bio); + bio_copy_data(s->cache_miss, s->cache_bio); bio_put(s->cache_miss); s->cache_miss = NULL; @@ -1060,8 +1061,8 @@ static void cached_dev_read_done(struct closure *cl) bio_complete(s); - if (s->op.cache_bio && - !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) { + if (s->cache_bio && + !test_bit(CACHE_SET_STOPPING, &s->c->flags)) { s->op.type = BTREE_REPLACE; closure_call(&s->op.cl, 
bch_data_insert, NULL, cl); } @@ -1074,12 +1075,12 @@ static void cached_dev_read_done_bh(struct closure *cl) struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - bch_mark_cache_accounting(s, !s->cache_miss, s->op.bypass); - trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.bypass); + bch_mark_cache_accounting(s, !s->cache_miss, s->bypass); + trace_bcache_read(s->orig_bio, !s->cache_miss, s->bypass); if (s->error) continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); - else if (s->op.cache_bio || verify(dc, &s->bio.bio)) + else if (s->cache_bio || verify(dc, &s->bio.bio)) continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); else continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); @@ -1093,7 +1094,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); struct bio *miss, *cache_bio; - if (s->cache_miss || s->op.bypass) { + if (s->cache_miss || s->bypass) { miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); ret = miss == bio ? MAP_DONE : MAP_CONTINUE; goto out_submit; @@ -1101,13 +1102,13 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, if (!(bio->bi_rw & REQ_RAHEAD) && !(bio->bi_rw & REQ_META) && - s->op.c->gc_stats.in_use < CUTOFF_CACHE_READA) + s->c->gc_stats.in_use < CUTOFF_CACHE_READA) reada = min_t(sector_t, dc->readahead >> 9, bdev_sectors(bio->bi_bdev) - bio_end_sector(bio)); s->cache_bio_sectors = min(sectors, bio_sectors(bio) + reada); - s->op.replace = KEY(s->op.inode, bio->bi_sector + + s->op.replace = KEY(s->inode, bio->bi_sector + s->cache_bio_sectors, s->cache_bio_sectors); ret = bch_btree_insert_check_key(b, &s->op, &s->op.replace); @@ -1137,7 +1138,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, goto out_put; s->cache_miss = miss; - s->op.cache_bio = cache_bio; + s->cache_bio = cache_bio; bio_get(cache_bio); closure_bio_submit(cache_bio, &s->cl, s->d); @@ -1177,7 +1178,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0); struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0); - bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); + bch_keybuf_check_overlapping(&s->c->moving_gc_keys, &start, &end); down_read_non_owner(&dc->writeback_lock); if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { @@ -1185,7 +1186,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) * We overlap with some dirty data undergoing background * writeback, force this write to writeback */ - s->op.bypass = false; + s->bypass = false; s->writeback = true; } @@ -1197,27 +1198,27 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) * so we still want to call it. 
*/ if (bio->bi_rw & REQ_DISCARD) - s->op.bypass = true; + s->bypass = true; if (should_writeback(dc, s->orig_bio, cache_mode(dc, bio), - s->op.bypass)) { - s->op.bypass = false; + s->bypass)) { + s->bypass = false; s->writeback = true; } - trace_bcache_write(s->orig_bio, s->writeback, s->op.bypass); + trace_bcache_write(s->orig_bio, s->writeback, s->bypass); - if (s->op.bypass) { - s->op.cache_bio = s->orig_bio; - bio_get(s->op.cache_bio); + if (s->bypass) { + s->cache_bio = s->orig_bio; + bio_get(s->cache_bio); if (!(bio->bi_rw & REQ_DISCARD) || blk_queue_discard(bdev_get_queue(dc->bdev))) closure_bio_submit(bio, cl, s->d); } else if (s->writeback) { bch_writeback_add(dc); - s->op.cache_bio = bio; + s->cache_bio = bio; if (bio->bi_rw & REQ_FLUSH) { /* Also need to send a flush to the backing device */ @@ -1232,8 +1233,8 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) closure_bio_submit(flush, cl, s->d); } } else { - s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, - dc->disk.bio_split); + s->cache_bio = bio_clone_bioset(bio, GFP_NOIO, + dc->disk.bio_split); closure_bio_submit(bio, cl, s->d); } @@ -1247,8 +1248,8 @@ static void cached_dev_nodata(struct closure *cl) struct search *s = container_of(cl, struct search, cl); struct bio *bio = &s->bio.bio; - if (s->op.flush_journal) - bch_journal_meta(s->op.c, cl); + if (s->flush_journal) + bch_journal_meta(s->c, cl); /* If it's a flush, we send the flush to the backing device too */ closure_bio_submit(bio, cl, s->d); @@ -1286,7 +1287,7 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio) cached_dev_nodata, bcache_wq); } else { - s->op.bypass = check_should_bypass(dc, s); + s->bypass = check_should_bypass(dc, s); if (rw) cached_dev_write(dc, s); @@ -1376,8 +1377,8 @@ static void flash_dev_nodata(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); - if (s->op.flush_journal) - bch_journal_meta(s->op.c, cl); + if (s->flush_journal) + bch_journal_meta(s->c, cl); continue_at(cl, search_free, NULL); } @@ -1409,13 +1410,13 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) flash_dev_nodata, bcache_wq); } else if (rw) { - bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, + bch_keybuf_check_overlapping(&s->c->moving_gc_keys, &KEY(d->id, bio->bi_sector, 0), &KEY(d->id, bio_end_sector(bio), 0)); - s->op.bypass = (bio->bi_rw & REQ_DISCARD) != 0; + s->bypass = (bio->bi_rw & REQ_DISCARD) != 0; s->writeback = true; - s->op.cache_bio = bio; + s->cache_bio = bio; closure_call(&s->op.cl, bch_data_insert, NULL, cl); } else { diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index b0b4b0b5b7e9..0f79177c4f33 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -8,19 +8,33 @@ struct search { struct closure cl; struct bcache_device *d; + struct cache_set *c; struct task_struct *task; struct bbio bio; struct bio *orig_bio; struct bio *cache_miss; + + /* Bio to be inserted into the cache */ + struct bio *cache_bio; unsigned cache_bio_sectors; + unsigned inode; + unsigned recoverable:1; unsigned unaligned_bvec:1; unsigned write:1; unsigned writeback:1; + unsigned csum:1; + unsigned bypass:1; + unsigned flush_journal:1; + + unsigned insert_data_done:1; + + uint16_t write_prio; + /* IO error returned to s->bio */ short error; unsigned long start_time; diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index b8730e714d69..ea77263cf7ef 100644 --- a/drivers/md/bcache/stats.c +++ 
b/drivers/md/bcache/stats.c @@ -200,7 +200,7 @@ void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass) { struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); mark_cache_stats(&dc->accounting.collector, hit, bypass); - mark_cache_stats(&s->op.c->accounting.collector, hit, bypass); + mark_cache_stats(&s->c->accounting.collector, hit, bypass); #ifdef CONFIG_CGROUP_BCACHE mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass); #endif @@ -210,21 +210,21 @@ void bch_mark_cache_readahead(struct search *s) { struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); atomic_inc(&dc->accounting.collector.cache_readaheads); - atomic_inc(&s->op.c->accounting.collector.cache_readaheads); + atomic_inc(&s->c->accounting.collector.cache_readaheads); } void bch_mark_cache_miss_collision(struct search *s) { struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); atomic_inc(&dc->accounting.collector.cache_miss_collisions); - atomic_inc(&s->op.c->accounting.collector.cache_miss_collisions); + atomic_inc(&s->c->accounting.collector.cache_miss_collisions); } void bch_mark_sectors_bypassed(struct search *s, int sectors) { struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); - atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed); + atomic_add(sectors, &s->c->accounting.collector.sectors_bypassed); } void bch_cache_accounting_init(struct cache_accounting *acc, diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index b79dd5a6679e..a314c771263f 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1493,11 +1493,10 @@ static void run_cache_set(struct cache_set *c) const char *err = "cannot allocate memory"; struct cached_dev *dc, *t; struct cache *ca; + struct closure cl; unsigned i; - struct btree_op op; - bch_btree_op_init_stack(&op); - op.lock = SHRT_MAX; + closure_init_stack(&cl); for_each_cache(ca, c, i) c->nbuckets += ca->sb.nbuckets; @@ -1508,7 +1507,7 @@ static void run_cache_set(struct cache_set *c) struct jset *j; err = "cannot allocate memory for journal"; - if (bch_journal_read(c, &journal, &op)) + if (bch_journal_read(c, &journal)) goto err; pr_debug("btree_journal_read() done"); @@ -1543,12 +1542,12 @@ static void run_cache_set(struct cache_set *c) list_del_init(&c->root->list); rw_unlock(true, c->root); - err = uuid_read(c, j, &op.cl); + err = uuid_read(c, j, &cl); if (err) goto err; err = "error in recovery"; - if (bch_btree_check(c, &op)) + if (bch_btree_check(c)) goto err; bch_journal_mark(c, &journal); @@ -1580,7 +1579,7 @@ static void run_cache_set(struct cache_set *c) if (j->version < BCACHE_JSET_VERSION_UUID) __uuid_write(c); - bch_journal_replay(c, &journal, &op); + bch_journal_replay(c, &journal); } else { pr_notice("invalidating existing data"); @@ -1616,7 +1615,7 @@ static void run_cache_set(struct cache_set *c) goto err; bkey_copy_key(&c->root->key, &MAX_KEY); - bch_btree_node_write(c->root, &op.cl); + bch_btree_node_write(c->root, &cl); bch_btree_set_root(c->root); rw_unlock(true, c->root); @@ -1629,14 +1628,14 @@ static void run_cache_set(struct cache_set *c) SET_CACHE_SYNC(&c->sb, true); bch_journal_next(&c->journal); - bch_journal_meta(c, &op.cl); + bch_journal_meta(c, &cl); } err = "error starting gc thread"; if (bch_gc_thread_start(c)) goto err; - closure_sync(&op.cl); + closure_sync(&cl); c->sb.last_mount = get_seconds(); bcache_write_super(c); @@ -1647,7 +1646,7 @@ static void 
run_cache_set(struct cache_set *c) return; err: - closure_sync(&op.cl); + closure_sync(&cl); /* XXX: test this, it's broken */ bch_cache_set_error(c, err); } diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index c68de9f12618..b58c2bc91e3f 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -155,7 +155,7 @@ static void write_dirty_finish(struct closure *cl) for (i = 0; i < KEY_PTRS(&w->key); i++) atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); - bch_btree_insert(&op, dc->disk.c, &keys); + bch_btree_insert(&op, dc->disk.c, &keys, NULL); closure_sync(&op.cl); if (op.insert_collision) @@ -433,9 +433,16 @@ static int bch_writeback_thread(void *arg) /* Init */ -static int sectors_dirty_init_fn(struct btree_op *op, struct btree *b, +struct sectors_dirty_init { + struct btree_op op; + unsigned inode; +}; + +static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, struct bkey *k) { + struct sectors_dirty_init *op = container_of(_op, + struct sectors_dirty_init, op); if (KEY_INODE(k) > op->inode) return MAP_DONE; @@ -448,12 +455,12 @@ static int sectors_dirty_init_fn(struct btree_op *op, struct btree *b, void bch_sectors_dirty_init(struct cached_dev *dc) { - struct btree_op op; + struct sectors_dirty_init op; - bch_btree_op_init_stack(&op); + bch_btree_op_init_stack(&op.op); op.inode = dc->disk.id; - bch_btree_map_keys(&op, dc->disk.c, &KEY(op.inode, 0, 0), + bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0), sectors_dirty_init_fn, 0); } -- cgit v1.2.3 From b54d6934da7857f87b092df9b77dc1f42818ba94 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 18:04:18 -0700 Subject: bcache: Kill op->cl This isn't used for waiting asynchronously anymore - so this is a fairly trivial refactoring. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bset.c | 2 +- drivers/md/bcache/btree.c | 81 +++++++++++++++++++------------------------ drivers/md/bcache/btree.h | 8 +++-- drivers/md/bcache/journal.c | 8 ++--- drivers/md/bcache/movinggc.c | 4 +-- drivers/md/bcache/request.c | 35 ++++++++----------- drivers/md/bcache/request.h | 1 + drivers/md/bcache/writeback.c | 5 ++- 8 files changed, 63 insertions(+), 81 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index fae5b7b3f5ab..f7b5525ddafa 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1196,7 +1196,7 @@ int bch_bset_print_stats(struct cache_set *c, char *buf) int ret; memset(&t, 0, sizeof(struct bset_stats)); - bch_btree_op_init_stack(&t.op); + bch_btree_op_init(&t.op, -1); ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats); if (ret < 0) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 5cb59c313dc3..cb1a490f7f86 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -115,13 +115,6 @@ enum { static struct workqueue_struct *btree_io_wq; -void bch_btree_op_init_stack(struct btree_op *op) -{ - memset(op, 0, sizeof(struct btree_op)); - closure_init_stack(&op->cl); - op->lock = -1; -} - static inline bool should_split(struct btree *b) { struct bset *i = write_block(b); @@ -965,8 +958,7 @@ err: * bch_btree_node_get - find a btree node in the cache and lock it, reading it * in from disk if necessary. * - * If IO is necessary, it uses the closure embedded in struct btree_op to wait; - * if that closure is in non blocking mode, will return -EAGAIN. + * If IO is necessary and running under generic_make_request, returns -EAGAIN. 
* * The btree node will have either a read or a write lock held, depending on * level and op->lock. @@ -1260,6 +1252,9 @@ static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc, { unsigned nodes = 0, keys = 0, blocks; int i; + struct closure cl; + + closure_init_stack(&cl); while (nodes < GC_MERGE_NODES && r[nodes].b) keys += r[nodes++].keys; @@ -1353,9 +1348,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, { void write(struct btree *r) { - if (!r->written) - bch_btree_node_write(r, &op->cl); - else if (btree_node_dirty(r)) + if (!r->written || btree_node_dirty(r)) bch_btree_node_write(r, writes); up_write(&r->lock); @@ -1431,6 +1424,9 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, struct btree *n = NULL; unsigned keys = 0; int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); + struct closure cl; + + closure_init_stack(&cl); if (b->level || stale > 10) n = btree_node_alloc_replacement(b); @@ -1442,11 +1438,11 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, ret = btree_gc_recurse(b, op, writes, gc); if (!b->written || btree_node_dirty(b)) { - bch_btree_node_write(b, n ? &op->cl : NULL); + bch_btree_node_write(b, n ? &cl : NULL); } if (!IS_ERR_OR_NULL(n)) { - closure_sync(&op->cl); + closure_sync(&cl); bch_btree_set_root(b); btree_node_free(n); rw_unlock(true, b); @@ -1545,15 +1541,13 @@ static void bch_btree_gc(struct cache_set *c) memset(&stats, 0, sizeof(struct gc_stat)); closure_init_stack(&writes); - bch_btree_op_init_stack(&op); - op.lock = SHRT_MAX; + bch_btree_op_init(&op, SHRT_MAX); btree_gc_start(c); atomic_inc(&c->prio_blocked); ret = btree_root(gc_root, c, &op, &writes, &stats); - closure_sync(&op.cl); closure_sync(&writes); if (ret) { @@ -1562,8 +1556,8 @@ static void bch_btree_gc(struct cache_set *c) } /* Possibly wait for new UUIDs or whatever to hit disk */ - bch_journal_meta(c, &op.cl); - closure_sync(&op.cl); + bch_journal_meta(c, &writes); + closure_sync(&writes); available = bch_btree_gc_finish(c); @@ -1671,8 +1665,7 @@ int bch_btree_check(struct cache_set *c) struct btree_op op; memset(seen, 0, sizeof(seen)); - bch_btree_op_init_stack(&op); - op.lock = SHRT_MAX; + bch_btree_op_init(&op, SHRT_MAX); for (i = 0; c->cache[i]; i++) { size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); @@ -1980,6 +1973,9 @@ static int btree_split(struct btree *b, struct btree_op *op, bool split; struct btree *n1, *n2 = NULL, *n3 = NULL; uint64_t start_time = local_clock(); + struct closure cl; + + closure_init_stack(&cl); n1 = btree_node_alloc_replacement(b); if (IS_ERR(n1)) @@ -2025,7 +2021,7 @@ static int btree_split(struct btree *b, struct btree_op *op, bkey_copy_key(&n2->key, &b->key); bch_keylist_add(parent_keys, &n2->key); - bch_btree_node_write(n2, &op->cl); + bch_btree_node_write(n2, &cl); rw_unlock(true, n2); } else { trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); @@ -2034,23 +2030,23 @@ static int btree_split(struct btree *b, struct btree_op *op, } bch_keylist_add(parent_keys, &n1->key); - bch_btree_node_write(n1, &op->cl); + bch_btree_node_write(n1, &cl); if (n3) { /* Depth increases, make a new root */ bkey_copy_key(&n3->key, &MAX_KEY); bch_btree_insert_keys(n3, op, parent_keys); - bch_btree_node_write(n3, &op->cl); + bch_btree_node_write(n3, &cl); - closure_sync(&op->cl); + closure_sync(&cl); bch_btree_set_root(n3); rw_unlock(true, n3); } else if (!b->parent) { /* Root filled up but didn't need to be split */ bch_keylist_reset(parent_keys); - closure_sync(&op->cl); + closure_sync(&cl); 
bch_btree_set_root(n1); } else { unsigned i; @@ -2065,7 +2061,7 @@ static int btree_split(struct btree *b, struct btree_op *op, } bch_keylist_push(parent_keys); - closure_sync(&op->cl); + closure_sync(&cl); atomic_inc(&b->c->prio_blocked); } @@ -2126,10 +2122,15 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, BUG_ON(write_block(b) != b->sets[b->nsets].data); if (bch_btree_insert_keys(b, op, insert_keys)) { - if (!b->level) + if (!b->level) { bch_btree_leaf_dirty(b, journal_ref); - else - bch_btree_node_write(b, &op->cl); + } else { + struct closure cl; + + closure_init_stack(&cl); + bch_btree_node_write(b, &cl); + closure_sync(&cl); + } } } } while (!bch_keylist_empty(&split_keys)); @@ -2204,12 +2205,6 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c, { int ret = 0; - /* - * Don't want to block with the btree locked unless we have to, - * otherwise we get deadlocks with try_harder and between split/gc - */ - clear_closure_blocking(&op->cl); - BUG_ON(bch_keylist_empty(keys)); while (!bch_keylist_empty(keys)) { @@ -2217,8 +2212,8 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c, ret = btree_root(insert_recurse, c, op, keys, journal_ref); if (ret == -EAGAIN) { + BUG(); ret = 0; - closure_sync(&op->cl); } else if (ret) { struct bkey *k; @@ -2292,10 +2287,7 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, struct bkey *from, btree_map_nodes_fn *fn, int flags) { - int ret = btree_root(map_nodes_recurse, c, op, from, fn, flags); - if (closure_blocking(&op->cl)) - closure_sync(&op->cl); - return ret; + return btree_root(map_nodes_recurse, c, op, from, fn, flags); } static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, @@ -2328,10 +2320,7 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, struct bkey *from, btree_map_keys_fn *fn, int flags) { - int ret = btree_root(map_keys_recurse, c, op, from, fn, flags); - if (closure_blocking(&op->cl)) - closure_sync(&op->cl); - return ret; + return btree_root(map_keys_recurse, c, op, from, fn, flags); } /* Keybuf code */ @@ -2409,7 +2398,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, cond_resched(); - bch_btree_op_init_stack(&refill.op); + bch_btree_op_init(&refill.op, -1); refill.buf = buf; refill.end = end; refill.pred = pred; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 3f820b67150c..34ee5359b262 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -237,8 +237,6 @@ void __bkey_put(struct cache_set *c, struct bkey *k); /* Recursing down the btree */ struct btree_op { - struct closure cl; - /* Btree level at which we start taking write locks */ short lock; @@ -253,7 +251,11 @@ struct btree_op { BKEY_PADDED(replace); }; -void bch_btree_op_init_stack(struct btree_op *); +static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) +{ + memset(op, 0, sizeof(struct btree_op)); + op->lock = write_lock_level; +} static inline void rw_lock(bool w, struct btree *b, int level) { diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 725c8eb9a62a..20e900ad5010 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -305,8 +305,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) struct btree_op op; bch_keylist_init(&keylist); - 
bch_btree_op_init_stack(&op); - op.lock = SHRT_MAX; + bch_btree_op_init(&op, SHRT_MAX); list_for_each_entry(i, list, list) { BUG_ON(i->pin && atomic_read(i->pin) != 1); @@ -341,14 +340,13 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) pr_info("journal replay done, %i keys in %i entries, seq %llu", keys, entries, end); - +err: while (!list_empty(list)) { i = list_first_entry(list, struct journal_replay, list); list_del(&i->list); kfree(i); } -err: - closure_sync(&op.cl); + return ret; } diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 80e30d77221e..219356f6159d 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -108,8 +108,8 @@ static void write_moving(struct closure *cl) s->op.type = BTREE_REPLACE; bkey_copy(&s->op.replace, &io->w->key); - closure_init(&s->op.cl, cl); - bch_data_insert(&s->op.cl); + closure_init(&s->btree, cl); + bch_data_insert(&s->btree); } continue_at(cl, write_moving_finish, system_wq); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 818e2e39e71f..5df44fbc9e1d 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -215,8 +215,7 @@ static void bio_csum(struct bio *bio, struct bkey *k) static void bch_data_insert_keys(struct closure *cl) { - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); + struct search *s = container_of(cl, struct search, btree); atomic_t *journal_ref = NULL; /* @@ -236,7 +235,7 @@ static void bch_data_insert_keys(struct closure *cl) s->flush_journal ? &s->cl : NULL); - if (bch_btree_insert(op, s->c, &s->insert_keys, journal_ref)) { + if (bch_btree_insert(&s->op, s->c, &s->insert_keys, journal_ref)) { s->error = -ENOMEM; s->insert_data_done = true; } @@ -433,8 +432,7 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, static void bch_data_invalidate(struct closure *cl) { - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); + struct search *s = container_of(cl, struct search, btree); struct bio *bio = s->cache_bio; pr_debug("invalidating %i sectors from %llu", @@ -461,8 +459,7 @@ out: static void bch_data_insert_error(struct closure *cl) { - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); + struct search *s = container_of(cl, struct search, btree); /* * Our data write just errored, which means we've got a bunch of keys to @@ -493,8 +490,7 @@ static void bch_data_insert_error(struct closure *cl) static void bch_data_insert_endio(struct bio *bio, int error) { struct closure *cl = bio->bi_private; - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); + struct search *s = container_of(cl, struct search, btree); if (error) { /* TODO: We could try to recover from this. 
*/ @@ -511,8 +507,7 @@ static void bch_data_insert_endio(struct bio *bio, int error) static void bch_data_insert_start(struct closure *cl) { - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); + struct search *s = container_of(cl, struct search, btree); struct bio *bio = s->cache_bio, *n; if (s->bypass) @@ -630,8 +625,7 @@ err: */ void bch_data_insert(struct closure *cl) { - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); + struct search *s = container_of(cl, struct search, btree); bch_keylist_init(&s->insert_keys); bio_get(s->cache_bio); @@ -731,11 +725,10 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) static void cache_lookup(struct closure *cl) { - struct btree_op *op = container_of(cl, struct btree_op, cl); - struct search *s = container_of(op, struct search, op); + struct search *s = container_of(cl, struct search, btree); struct bio *bio = &s->bio.bio; - int ret = bch_btree_map_keys(op, s->c, + int ret = bch_btree_map_keys(&s->op, s->c, &KEY(s->inode, bio->bi_sector, 0), cache_lookup_fn, MAP_END_KEY); if (ret == -EAGAIN) @@ -1064,7 +1057,7 @@ static void cached_dev_read_done(struct closure *cl) if (s->cache_bio && !test_bit(CACHE_SET_STOPPING, &s->c->flags)) { s->op.type = BTREE_REPLACE; - closure_call(&s->op.cl, bch_data_insert, NULL, cl); + closure_call(&s->btree, bch_data_insert, NULL, cl); } continue_at(cl, cached_dev_cache_miss_done, NULL); @@ -1156,7 +1149,7 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; - closure_call(&s->op.cl, cache_lookup, NULL, cl); + closure_call(&s->btree, cache_lookup, NULL, cl); continue_at(cl, cached_dev_read_done_bh, NULL); } @@ -1239,7 +1232,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) closure_bio_submit(bio, cl, s->d); } - closure_call(&s->op.cl, bch_data_insert, NULL, cl); + closure_call(&s->btree, bch_data_insert, NULL, cl); continue_at(cl, cached_dev_write_complete, NULL); } @@ -1418,9 +1411,9 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) s->writeback = true; s->cache_bio = bio; - closure_call(&s->op.cl, bch_data_insert, NULL, cl); + closure_call(&s->btree, bch_data_insert, NULL, cl); } else { - closure_call(&s->op.cl, cache_lookup, NULL, cl); + closure_call(&s->btree, cache_lookup, NULL, cl); } continue_at(cl, search_free, NULL); diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 0f79177c4f33..ed578aa53ee2 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -6,6 +6,7 @@ struct search { /* Stack frame for bio_complete */ struct closure cl; + struct closure btree; struct bcache_device *d; struct cache_set *c; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index b58c2bc91e3f..d0968e8938f7 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -143,7 +143,7 @@ static void write_dirty_finish(struct closure *cl) struct btree_op op; struct keylist keys; - bch_btree_op_init_stack(&op); + bch_btree_op_init(&op, -1); bch_keylist_init(&keys); op.type = BTREE_REPLACE; @@ -156,7 +156,6 @@ static void write_dirty_finish(struct closure *cl) atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); bch_btree_insert(&op, dc->disk.c, &keys, NULL); - closure_sync(&op.cl); if (op.insert_collision) trace_bcache_writeback_collision(&w->key); @@ -457,7 +456,7 @@ void 
bch_sectors_dirty_init(struct cached_dev *dc) { struct sectors_dirty_init op; - bch_btree_op_init_stack(&op.op); + bch_btree_op_init(&op.op, -1); op.inode = dc->disk.id; bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0), -- cgit v1.2.3 From faadf0c96547ec8277ad0abd6959f2ef48522f31 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 1 Nov 2013 18:03:08 -0700 Subject: bcache: Drop some closure stuff With the recent bcache refactoring, some of the closure code isn't needed anymore. Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 6 +- drivers/md/bcache/closure.c | 103 ++++--------------------- drivers/md/bcache/closure.h | 181 +++++--------------------------------------- 3 files changed, 40 insertions(+), 250 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index cb1a490f7f86..3e0c90130c2e 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -407,7 +407,7 @@ static void do_btree_node_write(struct btree *b) b->bio = bch_bbio_alloc(b->c); b->bio->bi_end_io = btree_node_write_endio; - b->bio->bi_private = &b->io.cl; + b->bio->bi_private = cl; b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); bch_bio_map(b->bio, i); @@ -672,8 +672,8 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush) } /* wait for any in flight btree write */ - closure_wait_event_sync(&b->io.wait, &cl, - atomic_read(&b->io.cl.remaining) == -1); + closure_wait_event(&b->io.wait, &cl, + atomic_read(&b->io.cl.remaining) == -1); return 0; } diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 9aba2017f0d1..dfff2410322e 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -11,17 +11,6 @@ #include "closure.h" -void closure_queue(struct closure *cl) -{ - struct workqueue_struct *wq = cl->wq; - if (wq) { - INIT_WORK(&cl->work, cl->work.func); - BUG_ON(!queue_work(wq, &cl->work)); - } else - cl->fn(cl); -} -EXPORT_SYMBOL_GPL(closure_queue); - #define CL_FIELD(type, field) \ case TYPE_ ## type: \ return &container_of(cl, struct type, cl)->field @@ -30,17 +19,6 @@ static struct closure_waitlist *closure_waitlist(struct closure *cl) { switch (cl->type) { CL_FIELD(closure_with_waitlist, wait); - CL_FIELD(closure_with_waitlist_and_timer, wait); - default: - return NULL; - } -} - -static struct timer_list *closure_timer(struct closure *cl) -{ - switch (cl->type) { - CL_FIELD(closure_with_timer, timer); - CL_FIELD(closure_with_waitlist_and_timer, timer); default: return NULL; } @@ -51,7 +29,7 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) { int r = flags & CLOSURE_REMAINING_MASK; BUG_ON(flags & CLOSURE_GUARD_MASK); - BUG_ON(!r && (flags & ~(CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING))); + BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); /* Must deliver precisely one wakeup */ if (r == 1 && (flags & CLOSURE_SLEEPING)) @@ -59,7 +37,6 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) if (!r) { if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { - /* CLOSURE_BLOCKING might be set - clear it */ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); closure_queue(cl); @@ -90,13 +67,13 @@ void closure_sub(struct closure *cl, int v) { closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); } -EXPORT_SYMBOL_GPL(closure_sub); +EXPORT_SYMBOL(closure_sub); void closure_put(struct closure *cl) { closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); }
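/*
 * (Illustrative aside, not from the patch: closure_put_after_sub() above
 * packs a reference count and state flags into a single atomic word.
 * Stripped of the flags, waitlists and atomics, the life cycle it
 * implements is roughly the following; mini_closure and its members are
 * invented names for the sketch:)
 */
struct mini_closure {
	int remaining;				/* low bits of cl->remaining */
	void (*fn)(struct mini_closure *);	/* continuation to run */
};

static void mini_closure_put(struct mini_closure *cl)
{
	if (--cl->remaining == 0 && cl->fn)
		cl->fn(cl);	/* last reference dropped: run the continuation */
}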
-EXPORT_SYMBOL_GPL(closure_put); +EXPORT_SYMBOL(closure_put); static void set_waiting(struct closure *cl, unsigned long f) { @@ -133,7 +110,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list) closure_sub(cl, CLOSURE_WAITING + 1); } } -EXPORT_SYMBOL_GPL(__closure_wake_up); +EXPORT_SYMBOL(__closure_wake_up); bool closure_wait(struct closure_waitlist *list, struct closure *cl) { @@ -146,7 +123,7 @@ bool closure_wait(struct closure_waitlist *list, struct closure *cl) return true; } -EXPORT_SYMBOL_GPL(closure_wait); +EXPORT_SYMBOL(closure_wait); /** * closure_sync() - sleep until a closure a closure has nothing left to wait on @@ -169,7 +146,7 @@ void closure_sync(struct closure *cl) __closure_end_sleep(cl); } -EXPORT_SYMBOL_GPL(closure_sync); +EXPORT_SYMBOL(closure_sync); /** * closure_trylock() - try to acquire the closure, without waiting @@ -183,17 +160,17 @@ bool closure_trylock(struct closure *cl, struct closure *parent) CLOSURE_REMAINING_INITIALIZER) != -1) return false; - closure_set_ret_ip(cl); - smp_mb(); + cl->parent = parent; if (parent) closure_get(parent); + closure_set_ret_ip(cl); closure_debug_create(cl); return true; } -EXPORT_SYMBOL_GPL(closure_trylock); +EXPORT_SYMBOL(closure_trylock); void __closure_lock(struct closure *cl, struct closure *parent, struct closure_waitlist *wait_list) @@ -205,57 +182,11 @@ void __closure_lock(struct closure *cl, struct closure *parent, if (closure_trylock(cl, parent)) return; - closure_wait_event_sync(wait_list, &wait, - atomic_read(&cl->remaining) == -1); + closure_wait_event(wait_list, &wait, + atomic_read(&cl->remaining) == -1); } } -EXPORT_SYMBOL_GPL(__closure_lock); - -static void closure_delay_timer_fn(unsigned long data) -{ - struct closure *cl = (struct closure *) data; - closure_sub(cl, CLOSURE_TIMER + 1); -} - -void do_closure_timer_init(struct closure *cl) -{ - struct timer_list *timer = closure_timer(cl); - - init_timer(timer); - timer->data = (unsigned long) cl; - timer->function = closure_delay_timer_fn; -} -EXPORT_SYMBOL_GPL(do_closure_timer_init); - -bool __closure_delay(struct closure *cl, unsigned long delay, - struct timer_list *timer) -{ - if (atomic_read(&cl->remaining) & CLOSURE_TIMER) - return false; - - BUG_ON(timer_pending(timer)); - - timer->expires = jiffies + delay; - - atomic_add(CLOSURE_TIMER + 1, &cl->remaining); - add_timer(timer); - return true; -} -EXPORT_SYMBOL_GPL(__closure_delay); - -void __closure_flush(struct closure *cl, struct timer_list *timer) -{ - if (del_timer(timer)) - closure_sub(cl, CLOSURE_TIMER + 1); -} -EXPORT_SYMBOL_GPL(__closure_flush); - -void __closure_flush_sync(struct closure *cl, struct timer_list *timer) -{ - if (del_timer_sync(timer)) - closure_sub(cl, CLOSURE_TIMER + 1); -} -EXPORT_SYMBOL_GPL(__closure_flush_sync); +EXPORT_SYMBOL(__closure_lock); #ifdef CONFIG_BCACHE_CLOSURES_DEBUG @@ -273,7 +204,7 @@ void closure_debug_create(struct closure *cl) list_add(&cl->all, &closure_list); spin_unlock_irqrestore(&closure_list_lock, flags); } -EXPORT_SYMBOL_GPL(closure_debug_create); +EXPORT_SYMBOL(closure_debug_create); void closure_debug_destroy(struct closure *cl) { @@ -286,7 +217,7 @@ void closure_debug_destroy(struct closure *cl) list_del(&cl->all); spin_unlock_irqrestore(&closure_list_lock, flags); } -EXPORT_SYMBOL_GPL(closure_debug_destroy); +EXPORT_SYMBOL(closure_debug_destroy); static struct dentry *debug; @@ -304,14 +235,12 @@ static int debug_seq_show(struct seq_file *f, void *data) cl, (void *) cl->ip, cl->fn, cl->parent, r & CLOSURE_REMAINING_MASK); - seq_printf(f, 
"%s%s%s%s%s%s\n", + seq_printf(f, "%s%s%s%s\n", test_bit(WORK_STRUCT_PENDING, work_data_bits(&cl->work)) ? "Q" : "", r & CLOSURE_RUNNING ? "R" : "", - r & CLOSURE_BLOCKING ? "B" : "", r & CLOSURE_STACK ? "S" : "", - r & CLOSURE_SLEEPING ? "Sl" : "", - r & CLOSURE_TIMER ? "T" : ""); + r & CLOSURE_SLEEPING ? "Sl" : ""); if (r & CLOSURE_WAITING) seq_printf(f, " W %pF\n", diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index ab011f03801f..9762f1be3304 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -155,21 +155,6 @@ * delayed_work embeds a work item and a timer_list. The important thing is, use * it exactly like you would a regular closure and closure_put() will magically * handle everything for you. - * - * We've got closures that embed timers, too. They're called, appropriately - * enough: - * struct closure_with_timer; - * - * This gives you access to closure_delay(). It takes a refcount for a specified - * number of jiffies - you could then call closure_sync() (for a slightly - * convoluted version of msleep()) or continue_at() - which gives you the same - * effect as using a delayed work item, except you can reuse the work_struct - * already embedded in struct closure. - * - * Lastly, there's struct closure_with_waitlist_and_timer. It does what you - * probably expect, if you happen to need the features of both. (You don't - * really want to know how all this is implemented, but if I've done my job - * right you shouldn't have to care). */ struct closure; @@ -182,16 +167,11 @@ struct closure_waitlist { enum closure_type { TYPE_closure = 0, TYPE_closure_with_waitlist = 1, - TYPE_closure_with_timer = 2, - TYPE_closure_with_waitlist_and_timer = 3, - MAX_CLOSURE_TYPE = 3, + MAX_CLOSURE_TYPE = 1, }; enum closure_state { /* - * CLOSURE_BLOCKING: Causes closure_wait_event() to block, instead of - * waiting asynchronously - * * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by * the thread that owns the closure, and cleared by the thread that's * waking up the closure. @@ -200,10 +180,6 @@ enum closure_state { * - indicates that cl->task is valid and closure_put() may wake it up. * Only set or cleared by the thread that owns the closure. * - * CLOSURE_TIMER: Analagous to CLOSURE_WAITING, indicates that a closure - * has an outstanding timer. Must be set by the thread that owns the - * closure, and cleared by the timer function when the timer goes off. - * * The rest are for debugging and don't affect behaviour: * * CLOSURE_RUNNING: Set when a closure is running (i.e. 
by @@ -218,19 +194,17 @@ enum closure_state { * closure with this flag set */ - CLOSURE_BITS_START = (1 << 19), - CLOSURE_DESTRUCTOR = (1 << 19), - CLOSURE_BLOCKING = (1 << 21), - CLOSURE_WAITING = (1 << 23), - CLOSURE_SLEEPING = (1 << 25), - CLOSURE_TIMER = (1 << 27), + CLOSURE_BITS_START = (1 << 23), + CLOSURE_DESTRUCTOR = (1 << 23), + CLOSURE_WAITING = (1 << 25), + CLOSURE_SLEEPING = (1 << 27), CLOSURE_RUNNING = (1 << 29), CLOSURE_STACK = (1 << 31), }; #define CLOSURE_GUARD_MASK \ - ((CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING|CLOSURE_WAITING| \ - CLOSURE_SLEEPING|CLOSURE_TIMER|CLOSURE_RUNNING|CLOSURE_STACK) << 1) + ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING| \ + CLOSURE_RUNNING|CLOSURE_STACK) << 1) #define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) #define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) @@ -268,17 +242,6 @@ struct closure_with_waitlist { struct closure_waitlist wait; }; -struct closure_with_timer { - struct closure cl; - struct timer_list timer; -}; - -struct closure_with_waitlist_and_timer { - struct closure cl; - struct closure_waitlist wait; - struct timer_list timer; -}; - extern unsigned invalid_closure_type(void); #define __CLOSURE_TYPE(cl, _t) \ @@ -289,14 +252,11 @@ extern unsigned invalid_closure_type(void); ( \ __CLOSURE_TYPE(cl, closure) \ __CLOSURE_TYPE(cl, closure_with_waitlist) \ - __CLOSURE_TYPE(cl, closure_with_timer) \ - __CLOSURE_TYPE(cl, closure_with_waitlist_and_timer) \ invalid_closure_type() \ ) void closure_sub(struct closure *cl, int v); void closure_put(struct closure *cl); -void closure_queue(struct closure *cl); void __closure_wake_up(struct closure_waitlist *list); bool closure_wait(struct closure_waitlist *list, struct closure *cl); void closure_sync(struct closure *cl); @@ -305,12 +265,6 @@ bool closure_trylock(struct closure *cl, struct closure *parent); void __closure_lock(struct closure *cl, struct closure *parent, struct closure_waitlist *wait_list); -void do_closure_timer_init(struct closure *cl); -bool __closure_delay(struct closure *cl, unsigned long delay, - struct timer_list *timer); -void __closure_flush(struct closure *cl, struct timer_list *timer); -void __closure_flush_sync(struct closure *cl, struct timer_list *timer); - #ifdef CONFIG_BCACHE_CLOSURES_DEBUG void closure_debug_init(void); @@ -354,11 +308,6 @@ static inline void closure_set_stopped(struct closure *cl) atomic_sub(CLOSURE_RUNNING, &cl->remaining); } -static inline bool closure_is_stopped(struct closure *cl) -{ - return !(atomic_read(&cl->remaining) & CLOSURE_RUNNING); -} - static inline bool closure_is_unlocked(struct closure *cl) { return atomic_read(&cl->remaining) == -1; @@ -367,14 +316,6 @@ static inline bool closure_is_unlocked(struct closure *cl) static inline void do_closure_init(struct closure *cl, struct closure *parent, bool running) { - switch (cl->type) { - case TYPE_closure_with_timer: - case TYPE_closure_with_waitlist_and_timer: - do_closure_timer_init(cl); - default: - break; - } - cl->parent = parent; if (parent) closure_get(parent); @@ -429,8 +370,7 @@ do { \ static inline void closure_init_stack(struct closure *cl) { memset(cl, 0, sizeof(struct closure)); - atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER| - CLOSURE_BLOCKING|CLOSURE_STACK); + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); } /** @@ -461,24 +401,6 @@ do { \ #define closure_lock(cl, parent) \ __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait) -/** - * closure_delay() - delay some number of jiffies - * @cl: the closure that 
will sleep - * @delay: the delay in jiffies - * - * Takes a refcount on @cl which will be released after @delay jiffies; this may - * be used to have a function run after a delay with continue_at(), or - * closure_sync() may be used for a convoluted version of msleep(). - */ -#define closure_delay(cl, delay) \ - __closure_delay(__to_internal_closure(cl), delay, &(cl)->timer) - -#define closure_flush(cl) \ - __closure_flush(__to_internal_closure(cl), &(cl)->timer) - -#define closure_flush_sync(cl) \ - __closure_flush_sync(__to_internal_closure(cl), &(cl)->timer) - static inline void __closure_end_sleep(struct closure *cl) { __set_current_state(TASK_RUNNING); @@ -497,40 +419,6 @@ static inline void __closure_start_sleep(struct closure *cl) atomic_add(CLOSURE_SLEEPING, &cl->remaining); } -/** - * closure_blocking() - returns true if the closure is in blocking mode. - * - * If a closure is in blocking mode, closure_wait_event() will sleep until the - * condition is true instead of waiting asynchronously. - */ -static inline bool closure_blocking(struct closure *cl) -{ - return atomic_read(&cl->remaining) & CLOSURE_BLOCKING; -} - -/** - * set_closure_blocking() - put a closure in blocking mode. - * - * If a closure is in blocking mode, closure_wait_event() will sleep until the - * condition is true instead of waiting asynchronously. - * - * Not thread safe - can only be called by the thread running the closure. - */ -static inline void set_closure_blocking(struct closure *cl) -{ - if (!closure_blocking(cl)) - atomic_add(CLOSURE_BLOCKING, &cl->remaining); -} - -/* - * Not thread safe - can only be called by the thread running the closure. - */ -static inline void clear_closure_blocking(struct closure *cl) -{ - if (closure_blocking(cl)) - atomic_sub(CLOSURE_BLOCKING, &cl->remaining); -} - /** * closure_wake_up() - wake up all closures on a wait list. */ @@ -561,63 +449,36 @@ static inline void closure_wake_up(struct closure_waitlist *list) * refcount on our closure. If this was a stack allocated closure, that would be * bad. */ -#define __closure_wait_event(list, cl, condition, _block) \ +#define closure_wait_event(list, cl, condition) \ ({ \ - bool block = _block; \ typeof(condition) ret; \ \ while (1) { \ ret = (condition); \ if (ret) { \ __closure_wake_up(list); \ - if (block) \ - closure_sync(cl); \ - \ + closure_sync(cl); \ break; \ } \ \ - if (block) \ - __closure_start_sleep(cl); \ - \ - if (!closure_wait(list, cl)) { \ - if (!block) \ - break; \ + __closure_start_sleep(cl); \ \ + if (!closure_wait(list, cl)) \ schedule(); \ - } \ } \ \ ret; \ }) -/** - * closure_wait_event() - wait on a condition, synchronously or asynchronously. - * @list: the wait list to wait on - * @cl: the closure that is doing the waiting - * @condition: a C expression for the event to wait for - * - * If the closure is in blocking mode, sleeps until the @condition evaluates to - * true - exactly like wait_event(). - * - * If the closure is not in blocking mode, waits asynchronously; if the - * condition is currently false the @cl is put onto @list and returns. @list - * owns a refcount on @cl; closure_sync() or continue_at() may be used later to - * wait for another thread to wake up @list, which drops the refcount on @cl. - * - * Returns the value of @condition; @cl will be on @list iff @condition was - * false. - * - * closure_wake_up(@list) must be called after changing any variable that could - * cause @condition to become true. 
- */ -#define closure_wait_event(list, cl, condition) \ - __closure_wait_event(list, cl, condition, closure_blocking(cl)) - -#define closure_wait_event_async(list, cl, condition) \ - __closure_wait_event(list, cl, condition, false) - -#define closure_wait_event_sync(list, cl, condition) \ - __closure_wait_event(list, cl, condition, true) +static inline void closure_queue(struct closure *cl) +{ + struct workqueue_struct *wq = cl->wq; + if (wq) { + INIT_WORK(&cl->work, cl->work.func); + BUG_ON(!queue_work(wq, &cl->work)); + } else + cl->fn(cl); +} static inline void set_closure_fn(struct closure *cl, closure_fn *fn, struct workqueue_struct *wq) -- cgit v1.2.3 From 1b207d80d5b986fb305bc899357435d319319513 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 18:52:54 -0700 Subject: bcache: Kill op->replace This is prep work for converting bch_btree_insert to bch_btree_map_leaf_nodes() - we have to convert all its arguments to actual arguments. Bunch of churn, but should be straightforward. Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 102 +++++++++++++++++++++--------------------- drivers/md/bcache/btree.h | 10 +---- drivers/md/bcache/journal.c | 2 +- drivers/md/bcache/movinggc.c | 4 +- drivers/md/bcache/request.c | 14 +++--- drivers/md/bcache/request.h | 2 + drivers/md/bcache/writeback.c | 10 ++--- 7 files changed, 71 insertions(+), 73 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 3e0c90130c2e..7a1d8dc19e61 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -89,15 +89,6 @@ * Test module load/unload */ -static const char * const op_types[] = { - "insert", "replace" -}; - -static const char *op_type(struct btree_op *op) -{ - return op_types[op->type]; -} - enum { BTREE_INSERT_STATUS_INSERT, BTREE_INSERT_STATUS_BACK_MERGE, @@ -1699,10 +1690,9 @@ static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) bch_bset_fix_lookup_table(b, where); } -static bool fix_overlapping_extents(struct btree *b, - struct bkey *insert, +static bool fix_overlapping_extents(struct btree *b, struct bkey *insert, struct btree_iter *iter, - struct btree_op *op) + struct bkey *replace_key) { void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) { @@ -1730,39 +1720,38 @@ static bool fix_overlapping_extents(struct btree *b, * We might overlap with 0 size extents; we can't skip these * because if they're in the set we're inserting to we have to * adjust them so they don't overlap with the key we're - * inserting. But we don't want to check them for BTREE_REPLACE + * inserting. But we don't want to check them for replace * operations. 
*/ - if (op->type == BTREE_REPLACE && - KEY_SIZE(k)) { + if (replace_key && KEY_SIZE(k)) { /* * k might have been split since we inserted/found the * key we're replacing */ unsigned i; uint64_t offset = KEY_START(k) - - KEY_START(&op->replace); + KEY_START(replace_key); /* But it must be a subset of the replace key */ - if (KEY_START(k) < KEY_START(&op->replace) || - KEY_OFFSET(k) > KEY_OFFSET(&op->replace)) + if (KEY_START(k) < KEY_START(replace_key) || + KEY_OFFSET(k) > KEY_OFFSET(replace_key)) goto check_failed; /* We didn't find a key that we were supposed to */ if (KEY_START(k) > KEY_START(insert) + sectors_found) goto check_failed; - if (KEY_PTRS(&op->replace) != KEY_PTRS(k)) + if (KEY_PTRS(replace_key) != KEY_PTRS(k)) goto check_failed; /* skip past gen */ offset <<= 8; - BUG_ON(!KEY_PTRS(&op->replace)); + BUG_ON(!KEY_PTRS(replace_key)); - for (i = 0; i < KEY_PTRS(&op->replace); i++) - if (k->ptr[i] != op->replace.ptr[i] + offset) + for (i = 0; i < KEY_PTRS(replace_key); i++) + if (k->ptr[i] != replace_key->ptr[i] + offset) goto check_failed; sectors_found = KEY_OFFSET(k) - KEY_START(insert); @@ -1833,9 +1822,8 @@ static bool fix_overlapping_extents(struct btree *b, } check_failed: - if (op->type == BTREE_REPLACE) { + if (replace_key) { if (!sectors_found) { - op->insert_collision = true; return true; } else if (sectors_found < KEY_SIZE(insert)) { SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - @@ -1848,7 +1836,7 @@ check_failed: } static bool btree_insert_key(struct btree *b, struct btree_op *op, - struct bkey *k) + struct bkey *k, struct bkey *replace_key) { struct bset *i = b->sets[b->nsets].data; struct bkey *m, *prev; @@ -1874,8 +1862,10 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, prev = NULL; m = bch_btree_iter_init(b, &iter, &search); - if (fix_overlapping_extents(b, k, &iter, op)) + if (fix_overlapping_extents(b, k, &iter, replace_key)) { + op->insert_collision = true; return false; + } if (KEY_DIRTY(k)) bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), @@ -1903,24 +1893,28 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, if (m != end(i) && bch_bkey_try_merge(b, k, m)) goto copy; - } else + } else { + BUG_ON(replace_key); m = bch_bset_search(b, &b->sets[b->nsets], k); + } insert: shift_keys(b, m, k); copy: bkey_copy(m, k); merged: - bch_check_keys(b, "%u for %s", status, op_type(op)); + bch_check_keys(b, "%u for %s", status, + replace_key ? 
"replace" : "insert"); if (b->level && !KEY_OFFSET(k)) btree_current_write(b)->prio_blocked++; - trace_bcache_btree_insert_key(b, k, op->type, status); + trace_bcache_btree_insert_key(b, k, replace_key != NULL, status); return true; } static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, - struct keylist *insert_keys) + struct keylist *insert_keys, + struct bkey *replace_key) { bool ret = false; unsigned oldsize = bch_count_data(b); @@ -1936,11 +1930,11 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, if (bkey_cmp(k, &b->key) <= 0) { bkey_put(b->c, k, b->level); - ret |= btree_insert_key(b, op, k); + ret |= btree_insert_key(b, op, k, replace_key); bch_keylist_pop_front(insert_keys); } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { #if 0 - if (op->type == BTREE_REPLACE) { + if (replace_key) { bkey_put(b->c, k, b->level); bch_keylist_pop_front(insert_keys); op->insert_collision = true; @@ -1953,7 +1947,7 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, bch_cut_back(&b->key, &temp.key); bch_cut_front(&b->key, insert_keys->keys); - ret |= btree_insert_key(b, op, &temp.key); + ret |= btree_insert_key(b, op, &temp.key, replace_key); break; } else { break; @@ -1968,7 +1962,8 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, static int btree_split(struct btree *b, struct btree_op *op, struct keylist *insert_keys, - struct keylist *parent_keys) + struct keylist *parent_keys, + struct bkey *replace_key) { bool split; struct btree *n1, *n2 = NULL, *n3 = NULL; @@ -1998,7 +1993,7 @@ static int btree_split(struct btree *b, struct btree_op *op, goto err_free2; } - bch_btree_insert_keys(n1, op, insert_keys); + bch_btree_insert_keys(n1, op, insert_keys, replace_key); /* * Has to be a linear search because we don't have an auxiliary @@ -2026,7 +2021,7 @@ static int btree_split(struct btree *b, struct btree_op *op, } else { trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); - bch_btree_insert_keys(n1, op, insert_keys); + bch_btree_insert_keys(n1, op, insert_keys, replace_key); } bch_keylist_add(parent_keys, &n1->key); @@ -2036,7 +2031,7 @@ static int btree_split(struct btree *b, struct btree_op *op, /* Depth increases, make a new root */ bkey_copy_key(&n3->key, &MAX_KEY); - bch_btree_insert_keys(n3, op, parent_keys); + bch_btree_insert_keys(n3, op, parent_keys, NULL); bch_btree_node_write(n3, &cl); closure_sync(&cl); @@ -2091,7 +2086,8 @@ err: static int bch_btree_insert_node(struct btree *b, struct btree_op *op, struct keylist *insert_keys, - atomic_t *journal_ref) + atomic_t *journal_ref, + struct bkey *replace_key) { int ret = 0; struct keylist split_keys; @@ -2101,6 +2097,8 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, BUG_ON(b->level); do { + BUG_ON(b->level && replace_key); + if (should_split(b)) { if (current->bio_list) { op->lock = b->c->root->level + 1; @@ -2112,8 +2110,9 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, struct btree *parent = b->parent; ret = btree_split(b, op, insert_keys, - &split_keys); + &split_keys, replace_key); insert_keys = &split_keys; + replace_key = NULL; b = parent; if (!ret) ret = -EINTR; @@ -2121,7 +2120,8 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, } else { BUG_ON(write_block(b) != b->sets[b->nsets].data); - if (bch_btree_insert_keys(b, op, insert_keys)) { + if (bch_btree_insert_keys(b, op, insert_keys, + replace_key)) { if (!b->level) { bch_btree_leaf_dirty(b, journal_ref); } else 
{ @@ -2165,9 +2165,7 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, bch_keylist_add(&insert, check_key); - BUG_ON(op->type != BTREE_INSERT); - - ret = bch_btree_insert_node(b, op, &insert, NULL); + ret = bch_btree_insert_node(b, op, &insert, NULL, NULL); BUG_ON(!ret && !bch_keylist_empty(&insert)); out: @@ -2177,7 +2175,8 @@ out: } static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, - struct keylist *keys, atomic_t *journal_ref) + struct keylist *keys, atomic_t *journal_ref, + struct bkey *replace_key) { if (bch_keylist_empty(keys)) return 0; @@ -2194,14 +2193,17 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, return -EIO; } - return btree(insert_recurse, k, b, op, keys, journal_ref); + return btree(insert_recurse, k, b, op, keys, + journal_ref, replace_key); } else { - return bch_btree_insert_node(b, op, keys, journal_ref); + return bch_btree_insert_node(b, op, keys, + journal_ref, replace_key); } } int bch_btree_insert(struct btree_op *op, struct cache_set *c, - struct keylist *keys, atomic_t *journal_ref) + struct keylist *keys, atomic_t *journal_ref, + struct bkey *replace_key) { int ret = 0; @@ -2209,7 +2211,8 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c, while (!bch_keylist_empty(keys)) { op->lock = 0; - ret = btree_root(insert_recurse, c, op, keys, journal_ref); + ret = btree_root(insert_recurse, c, op, keys, + journal_ref, replace_key); if (ret == -EAGAIN) { BUG(); @@ -2217,8 +2220,7 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c, } else if (ret) { struct bkey *k; - pr_err("error %i trying to insert key for %s", - ret, op_type(op)); + pr_err("error %i", ret); while ((k = bch_keylist_pop(keys))) bkey_put(c, k, 0); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 34ee5359b262..6ff08be3e0c9 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -240,15 +240,7 @@ struct btree_op { /* Btree level at which we start taking write locks */ short lock; - /* Btree insertion type */ - enum { - BTREE_INSERT, - BTREE_REPLACE - } type:8; - unsigned insert_collision:1; - - BKEY_PADDED(replace); }; static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) @@ -290,7 +282,7 @@ struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); int bch_btree_insert_check_key(struct btree *, struct btree_op *, struct bkey *); int bch_btree_insert(struct btree_op *, struct cache_set *, - struct keylist *, atomic_t *); + struct keylist *, atomic_t *, struct bkey *); int bch_gc_thread_start(struct cache_set *); size_t bch_btree_gc_finish(struct cache_set *); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 20e900ad5010..592adf51128f 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -322,7 +322,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) bkey_copy(keylist.top, k); bch_keylist_push(&keylist); - ret = bch_btree_insert(&op, s, &keylist, i->pin); + ret = bch_btree_insert(&op, s, &keylist, i->pin, NULL); if (ret) goto err; diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 219356f6159d..c45ba4f21bae 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -105,8 +105,8 @@ static void write_moving(struct closure *cl) s->writeback = KEY_DIRTY(&io->w->key); s->csum = KEY_CSUM(&io->w->key); - s->op.type = BTREE_REPLACE; - bkey_copy(&s->op.replace, &io->w->key); + bkey_copy(&s->replace_key, 
&io->w->key); + s->replace = true; closure_init(&s->btree, cl); bch_data_insert(&s->btree); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 5df44fbc9e1d..16a3e16f3ff4 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -217,6 +217,7 @@ static void bch_data_insert_keys(struct closure *cl) { struct search *s = container_of(cl, struct search, btree); atomic_t *journal_ref = NULL; + struct bkey *replace_key = s->replace ? &s->replace_key : NULL; /* * If we're looping, might already be waiting on @@ -235,7 +236,8 @@ static void bch_data_insert_keys(struct closure *cl) s->flush_journal ? &s->cl : NULL); - if (bch_btree_insert(&s->op, s->c, &s->insert_keys, journal_ref)) { + if (bch_btree_insert(&s->op, s->c, &s->insert_keys, + journal_ref, replace_key)) { s->error = -ENOMEM; s->insert_data_done = true; } @@ -1056,7 +1058,7 @@ static void cached_dev_read_done(struct closure *cl) if (s->cache_bio && !test_bit(CACHE_SET_STOPPING, &s->c->flags)) { - s->op.type = BTREE_REPLACE; + BUG_ON(!s->replace); closure_call(&s->btree, bch_data_insert, NULL, cl); } @@ -1101,13 +1103,15 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, s->cache_bio_sectors = min(sectors, bio_sectors(bio) + reada); - s->op.replace = KEY(s->inode, bio->bi_sector + - s->cache_bio_sectors, s->cache_bio_sectors); + s->replace_key = KEY(s->inode, bio->bi_sector + + s->cache_bio_sectors, s->cache_bio_sectors); - ret = bch_btree_insert_check_key(b, &s->op, &s->op.replace); + ret = bch_btree_insert_check_key(b, &s->op, &s->replace_key); if (ret) return ret; + s->replace = true; + miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); /* btree_search_recurse()'s btree iterator is no good anymore */ diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index ed578aa53ee2..5ea7a0e6fca0 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -33,6 +33,7 @@ struct search { unsigned flush_journal:1; unsigned insert_data_done:1; + unsigned replace:1; uint16_t write_prio; @@ -44,6 +45,7 @@ struct search { /* Anything past this point won't get zeroed in search_alloc() */ struct keylist insert_keys; + BKEY_PADDED(replace_key); }; unsigned bch_get_congested(struct cache_set *); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index d0968e8938f7..346a5341faca 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -146,16 +146,14 @@ static void write_dirty_finish(struct closure *cl) bch_btree_op_init(&op, -1); bch_keylist_init(&keys); - op.type = BTREE_REPLACE; - bkey_copy(&op.replace, &w->key); - - SET_KEY_DIRTY(&w->key, false); - bch_keylist_add(&keys, &w->key); + bkey_copy(keys.top, &w->key); + SET_KEY_DIRTY(keys.top, false); + bch_keylist_push(&keys); for (i = 0; i < KEY_PTRS(&w->key); i++) atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); - bch_btree_insert(&op, dc->disk.c, &keys, NULL); + bch_btree_insert(&op, dc->disk.c, &keys, NULL, &w->key); if (op.insert_collision) trace_bcache_writeback_collision(&w->key); -- cgit v1.2.3 From 6054c6d4da1940c7bf8870c6393773aa794f53d8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 18:06:22 -0700 Subject: bcache: Don't use op->insert_collision When we convert bch_btree_insert() to bch_btree_map_leaf_nodes(), we won't be passing struct btree_op to bch_btree_insert() anymore - so we need a different way of returning whether there was a collision (really, a replace collision). 
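A sketch of the calling convention this series settles on, assuming -ESRCH remains the errno used for a replace collision (as in the bch_btree_insert() hunk below); the two handler functions are hypothetical placeholders:

	ret = bch_btree_insert(&op, c, &keys, journal_ref, replace_key);
	if (ret == -ESRCH)
		handle_collision();	/* replace key vanished: a collision, not an IO error */
	else if (ret)
		handle_error(ret);	/* genuine failure, e.g. -ENOMEM */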
Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 3 +++ drivers/md/bcache/movinggc.c | 2 +- drivers/md/bcache/request.c | 10 +++++++--- drivers/md/bcache/request.h | 1 + drivers/md/bcache/writeback.c | 7 ++++--- 5 files changed, 16 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 7a1d8dc19e61..441524dd2d77 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2227,6 +2227,9 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c, } } + if (op->insert_collision) + return -ESRCH; + return ret; } diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index c45ba4f21bae..601c96a62b30 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -52,7 +52,7 @@ static void write_moving_finish(struct closure *cl) bio_for_each_segment_all(bv, bio, i) __free_page(bv->bv_page); - if (io->s.op.insert_collision) + if (io->s.insert_collision) trace_bcache_gc_copy_collision(&io->w->key); bch_keybuf_del(&io->s.c->moving_gc_keys, io->w); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 16a3e16f3ff4..bcce06a1e466 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -218,6 +218,7 @@ static void bch_data_insert_keys(struct closure *cl) struct search *s = container_of(cl, struct search, btree); atomic_t *journal_ref = NULL; struct bkey *replace_key = s->replace ? &s->replace_key : NULL; + int ret; /* * If we're looping, might already be waiting on @@ -236,8 +237,11 @@ static void bch_data_insert_keys(struct closure *cl) s->flush_journal ? &s->cl : NULL); - if (bch_btree_insert(&s->op, s->c, &s->insert_keys, - journal_ref, replace_key)) { + ret = bch_btree_insert(&s->op, s->c, &s->insert_keys, + journal_ref, replace_key); + if (ret == -ESRCH) { + s->insert_collision = true; + } else if (ret) { s->error = -ENOMEM; s->insert_data_done = true; } @@ -977,7 +981,7 @@ static void cached_dev_cache_miss_done(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); - if (s->op.insert_collision) + if (s->insert_collision) bch_mark_cache_miss_collision(s); if (s->cache_bio) { diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 5ea7a0e6fca0..f0e930b4ca89 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -34,6 +34,7 @@ struct search { unsigned insert_data_done:1; unsigned replace:1; + unsigned insert_collision:1; uint16_t write_prio; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 346a5341faca..312032ef34f7 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -142,6 +142,7 @@ static void write_dirty_finish(struct closure *cl) unsigned i; struct btree_op op; struct keylist keys; + int ret; bch_btree_op_init(&op, -1); bch_keylist_init(&keys); @@ -153,12 +154,12 @@ static void write_dirty_finish(struct closure *cl) for (i = 0; i < KEY_PTRS(&w->key); i++) atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); - bch_btree_insert(&op, dc->disk.c, &keys, NULL, &w->key); + ret = bch_btree_insert(&op, dc->disk.c, &keys, NULL, &w->key); - if (op.insert_collision) + if (ret) trace_bcache_writeback_collision(&w->key); - atomic_long_inc(op.insert_collision + atomic_long_inc(ret ? 
&dc->disk.c->writeback_keys_failed : &dc->disk.c->writeback_keys_done); } -- cgit v1.2.3 From cc7b8819212f437fc82f0f9cdc24deb0fb5d775f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 18:07:22 -0700 Subject: bcache: Convert bch_btree_insert() to bch_btree_map_leaf_nodes() Last of the btree_map() conversions. Main visible effect is that bch_btree_insert() no longer takes a struct btree_op as an argument - there's no fancy state machine stuff going on, it's just a normal function. Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 79 ++++++++++++++++++++----------------- drivers/md/bcache/btree.h | 4 +-- drivers/md/bcache/journal.c | 4 +-- drivers/md/bcache/request.c | 2 +- drivers/md/bcache/writeback.c | 6 ++-- 5 files changed, 43 insertions(+), 52 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 441524dd2d77..f5aa4adadf1d 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2174,61 +2174,56 @@ out: return ret; } -static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, - struct keylist *keys, atomic_t *journal_ref, - struct bkey *replace_key) -{ - if (bch_keylist_empty(keys)) - return 0; - - if (b->level) { - struct bkey *k; - - k = bch_next_recurse_key(b, &START_KEY(keys->keys)); - if (!k) { - btree_bug(b, "no key to recurse on at level %i/%i", - b->level, b->c->root->level); +struct btree_insert_op { + struct btree_op op; + struct keylist *keys; + atomic_t *journal_ref; + struct bkey *replace_key; +}; - bch_keylist_reset(keys); - return -EIO; - } +int btree_insert_fn(struct btree_op *b_op, struct btree *b) +{ + struct btree_insert_op *op = container_of(b_op, + struct btree_insert_op, op); - return btree(insert_recurse, k, b, op, keys, - journal_ref, replace_key); - } else { - return bch_btree_insert_node(b, op, keys, - journal_ref, replace_key); - } + int ret = bch_btree_insert_node(b, &op->op, op->keys, + op->journal_ref, op->replace_key); + if (ret && !bch_keylist_empty(op->keys)) + return ret; + else + return MAP_DONE; } -int bch_btree_insert(struct btree_op *op, struct cache_set *c, - struct keylist *keys, atomic_t *journal_ref, - struct bkey *replace_key) +int bch_btree_insert(struct cache_set *c, struct keylist *keys, + atomic_t *journal_ref, struct bkey *replace_key) { + struct btree_insert_op op; int ret = 0; + BUG_ON(current->bio_list); BUG_ON(bch_keylist_empty(keys)); - while (!bch_keylist_empty(keys)) { - op->lock = 0; - ret = btree_root(insert_recurse, c, op, keys, - journal_ref, replace_key); + bch_btree_op_init(&op.op, 0); + op.keys = keys; + op.journal_ref = journal_ref; + op.replace_key = replace_key; - if (ret == -EAGAIN) { - BUG(); - ret = 0; - } else if (ret) { - struct bkey *k; + while (!ret && !bch_keylist_empty(keys)) { + op.op.lock = 0; + ret = bch_btree_map_leaf_nodes(&op.op, c, + &START_KEY(keys->keys), + btree_insert_fn); + } - pr_err("error %i", ret); + if (ret) { + struct bkey *k; - while ((k = bch_keylist_pop(keys))) - bkey_put(c, k, 0); - } - } + pr_err("error %i", ret); - if (op->insert_collision) - return -ESRCH; + while ((k = bch_keylist_pop(keys))) + bkey_put(c, k, 0); + } else if (op.op.insert_collision) + ret = -ESRCH; return ret; } diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 6ff08be3e0c9..8fc1e8925399 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -281,8 +281,8 @@ struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); int
bch_btree_insert_check_key(struct btree *, struct btree_op *, struct bkey *); -int bch_btree_insert(struct btree_op *, struct cache_set *, - struct keylist *, atomic_t *, struct bkey *); +int bch_btree_insert(struct cache_set *, struct keylist *, + atomic_t *, struct bkey *); int bch_gc_thread_start(struct cache_set *); size_t bch_btree_gc_finish(struct cache_set *); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 592adf51128f..86de64a6bf26 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -302,10 +302,8 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) uint64_t start = i->j.last_seq, end = i->j.seq, n = start; struct keylist keylist; - struct btree_op op; bch_keylist_init(&keylist); - bch_btree_op_init(&op, SHRT_MAX); list_for_each_entry(i, list, list) { BUG_ON(i->pin && atomic_read(i->pin) != 1); @@ -322,7 +320,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) bkey_copy(keylist.top, k); bch_keylist_push(&keylist); - ret = bch_btree_insert(&op, s, &keylist, i->pin, NULL); + ret = bch_btree_insert(s, &keylist, i->pin, NULL); if (ret) goto err; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index bcce06a1e466..6cee2ae1d87f 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -237,7 +237,7 @@ static void bch_data_insert_keys(struct closure *cl) s->flush_journal ? &s->cl : NULL); - ret = bch_btree_insert(&s->op, s->c, &s->insert_keys, + ret = bch_btree_insert(s->c, &s->insert_keys, journal_ref, replace_key); if (ret == -ESRCH) { s->insert_collision = true; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 312032ef34f7..ab0f6b449111 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -139,12 +139,10 @@ static void write_dirty_finish(struct closure *cl) /* This is kind of a dumb way of signalling errors. */ if (KEY_DIRTY(&w->key)) { + int ret; unsigned i; - struct btree_op op; struct keylist keys; - int ret; - bch_btree_op_init(&op, -1); bch_keylist_init(&keys); bkey_copy(keys.top, &w->key); @@ -154,7 +152,7 @@ static void write_dirty_finish(struct closure *cl) for (i = 0; i < KEY_PTRS(&w->key); i++) atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); - ret = bch_btree_insert(&op, dc->disk.c, &keys, NULL, &w->key); + ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key); if (ret) trace_bcache_writeback_collision(&w->key); -- cgit v1.2.3 From 220bb38c21b83e2f7b842f33220bf727093eca89 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 19:02:45 -0700 Subject: bcache: Break up struct search With all the recent refactoring around struct btree_op, struct search has gotten rather large.
But we can now easily break it up in a different way - we break out struct data_insert_op which is for inserting data into the cache, and that's now what the copying gc code uses - struct search is now specific to request.c Signed-off-by: Kent Overstreet --- drivers/md/bcache/debug.c | 38 +-- drivers/md/bcache/debug.h | 4 +- drivers/md/bcache/movinggc.c | 56 ++-- drivers/md/bcache/request.c | 580 ++++++++++++++++++++++-------------------- drivers/md/bcache/request.h | 37 +-- drivers/md/bcache/stats.c | 26 +- drivers/md/bcache/stats.h | 13 +- drivers/md/bcache/trace.c | 1 - include/trace/events/bcache.h | 18 +- 9 files changed, 378 insertions(+), 395 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 88e6411eab4f..d9ccb3169aa2 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -8,7 +8,6 @@ #include "bcache.h" #include "btree.h" #include "debug.h" -#include "request.h" #include #include @@ -176,42 +175,25 @@ void bch_btree_verify(struct btree *b, struct bset *new) mutex_unlock(&b->c->verify_lock); } -static void data_verify_endio(struct bio *bio, int error) -{ - struct closure *cl = bio->bi_private; - closure_put(cl); -} - -void bch_data_verify(struct search *s) +void bch_data_verify(struct cached_dev *dc, struct bio *bio) { char name[BDEVNAME_SIZE]; - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - struct closure *cl = &s->cl; struct bio *check; struct bio_vec *bv; int i; - if (!s->unaligned_bvec) - bio_for_each_segment(bv, s->orig_bio, i) - bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; - - check = bio_clone(s->orig_bio, GFP_NOIO); + check = bio_clone(bio, GFP_NOIO); if (!check) return; if (bio_alloc_pages(check, GFP_NOIO)) goto out_put; - check->bi_rw = READ_SYNC; - check->bi_private = cl; - check->bi_end_io = data_verify_endio; - - closure_bio_submit(check, cl, &dc->disk); - closure_sync(cl); + submit_bio_wait(READ_SYNC, check); - bio_for_each_segment(bv, s->orig_bio, i) { - void *p1 = kmap(bv->bv_page); - void *p2 = kmap(check->bi_io_vec[i].bv_page); + bio_for_each_segment(bv, bio, i) { + void *p1 = kmap_atomic(bv->bv_page); + void *p2 = page_address(check->bi_io_vec[i].bv_page); if (memcmp(p1 + bv->bv_offset, p2 + bv->bv_offset, @@ -219,13 +201,11 @@ void bch_data_verify(struct search *s) printk(KERN_ERR "bcache (%s): verify failed at sector %llu\n", bdevname(dc->bdev, name), - (uint64_t) s->orig_bio->bi_sector); - - kunmap(bv->bv_page); - kunmap(check->bi_io_vec[i].bv_page); + (uint64_t) bio->bi_sector); + kunmap_atomic(p1); } - __bio_for_each_segment(bv, check, i, 0) + bio_for_each_segment_all(bv, check, i) __free_page(bv->bv_page); out_put: bio_put(check); diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index 1c39b5a2489b..0f4b3440512c 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -29,12 +29,12 @@ void bch_check_keys(struct btree *, const char *, ...); #ifdef CONFIG_BCACHE_DEBUG void bch_btree_verify(struct btree *, struct bset *); -void bch_data_verify(struct search *); +void bch_data_verify(struct cached_dev *, struct bio *); #else /* DEBUG */ static inline void bch_btree_verify(struct btree *b, struct bset *i) {} -static inline void bch_data_verify(struct search *s) {}; +static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}; #endif diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 601c96a62b30..7c1275e66025 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c
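(Illustrative aside: the movinggc.c hunks below lean on the same embed-and-recover idiom as the earlier sectors_dirty_init change - a small generic op struct is embedded in a caller-specific wrapper, and container_of() turns a pointer to the member back into a pointer to the wrapper. A self-contained sketch; the mini_* names are invented, and in the kernel container_of() comes from linux/kernel.h rather than being open-coded:)

#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct mini_op {		/* stands in for data_insert_op */
	int error;
};

struct mini_moving_io {		/* stands in for moving_io */
	struct mini_op op;
	int private_state;
};

static int mini_callback(struct mini_op *op)
{
	/* recover the wrapper from a pointer to its embedded member */
	struct mini_moving_io *io = container_of(op, struct mini_moving_io, op);

	return io->private_state;
}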
@@ -12,8 +12,9 @@ #include struct moving_io { + struct closure cl; struct keybuf_key *w; - struct search s; + struct data_insert_op op; struct bbio bio; }; @@ -38,13 +39,13 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k) static void moving_io_destructor(struct closure *cl) { - struct moving_io *io = container_of(cl, struct moving_io, s.cl); + struct moving_io *io = container_of(cl, struct moving_io, cl); kfree(io); } static void write_moving_finish(struct closure *cl) { - struct moving_io *io = container_of(cl, struct moving_io, s.cl); + struct moving_io *io = container_of(cl, struct moving_io, cl); struct bio *bio = &io->bio.bio; struct bio_vec *bv; int i; @@ -52,12 +53,12 @@ static void write_moving_finish(struct closure *cl) bio_for_each_segment_all(bv, bio, i) __free_page(bv->bv_page); - if (io->s.insert_collision) + if (io->op.replace_collision) trace_bcache_gc_copy_collision(&io->w->key); - bch_keybuf_del(&io->s.c->moving_gc_keys, io->w); + bch_keybuf_del(&io->op.c->moving_gc_keys, io->w); - up(&io->s.c->moving_in_flight); + up(&io->op.c->moving_in_flight); closure_return_with_destructor(cl, moving_io_destructor); } @@ -65,12 +66,12 @@ static void write_moving_finish(struct closure *cl) static void read_moving_endio(struct bio *bio, int error) { struct moving_io *io = container_of(bio->bi_private, - struct moving_io, s.cl); + struct moving_io, cl); if (error) - io->s.error = error; + io->op.error = error; - bch_bbio_endio(io->s.c, bio, error, "reading data to move"); + bch_bbio_endio(io->op.c, bio, error, "reading data to move"); } static void moving_init(struct moving_io *io) @@ -84,32 +85,30 @@ static void moving_init(struct moving_io *io) bio->bi_size = KEY_SIZE(&io->w->key) << 9; bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS); - bio->bi_private = &io->s.cl; + bio->bi_private = &io->cl; bio->bi_io_vec = bio->bi_inline_vecs; bch_bio_map(bio, NULL); } static void write_moving(struct closure *cl) { - struct search *s = container_of(cl, struct search, cl); - struct moving_io *io = container_of(s, struct moving_io, s); + struct moving_io *io = container_of(cl, struct moving_io, cl); + struct data_insert_op *op = &io->op; - if (!s->error) { + if (!op->error) { moving_init(io); - io->bio.bio.bi_sector = KEY_START(&io->w->key); - s->op.lock = -1; - s->write_prio = 1; - s->cache_bio = &io->bio.bio; + io->bio.bio.bi_sector = KEY_START(&io->w->key); + op->write_prio = 1; + op->bio = &io->bio.bio; - s->writeback = KEY_DIRTY(&io->w->key); - s->csum = KEY_CSUM(&io->w->key); + op->writeback = KEY_DIRTY(&io->w->key); + op->csum = KEY_CSUM(&io->w->key); - bkey_copy(&s->replace_key, &io->w->key); - s->replace = true; + bkey_copy(&op->replace_key, &io->w->key); + op->replace = true; - closure_init(&s->btree, cl); - bch_data_insert(&s->btree); + closure_call(&op->cl, bch_data_insert, NULL, cl); } continue_at(cl, write_moving_finish, system_wq); @@ -117,11 +116,10 @@ static void write_moving(struct closure *cl) static void read_moving_submit(struct closure *cl) { - struct search *s = container_of(cl, struct search, cl); - struct moving_io *io = container_of(s, struct moving_io, s); + struct moving_io *io = container_of(cl, struct moving_io, cl); struct bio *bio = &io->bio.bio; - bch_submit_bbio(bio, s->c, &io->w->key, 0); + bch_submit_bbio(bio, io->op.c, &io->w->key, 0); continue_at(cl, write_moving, system_wq); } @@ -151,8 +149,8 @@ static void read_moving(struct cache_set *c) w->private = io; io->w = w; - io->s.inode = KEY_INODE(&w->key); - io->s.c = c; + 
io->op.inode = KEY_INODE(&w->key); + io->op.c = c; moving_init(io); bio = &io->bio.bio; @@ -166,7 +164,7 @@ static void read_moving(struct cache_set *c) trace_bcache_gc_copy(&w->key); down(&c->moving_in_flight); - closure_call(&io->s.cl, read_moving_submit, NULL, &cl); + closure_call(&io->cl, read_moving_submit, NULL, &cl); } if (0) { diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 6cee2ae1d87f..05c7c216f65e 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -215,9 +215,9 @@ static void bio_csum(struct bio *bio, struct bkey *k) static void bch_data_insert_keys(struct closure *cl) { - struct search *s = container_of(cl, struct search, btree); + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); atomic_t *journal_ref = NULL; - struct bkey *replace_key = s->replace ? &s->replace_key : NULL; + struct bkey *replace_key = op->replace ? &op->replace_key : NULL; int ret; /* @@ -232,27 +232,26 @@ static void bch_data_insert_keys(struct closure *cl) closure_sync(&s->cl); #endif - if (s->write) - journal_ref = bch_journal(s->c, &s->insert_keys, - s->flush_journal - ? &s->cl : NULL); + if (!op->replace) + journal_ref = bch_journal(op->c, &op->insert_keys, + op->flush_journal ? cl : NULL); - ret = bch_btree_insert(s->c, &s->insert_keys, + ret = bch_btree_insert(op->c, &op->insert_keys, journal_ref, replace_key); if (ret == -ESRCH) { - s->insert_collision = true; + op->replace_collision = true; } else if (ret) { - s->error = -ENOMEM; - s->insert_data_done = true; + op->error = -ENOMEM; + op->insert_data_done = true; } if (journal_ref) atomic_dec_bug(journal_ref); - if (!s->insert_data_done) + if (!op->insert_data_done) continue_at(cl, bch_data_insert_start, bcache_wq); - bch_keylist_free(&s->insert_keys); + bch_keylist_free(&op->insert_keys); closure_return(cl); } @@ -349,10 +348,10 @@ found: * * If s->writeback is true, will not fail. */ -static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, - struct search *s) +static bool bch_alloc_sectors(struct data_insert_op *op, + struct bkey *k, unsigned sectors) { - struct cache_set *c = s->c; + struct cache_set *c = op->c; struct open_bucket *b; BKEY_PADDED(key) alloc; unsigned i; @@ -367,15 +366,15 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, bkey_init(&alloc.key); spin_lock(&c->data_bucket_lock); - while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) { - unsigned watermark = s->write_prio + while (!(b = pick_data_bucket(c, k, op->task, &alloc.key))) { + unsigned watermark = op->write_prio ? 
WATERMARK_MOVINGGC : WATERMARK_NONE; spin_unlock(&c->data_bucket_lock); if (bch_bucket_alloc_set(c, watermark, &alloc.key, - 1, s->writeback)) + 1, op->writeback)) return false; spin_lock(&c->data_bucket_lock); @@ -409,7 +408,7 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, */ list_move_tail(&b->list, &c->data_buckets); bkey_copy_key(&b->key, k); - b->last = s->task; + b->last = op->task; b->sectors_free -= sectors; @@ -438,8 +437,8 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, static void bch_data_invalidate(struct closure *cl) { - struct search *s = container_of(cl, struct search, btree); - struct bio *bio = s->cache_bio; + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + struct bio *bio = op->bio; pr_debug("invalidating %i sectors from %llu", bio_sectors(bio), (uint64_t) bio->bi_sector); @@ -447,17 +446,17 @@ static void bch_data_invalidate(struct closure *cl) while (bio_sectors(bio)) { unsigned len = min(bio_sectors(bio), 1U << 14); - if (bch_keylist_realloc(&s->insert_keys, 0, s->c)) + if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) goto out; bio->bi_sector += len; bio->bi_size -= len << 9; - bch_keylist_add(&s->insert_keys, - &KEY(s->inode, bio->bi_sector, len)); + bch_keylist_add(&op->insert_keys, + &KEY(op->inode, bio->bi_sector, len)); } - s->insert_data_done = true; + op->insert_data_done = true; bio_put(bio); out: continue_at(cl, bch_data_insert_keys, bcache_wq); @@ -465,7 +464,7 @@ out: static void bch_data_insert_error(struct closure *cl) { - struct search *s = container_of(cl, struct search, btree); + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); /* * Our data write just errored, which means we've got a bunch of keys to @@ -476,9 +475,9 @@ static void bch_data_insert_error(struct closure *cl) * from the keys we'll accomplish just that. */ - struct bkey *src = s->insert_keys.keys, *dst = s->insert_keys.keys; + struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys; - while (src != s->insert_keys.top) { + while (src != op->insert_keys.top) { struct bkey *n = bkey_next(src); SET_KEY_PTRS(src, 0); @@ -488,7 +487,7 @@ static void bch_data_insert_error(struct closure *cl) src = n; } - s->insert_keys.top = dst; + op->insert_keys.top = dst; bch_data_insert_keys(cl); } @@ -496,32 +495,32 @@ static void bch_data_insert_error(struct closure *cl) static void bch_data_insert_endio(struct bio *bio, int error) { struct closure *cl = bio->bi_private; - struct search *s = container_of(cl, struct search, btree); + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); if (error) { /* TODO: We could try to recover from this. 
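For now the policy below is fixed and worth spelling out: a failed writeback write is fatal to the request, since the cache would hold the only copy, so op->error is set and surfaced to the caller; a failed writethrough write falls through to bch_data_insert_error(), which strips the pointers out of the keys already built so reads keep going to the backing device; and a failed cache-miss insert (replace) is simply abandoned.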
*/ - if (s->writeback) - s->error = error; - else if (s->write) + if (op->writeback) + op->error = error; + else if (!op->replace) set_closure_fn(cl, bch_data_insert_error, bcache_wq); else set_closure_fn(cl, NULL, NULL); } - bch_bbio_endio(s->c, bio, error, "writing data to cache"); + bch_bbio_endio(op->c, bio, error, "writing data to cache"); } static void bch_data_insert_start(struct closure *cl) { - struct search *s = container_of(cl, struct search, btree); - struct bio *bio = s->cache_bio, *n; + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); + struct bio *bio = op->bio, *n; - if (s->bypass) + if (op->bypass) return bch_data_invalidate(cl); - if (atomic_sub_return(bio_sectors(bio), &s->c->sectors_to_gc) < 0) { - set_gc_sectors(s->c); - wake_up_gc(s->c); + if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { + set_gc_sectors(op->c); + wake_up_gc(op->c); } /* @@ -533,21 +532,20 @@ static void bch_data_insert_start(struct closure *cl) do { unsigned i; struct bkey *k; - struct bio_set *split = s->d - ? s->d->bio_split : s->c->bio_split; + struct bio_set *split = op->c->bio_split; /* 1 for the device pointer and 1 for the chksum */ - if (bch_keylist_realloc(&s->insert_keys, - 1 + (s->csum ? 1 : 0), - s->c)) + if (bch_keylist_realloc(&op->insert_keys, + 1 + (op->csum ? 1 : 0), + op->c)) continue_at(cl, bch_data_insert_keys, bcache_wq); - k = s->insert_keys.top; + k = op->insert_keys.top; bkey_init(k); - SET_KEY_INODE(k, s->inode); + SET_KEY_INODE(k, op->inode); SET_KEY_OFFSET(k, bio->bi_sector); - if (!bch_alloc_sectors(k, bio_sectors(bio), s)) + if (!bch_alloc_sectors(op, k, bio_sectors(bio))) goto err; n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); @@ -555,30 +553,30 @@ static void bch_data_insert_start(struct closure *cl) n->bi_end_io = bch_data_insert_endio; n->bi_private = cl; - if (s->writeback) { + if (op->writeback) { SET_KEY_DIRTY(k, true); for (i = 0; i < KEY_PTRS(k); i++) - SET_GC_MARK(PTR_BUCKET(s->c, k, i), + SET_GC_MARK(PTR_BUCKET(op->c, k, i), GC_MARK_DIRTY); } - SET_KEY_CSUM(k, s->csum); + SET_KEY_CSUM(k, op->csum); if (KEY_CSUM(k)) bio_csum(n, k); trace_bcache_cache_insert(k); - bch_keylist_push(&s->insert_keys); + bch_keylist_push(&op->insert_keys); n->bi_rw |= REQ_WRITE; - bch_submit_bbio(n, s->c, k, 0); + bch_submit_bbio(n, op->c, k, 0); } while (n != bio); - s->insert_data_done = true; + op->insert_data_done = true; continue_at(cl, bch_data_insert_keys, bcache_wq); err: /* bch_alloc_sectors() blocks if s->writeback = true */ - BUG_ON(s->writeback); + BUG_ON(op->writeback); /* * But if it's not a writeback write we'd rather just bail out if @@ -586,24 +584,24 @@ err: * we might be starving btree writes for gc or something. */ - if (s->write) { + if (!op->replace) { /* * Writethrough write: We can't complete the write until we've * updated the index. But we don't want to delay the write while * we wait for buckets to be freed up, so just invalidate the * rest of the write. */ - s->bypass = true; + op->bypass = true; return bch_data_invalidate(cl); } else { /* * From a cache miss, we can just insert the keys for the data * we have written or bail out if we didn't do anything. 
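Inserting the partial keylist is safe because each key is pushed in the same loop iteration that issues its cache write, so every key already on the list describes sectors that were actually sent to the cache.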
*/ - s->insert_data_done = true; + op->insert_data_done = true; bio_put(bio); - if (!bch_keylist_empty(&s->insert_keys)) + if (!bch_keylist_empty(&op->insert_keys)) continue_at(cl, bch_data_insert_keys, bcache_wq); else closure_return(cl); @@ -631,15 +629,169 @@ err: */ void bch_data_insert(struct closure *cl) { - struct search *s = container_of(cl, struct search, btree); + struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); - bch_keylist_init(&s->insert_keys); - bio_get(s->cache_bio); + trace_bcache_write(op->bio, op->writeback, op->bypass); + + bch_keylist_init(&op->insert_keys); + bio_get(op->bio); bch_data_insert_start(cl); } +/* Congested? */ + +unsigned bch_get_congested(struct cache_set *c) +{ + int i; + long rand; + + if (!c->congested_read_threshold_us && + !c->congested_write_threshold_us) + return 0; + + i = (local_clock_us() - c->congested_last_us) / 1024; + if (i < 0) + return 0; + + i += atomic_read(&c->congested); + if (i >= 0) + return 0; + + i += CONGESTED_MAX; + + if (i > 0) + i = fract_exp_two(i, 6); + + rand = get_random_int(); + i -= bitmap_weight(&rand, BITS_PER_LONG); + + return i > 0 ? i : 1; +} + +static void add_sequential(struct task_struct *t) +{ + ewma_add(t->sequential_io_avg, + t->sequential_io, 8, 0); + + t->sequential_io = 0; +} + +static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) +{ + return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; +} + +static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) +{ + struct cache_set *c = dc->disk.c; + unsigned mode = cache_mode(dc, bio); + unsigned sectors, congested = bch_get_congested(c); + struct task_struct *task = current; + + if (atomic_read(&dc->disk.detaching) || + c->gc_stats.in_use > CUTOFF_CACHE_ADD || + (bio->bi_rw & REQ_DISCARD)) + goto skip; + + if (mode == CACHE_MODE_NONE || + (mode == CACHE_MODE_WRITEAROUND && + (bio->bi_rw & REQ_WRITE))) + goto skip; + + if (bio->bi_sector & (c->sb.block_size - 1) || + bio_sectors(bio) & (c->sb.block_size - 1)) { + pr_debug("skipping unaligned io"); + goto skip; + } + + if (!congested && !dc->sequential_cutoff) + goto rescale; + + if (!congested && + mode == CACHE_MODE_WRITEBACK && + (bio->bi_rw & REQ_WRITE) && + (bio->bi_rw & REQ_SYNC)) + goto rescale; + + if (dc->sequential_merge) { + struct io *i; + + spin_lock(&dc->io_lock); + + hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) + if (i->last == bio->bi_sector && + time_before(jiffies, i->jiffies)) + goto found; + + i = list_first_entry(&dc->io_lru, struct io, lru); + + add_sequential(task); + i->sequential = 0; +found: + if (i->sequential + bio->bi_size > i->sequential) + i->sequential += bio->bi_size; + + i->last = bio_end_sector(bio); + i->jiffies = jiffies + msecs_to_jiffies(5000); + task->sequential_io = i->sequential; + + hlist_del(&i->hash); + hlist_add_head(&i->hash, iohash(dc, i->last)); + list_move_tail(&i->lru, &dc->io_lru); + + spin_unlock(&dc->io_lock); + } else { + task->sequential_io = bio->bi_size; + + add_sequential(task); + } + + sectors = max(task->sequential_io, + task->sequential_io_avg) >> 9; + + if (dc->sequential_cutoff && + sectors >= dc->sequential_cutoff >> 9) { + trace_bcache_bypass_sequential(bio); + goto skip; + } + + if (congested && sectors >= congested) { + trace_bcache_bypass_congested(bio); + goto skip; + } + +rescale: + bch_rescale_priorities(c, bio_sectors(bio)); + return false; +skip: + bch_mark_sectors_bypassed(c, dc, bio_sectors(bio)); + return true; +} + /* Cache lookup */ +struct search { + /* Stack frame for 
bio_complete */ + struct closure cl; + + struct bcache_device *d; + + struct bbio bio; + struct bio *orig_bio; + struct bio *cache_miss; + + unsigned insert_bio_sectors; + + unsigned recoverable:1; + unsigned unaligned_bvec:1; + unsigned write:1; + + unsigned long start_time; + + struct btree_op op; + struct data_insert_op iop; +}; + static void bch_cache_read_endio(struct bio *bio, int error) { struct bbio *b = container_of(bio, struct bbio, bio); @@ -654,13 +806,13 @@ static void bch_cache_read_endio(struct bio *bio, int error) */ if (error) - s->error = error; - else if (ptr_stale(s->c, &b->key, 0)) { - atomic_long_inc(&s->c->cache_read_races); - s->error = -EINTR; + s->iop.error = error; + else if (ptr_stale(s->iop.c, &b->key, 0)) { + atomic_long_inc(&s->iop.c->cache_read_races); + s->iop.error = -EINTR; } - bch_bbio_endio(s->c, bio, error, "reading from cache"); + bch_bbio_endio(s->iop.c, bio, error, "reading from cache"); } /* @@ -674,13 +826,13 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) struct bkey *bio_key; unsigned ptr; - if (bkey_cmp(k, &KEY(s->inode, bio->bi_sector, 0)) <= 0) + if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_sector, 0)) <= 0) return MAP_CONTINUE; - if (KEY_INODE(k) != s->inode || + if (KEY_INODE(k) != s->iop.inode || KEY_START(k) > bio->bi_sector) { unsigned bio_sectors = bio_sectors(bio); - unsigned sectors = KEY_INODE(k) == s->inode + unsigned sectors = KEY_INODE(k) == s->iop.inode ? min_t(uint64_t, INT_MAX, KEY_START(k) - bio->bi_sector) : INT_MAX; @@ -708,8 +860,8 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) bio_key = &container_of(n, struct bbio, bio)->key; bch_bkey_copy_single_ptr(bio_key, k, ptr); - bch_cut_front(&KEY(s->inode, n->bi_sector, 0), bio_key); - bch_cut_back(&KEY(s->inode, bio_end_sector(n), 0), bio_key); + bch_cut_front(&KEY(s->iop.inode, n->bi_sector, 0), bio_key); + bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key); n->bi_end_io = bch_cache_read_endio; n->bi_private = &s->cl; @@ -731,11 +883,11 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) static void cache_lookup(struct closure *cl) { - struct search *s = container_of(cl, struct search, btree); + struct search *s = container_of(cl, struct search, iop.cl); struct bio *bio = &s->bio.bio; - int ret = bch_btree_map_keys(&s->op, s->c, - &KEY(s->inode, bio->bi_sector, 0), + int ret = bch_btree_map_keys(&s->op, s->iop.c, + &KEY(s->iop.inode, bio->bi_sector, 0), cache_lookup_fn, MAP_END_KEY); if (ret == -EAGAIN) continue_at(cl, cache_lookup, bcache_wq); @@ -751,7 +903,7 @@ static void request_endio(struct bio *bio, int error) if (error) { struct search *s = container_of(cl, struct search, cl); - s->error = error; + s->iop.error = error; /* Only cache read errors are recoverable */ s->recoverable = false; } @@ -771,8 +923,8 @@ static void bio_complete(struct search *s) part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); part_stat_unlock(); - trace_bcache_request_end(s, s->orig_bio); - bio_endio(s->orig_bio, s->error); + trace_bcache_request_end(s->d, s->orig_bio); + bio_endio(s->orig_bio, s->iop.error); s->orig_bio = NULL; } } @@ -792,8 +944,8 @@ static void search_free(struct closure *cl) struct search *s = container_of(cl, struct search, cl); bio_complete(s); - if (s->cache_bio) - bio_put(s->cache_bio); + if (s->iop.bio) + bio_put(s->iop.bio); if (s->unaligned_bvec) mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); @@ -808,18 +960,18 @@ static struct 
search *search_alloc(struct bio *bio, struct bcache_device *d) struct bio_vec *bv; s = mempool_alloc(d->c->search, GFP_NOIO); - memset(s, 0, offsetof(struct search, insert_keys)); + memset(s, 0, offsetof(struct search, iop.insert_keys)); __closure_init(&s->cl, NULL); - s->inode = d->id; - s->c = d->c; + s->iop.inode = d->id; + s->iop.c = d->c; s->d = d; s->op.lock = -1; - s->task = current; + s->iop.task = current; s->orig_bio = bio; s->write = (bio->bi_rw & REQ_WRITE) != 0; - s->flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; + s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; s->recoverable = 1; s->start_time = jiffies; do_bio_hook(s); @@ -847,148 +999,20 @@ static void cached_dev_bio_complete(struct closure *cl) cached_dev_put(dc); } -unsigned bch_get_congested(struct cache_set *c) -{ - int i; - long rand; - - if (!c->congested_read_threshold_us && - !c->congested_write_threshold_us) - return 0; - - i = (local_clock_us() - c->congested_last_us) / 1024; - if (i < 0) - return 0; - - i += atomic_read(&c->congested); - if (i >= 0) - return 0; - - i += CONGESTED_MAX; - - if (i > 0) - i = fract_exp_two(i, 6); - - rand = get_random_int(); - i -= bitmap_weight(&rand, BITS_PER_LONG); - - return i > 0 ? i : 1; -} - -static void add_sequential(struct task_struct *t) -{ - ewma_add(t->sequential_io_avg, - t->sequential_io, 8, 0); - - t->sequential_io = 0; -} - -static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) -{ - return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; -} - -static bool check_should_bypass(struct cached_dev *dc, struct search *s) -{ - struct cache_set *c = s->c; - struct bio *bio = &s->bio.bio; - unsigned mode = cache_mode(dc, bio); - unsigned sectors, congested = bch_get_congested(c); - - if (atomic_read(&dc->disk.detaching) || - c->gc_stats.in_use > CUTOFF_CACHE_ADD || - (bio->bi_rw & REQ_DISCARD)) - goto skip; - - if (mode == CACHE_MODE_NONE || - (mode == CACHE_MODE_WRITEAROUND && - (bio->bi_rw & REQ_WRITE))) - goto skip; - - if (bio->bi_sector & (c->sb.block_size - 1) || - bio_sectors(bio) & (c->sb.block_size - 1)) { - pr_debug("skipping unaligned io"); - goto skip; - } - - if (!congested && !dc->sequential_cutoff) - goto rescale; - - if (!congested && - mode == CACHE_MODE_WRITEBACK && - (bio->bi_rw & REQ_WRITE) && - (bio->bi_rw & REQ_SYNC)) - goto rescale; - - if (dc->sequential_merge) { - struct io *i; - - spin_lock(&dc->io_lock); - - hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) - if (i->last == bio->bi_sector && - time_before(jiffies, i->jiffies)) - goto found; - - i = list_first_entry(&dc->io_lru, struct io, lru); - - add_sequential(s->task); - i->sequential = 0; -found: - if (i->sequential + bio->bi_size > i->sequential) - i->sequential += bio->bi_size; - - i->last = bio_end_sector(bio); - i->jiffies = jiffies + msecs_to_jiffies(5000); - s->task->sequential_io = i->sequential; - - hlist_del(&i->hash); - hlist_add_head(&i->hash, iohash(dc, i->last)); - list_move_tail(&i->lru, &dc->io_lru); - - spin_unlock(&dc->io_lock); - } else { - s->task->sequential_io = bio->bi_size; - - add_sequential(s->task); - } - - sectors = max(s->task->sequential_io, - s->task->sequential_io_avg) >> 9; - - if (dc->sequential_cutoff && - sectors >= dc->sequential_cutoff >> 9) { - trace_bcache_bypass_sequential(s->orig_bio); - goto skip; - } - - if (congested && sectors >= congested) { - trace_bcache_bypass_congested(s->orig_bio); - goto skip; - } - -rescale: - bch_rescale_priorities(c, bio_sectors(bio)); - return false; -skip: - 
bch_mark_sectors_bypassed(s, bio_sectors(bio)); - return true; -} - /* Process reads */ static void cached_dev_cache_miss_done(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); - if (s->insert_collision) - bch_mark_cache_miss_collision(s); + if (s->iop.replace_collision) + bch_mark_cache_miss_collision(s->iop.c, s->d); - if (s->cache_bio) { + if (s->iop.bio) { int i; struct bio_vec *bv; - bio_for_each_segment_all(bv, s->cache_bio, i) + bio_for_each_segment_all(bv, s->iop.bio, i) __free_page(bv->bv_page); } @@ -1006,7 +1030,7 @@ static void cached_dev_read_error(struct closure *cl) /* Retry from the backing device: */ trace_bcache_read_retry(s->orig_bio); - s->error = 0; + s->iop.error = 0; bv = s->bio.bio.bi_io_vec; do_bio_hook(s); s->bio.bio.bi_io_vec = bv; @@ -1041,29 +1065,28 @@ static void cached_dev_read_done(struct closure *cl) * to the buffers the original bio pointed to: */ - if (s->cache_bio) { - bio_reset(s->cache_bio); - s->cache_bio->bi_sector = - s->cache_miss->bi_sector; - s->cache_bio->bi_bdev = s->cache_miss->bi_bdev; - s->cache_bio->bi_size = s->cache_bio_sectors << 9; - bch_bio_map(s->cache_bio, NULL); + if (s->iop.bio) { + bio_reset(s->iop.bio); + s->iop.bio->bi_sector = s->cache_miss->bi_sector; + s->iop.bio->bi_bdev = s->cache_miss->bi_bdev; + s->iop.bio->bi_size = s->insert_bio_sectors << 9; + bch_bio_map(s->iop.bio, NULL); - bio_copy_data(s->cache_miss, s->cache_bio); + bio_copy_data(s->cache_miss, s->iop.bio); bio_put(s->cache_miss); s->cache_miss = NULL; } - if (verify(dc, &s->bio.bio) && s->recoverable) - bch_data_verify(s); + if (verify(dc, &s->bio.bio) && s->recoverable && !s->unaligned_bvec) + bch_data_verify(dc, s->orig_bio); bio_complete(s); - if (s->cache_bio && - !test_bit(CACHE_SET_STOPPING, &s->c->flags)) { - BUG_ON(!s->replace); - closure_call(&s->btree, bch_data_insert, NULL, cl); + if (s->iop.bio && + !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) { + BUG_ON(!s->iop.replace); + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); } continue_at(cl, cached_dev_cache_miss_done, NULL); @@ -1074,12 +1097,13 @@ static void cached_dev_read_done_bh(struct closure *cl) struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - bch_mark_cache_accounting(s, !s->cache_miss, s->bypass); - trace_bcache_read(s->orig_bio, !s->cache_miss, s->bypass); + bch_mark_cache_accounting(s->iop.c, s->d, + !s->cache_miss, s->iop.bypass); + trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass); - if (s->error) + if (s->iop.error) continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); - else if (s->cache_bio || verify(dc, &s->bio.bio)) + else if (s->iop.bio || verify(dc, &s->bio.bio)) continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); else continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); @@ -1093,7 +1117,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); struct bio *miss, *cache_bio; - if (s->cache_miss || s->bypass) { + if (s->cache_miss || s->iop.bypass) { miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); ret = miss == bio ? 
MAP_DONE : MAP_CONTINUE; goto out_submit; @@ -1101,20 +1125,21 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, if (!(bio->bi_rw & REQ_RAHEAD) && !(bio->bi_rw & REQ_META) && - s->c->gc_stats.in_use < CUTOFF_CACHE_READA) + s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) reada = min_t(sector_t, dc->readahead >> 9, bdev_sectors(bio->bi_bdev) - bio_end_sector(bio)); - s->cache_bio_sectors = min(sectors, bio_sectors(bio) + reada); + s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada); - s->replace_key = KEY(s->inode, bio->bi_sector + - s->cache_bio_sectors, s->cache_bio_sectors); + s->iop.replace_key = KEY(s->iop.inode, + bio->bi_sector + s->insert_bio_sectors, + s->insert_bio_sectors); - ret = bch_btree_insert_check_key(b, &s->op, &s->replace_key); + ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key); if (ret) return ret; - s->replace = true; + s->iop.replace = true; miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); @@ -1122,14 +1147,14 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, ret = miss == bio ? MAP_DONE : -EINTR; cache_bio = bio_alloc_bioset(GFP_NOWAIT, - DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), + DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS), dc->disk.bio_split); if (!cache_bio) goto out_submit; cache_bio->bi_sector = miss->bi_sector; cache_bio->bi_bdev = miss->bi_bdev; - cache_bio->bi_size = s->cache_bio_sectors << 9; + cache_bio->bi_size = s->insert_bio_sectors << 9; cache_bio->bi_end_io = request_endio; cache_bio->bi_private = &s->cl; @@ -1138,8 +1163,11 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) goto out_put; + if (reada) + bch_mark_cache_readahead(s->iop.c, s->d); + s->cache_miss = miss; - s->cache_bio = cache_bio; + s->iop.bio = cache_bio; bio_get(cache_bio); closure_bio_submit(cache_bio, &s->cl, s->d); @@ -1157,7 +1185,7 @@ static void cached_dev_read(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; - closure_call(&s->btree, cache_lookup, NULL, cl); + closure_call(&s->iop.cl, cache_lookup, NULL, cl); continue_at(cl, cached_dev_read_done_bh, NULL); } @@ -1179,7 +1207,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) struct bkey start = KEY(dc->disk.id, bio->bi_sector, 0); struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0); - bch_keybuf_check_overlapping(&s->c->moving_gc_keys, &start, &end); + bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end); down_read_non_owner(&dc->writeback_lock); if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { @@ -1187,8 +1215,8 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) * We overlap with some dirty data undergoing background * writeback, force this write to writeback */ - s->bypass = false; - s->writeback = true; + s->iop.bypass = false; + s->iop.writeback = true; } /* @@ -1199,27 +1227,25 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) * so we still want to call it. 
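Treating the discard as a bypass write routes it through bch_data_invalidate(), which drops the cached range from the index even when the backing device has no discard support and the bio is never forwarded to it.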
*/ if (bio->bi_rw & REQ_DISCARD) - s->bypass = true; + s->iop.bypass = true; if (should_writeback(dc, s->orig_bio, cache_mode(dc, bio), - s->bypass)) { - s->bypass = false; - s->writeback = true; + s->iop.bypass)) { + s->iop.bypass = false; + s->iop.writeback = true; } - trace_bcache_write(s->orig_bio, s->writeback, s->bypass); - - if (s->bypass) { - s->cache_bio = s->orig_bio; - bio_get(s->cache_bio); + if (s->iop.bypass) { + s->iop.bio = s->orig_bio; + bio_get(s->iop.bio); if (!(bio->bi_rw & REQ_DISCARD) || blk_queue_discard(bdev_get_queue(dc->bdev))) closure_bio_submit(bio, cl, s->d); - } else if (s->writeback) { + } else if (s->iop.writeback) { bch_writeback_add(dc); - s->cache_bio = bio; + s->iop.bio = bio; if (bio->bi_rw & REQ_FLUSH) { /* Also need to send a flush to the backing device */ @@ -1234,13 +1260,13 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) closure_bio_submit(flush, cl, s->d); } } else { - s->cache_bio = bio_clone_bioset(bio, GFP_NOIO, - dc->disk.bio_split); + s->iop.bio = bio_clone_bioset(bio, GFP_NOIO, + dc->disk.bio_split); closure_bio_submit(bio, cl, s->d); } - closure_call(&s->btree, bch_data_insert, NULL, cl); + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); continue_at(cl, cached_dev_write_complete, NULL); } @@ -1249,8 +1275,8 @@ static void cached_dev_nodata(struct closure *cl) struct search *s = container_of(cl, struct search, cl); struct bio *bio = &s->bio.bio; - if (s->flush_journal) - bch_journal_meta(s->c, cl); + if (s->iop.flush_journal) + bch_journal_meta(s->iop.c, cl); /* If it's a flush, we send the flush to the backing device too */ closure_bio_submit(bio, cl, s->d); @@ -1277,7 +1303,7 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio) if (cached_dev_get(dc)) { s = search_alloc(bio, d); - trace_bcache_request_start(s, bio); + trace_bcache_request_start(s->d, bio); if (!bio->bi_size) { /* @@ -1288,7 +1314,7 @@ static void cached_dev_make_request(struct request_queue *q, struct bio *bio) cached_dev_nodata, bcache_wq); } else { - s->bypass = check_should_bypass(dc, s); + s->iop.bypass = check_should_bypass(dc, bio); if (rw) cached_dev_write(dc, s); @@ -1378,8 +1404,8 @@ static void flash_dev_nodata(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); - if (s->flush_journal) - bch_journal_meta(s->c, cl); + if (s->iop.flush_journal) + bch_journal_meta(s->iop.c, cl); continue_at(cl, search_free, NULL); } @@ -1400,7 +1426,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) cl = &s->cl; bio = &s->bio.bio; - trace_bcache_request_start(s, bio); + trace_bcache_request_start(s->d, bio); if (!bio->bi_size) { /* @@ -1411,17 +1437,17 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) flash_dev_nodata, bcache_wq); } else if (rw) { - bch_keybuf_check_overlapping(&s->c->moving_gc_keys, + bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &KEY(d->id, bio->bi_sector, 0), &KEY(d->id, bio_end_sector(bio), 0)); - s->bypass = (bio->bi_rw & REQ_DISCARD) != 0; - s->writeback = true; - s->cache_bio = bio; + s->iop.bypass = (bio->bi_rw & REQ_DISCARD) != 0; + s->iop.writeback = true; + s->iop.bio = bio; - closure_call(&s->btree, bch_data_insert, NULL, cl); + closure_call(&s->iop.cl, bch_data_insert, NULL, cl); } else { - closure_call(&s->btree, cache_lookup, NULL, cl); + closure_call(&s->iop.cl, cache_lookup, NULL, cl); } continue_at(cl, search_free, NULL); diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h 
index f0e930b4ca89..54d7de27356f 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -3,46 +3,25 @@ #include -struct search { - /* Stack frame for bio_complete */ +struct data_insert_op { struct closure cl; - struct closure btree; - - struct bcache_device *d; struct cache_set *c; struct task_struct *task; - - struct bbio bio; - struct bio *orig_bio; - struct bio *cache_miss; - - /* Bio to be inserted into the cache */ - struct bio *cache_bio; - unsigned cache_bio_sectors; + struct bio *bio; unsigned inode; + uint16_t write_prio; + short error; - unsigned recoverable:1; - unsigned unaligned_bvec:1; - - unsigned write:1; - unsigned writeback:1; - - unsigned csum:1; unsigned bypass:1; + unsigned writeback:1; unsigned flush_journal:1; + unsigned csum:1; - unsigned insert_data_done:1; unsigned replace:1; - unsigned insert_collision:1; - - uint16_t write_prio; - - /* IO error returned to s->bio */ - short error; - unsigned long start_time; + unsigned replace_collision:1; - struct btree_op op; + unsigned insert_data_done:1; /* Anything past this point won't get zeroed in search_alloc() */ struct keylist insert_keys; diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index ea77263cf7ef..84d0782f702e 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -7,7 +7,6 @@ #include "bcache.h" #include "stats.h" #include "btree.h" -#include "request.h" #include "sysfs.h" /* @@ -196,35 +195,36 @@ static void mark_cache_stats(struct cache_stat_collector *stats, atomic_inc(&stats->cache_bypass_misses); } -void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass) +void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, + bool hit, bool bypass) { - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + struct cached_dev *dc = container_of(d, struct cached_dev, disk); mark_cache_stats(&dc->accounting.collector, hit, bypass); - mark_cache_stats(&s->c->accounting.collector, hit, bypass); + mark_cache_stats(&c->accounting.collector, hit, bypass); #ifdef CONFIG_CGROUP_BCACHE mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass); #endif } -void bch_mark_cache_readahead(struct search *s) +void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d) { - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + struct cached_dev *dc = container_of(d, struct cached_dev, disk); atomic_inc(&dc->accounting.collector.cache_readaheads); - atomic_inc(&s->c->accounting.collector.cache_readaheads); + atomic_inc(&c->accounting.collector.cache_readaheads); } -void bch_mark_cache_miss_collision(struct search *s) +void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d) { - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + struct cached_dev *dc = container_of(d, struct cached_dev, disk); atomic_inc(&dc->accounting.collector.cache_miss_collisions); - atomic_inc(&s->c->accounting.collector.cache_miss_collisions); + atomic_inc(&c->accounting.collector.cache_miss_collisions); } -void bch_mark_sectors_bypassed(struct search *s, int sectors) +void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc, + int sectors) { - struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); - atomic_add(sectors, &s->c->accounting.collector.sectors_bypassed); + atomic_add(sectors, &c->accounting.collector.sectors_bypassed); } void bch_cache_accounting_init(struct 
cache_accounting *acc, diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h index c7c7a8fd29fe..adbff141c887 100644 --- a/drivers/md/bcache/stats.h +++ b/drivers/md/bcache/stats.h @@ -38,7 +38,9 @@ struct cache_accounting { struct cache_stats day; }; -struct search; +struct cache_set; +struct cached_dev; +struct bcache_device; void bch_cache_accounting_init(struct cache_accounting *acc, struct closure *parent); @@ -50,9 +52,10 @@ void bch_cache_accounting_clear(struct cache_accounting *acc); void bch_cache_accounting_destroy(struct cache_accounting *acc); -void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass); -void bch_mark_cache_readahead(struct search *s); -void bch_mark_cache_miss_collision(struct search *s); -void bch_mark_sectors_bypassed(struct search *s, int sectors); +void bch_mark_cache_accounting(struct cache_set *, struct bcache_device *, + bool, bool); +void bch_mark_cache_readahead(struct cache_set *, struct bcache_device *); +void bch_mark_cache_miss_collision(struct cache_set *, struct bcache_device *); +void bch_mark_sectors_bypassed(struct cache_set *, struct cached_dev *, int); #endif /* _BCACHE_STATS_H_ */ diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c index f7b6c197f90f..adbc3df17a80 100644 --- a/drivers/md/bcache/trace.c +++ b/drivers/md/bcache/trace.c @@ -1,6 +1,5 @@ #include "bcache.h" #include "btree.h" -#include "request.h" #include #include diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 5ebda976ea93..32c89b33c391 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -6,11 +6,9 @@ #include -struct search; - DECLARE_EVENT_CLASS(bcache_request, - TP_PROTO(struct search *s, struct bio *bio), - TP_ARGS(s, bio), + TP_PROTO(struct bcache_device *d, struct bio *bio), + TP_ARGS(d, bio), TP_STRUCT__entry( __field(dev_t, dev ) @@ -24,8 +22,8 @@ DECLARE_EVENT_CLASS(bcache_request, TP_fast_assign( __entry->dev = bio->bi_bdev->bd_dev; - __entry->orig_major = s->d->disk->major; - __entry->orig_minor = s->d->disk->first_minor; + __entry->orig_major = d->disk->major; + __entry->orig_minor = d->disk->first_minor; __entry->sector = bio->bi_sector; __entry->orig_sector = bio->bi_sector - 16; __entry->nr_sector = bio->bi_size >> 9; @@ -79,13 +77,13 @@ DECLARE_EVENT_CLASS(btree_node, /* request.c */ DEFINE_EVENT(bcache_request, bcache_request_start, - TP_PROTO(struct search *s, struct bio *bio), - TP_ARGS(s, bio) + TP_PROTO(struct bcache_device *d, struct bio *bio), + TP_ARGS(d, bio) ); DEFINE_EVENT(bcache_request, bcache_request_end, - TP_PROTO(struct search *s, struct bio *bio), - TP_ARGS(s, bio) + TP_PROTO(struct bcache_device *d, struct bio *bio), + TP_ARGS(d, bio) ); DECLARE_EVENT_CLASS(bcache_bio, -- cgit v1.2.3 From 2599b53b7b0ea6103d1661dca74d35480cb8fa1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 18:11:11 -0700 Subject: bcache: Move sector allocator to alloc.c Just reorganizing things a bit. 
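One interface change rides along with the move: bch_alloc_sectors() no longer takes the request's task_struct - callers now pass an opaque write_point, which request.c derives by hashing current with hash_long(), so the allocator keeps its write-stream segregation heuristic without knowing anything about struct search. The following is a minimal userspace sketch of the bucket-picking heuristic only, with hypothetical stand-in types and demo numbers, and without the refill path (the real pick_data_bucket() compares full bkeys and falls back to bch_bucket_alloc_set() when every open bucket is full):

/*
 * Toy model of pick_data_bucket(): prefer the open bucket whose last
 * write was sequential with this one, then one last used by the same
 * write point, then fall back to the LRU head. Types, field names and
 * numbers are illustrative stand-ins, not the kernel's.
 */
#include <stdio.h>

#define NBUCKETS 6

struct ob {
	unsigned long last_key;	/* offset the previous write ended at */
	unsigned write_point;	/* hashed identity of the writer */
	unsigned sectors_free;
};

static struct ob buckets[NBUCKETS];

static struct ob *pick(unsigned long key, unsigned write_point)
{
	struct ob *ret = NULL, *wp_match = NULL;
	int i;

	/* Scan newest-first, like list_for_each_entry_reverse() */
	for (i = NBUCKETS - 1; i >= 0; i--) {
		if (buckets[i].last_key == key) {
			ret = &buckets[i];	/* sequential match wins */
			break;
		}
		if (buckets[i].write_point == write_point)
			wp_match = &buckets[i];
	}

	if (!ret)
		ret = wp_match ? wp_match : &buckets[0];	/* LRU fallback */

	/*
	 * NULL tells the caller to allocate a fresh bucket; the real code
	 * refills ret from a pre-allocated key under the same lock instead.
	 */
	return ret->sectors_free ? ret : NULL;
}

int main(void)
{
	int i;

	for (i = 0; i < NBUCKETS; i++)
		buckets[i] = (struct ob){
			.last_key	= 1000 + i,
			.write_point	= i,
			.sectors_free	= 128,
		};

	printf("sequential write -> bucket %td\n", pick(1003, 99) - buckets);
	printf("same write point -> bucket %td\n", pick(1, 4) - buckets);
	return 0;
}

Since the write point is only a 16-bit hash, two tasks can collide; a collision just merges their write streams into the same open bucket, costing some locality but never correctness.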
Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 180 ++++++++++++++++++++++++++++++++++++++++++ drivers/md/bcache/bcache.h | 4 + drivers/md/bcache/request.c | 186 +------------------------------------------- drivers/md/bcache/request.h | 5 +- 4 files changed, 189 insertions(+), 186 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index b9bd5866055d..4970ddc6a7f6 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -487,8 +487,188 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, return ret; } +/* Sector allocator */ + +struct open_bucket { + struct list_head list; + unsigned last_write_point; + unsigned sectors_free; + BKEY_PADDED(key); +}; + +/* + * We keep multiple buckets open for writes, and try to segregate different + * write streams for better cache utilization: first we look for a bucket where + * the last write to it was sequential with the current write, and failing that + * we look for a bucket that was last used by the same task. + * + * The idea is if you've got multiple tasks pulling data into the cache at the + * same time, you'll get better cache utilization if you try to segregate their + * data and preserve locality. + * + * For example, say you've started Firefox at the same time you're copying a + * bunch of files. Firefox will likely end up being fairly hot and stay in the + * cache awhile, but the data you copied might not be; if you wrote all that + * data to the same buckets it'd get invalidated at the same time. + * + * Both of those tasks will be doing fairly random IO so we can't rely on + * detecting sequential IO to segregate their data, but going off of the task + * should be a sane heuristic. + */ +static struct open_bucket *pick_data_bucket(struct cache_set *c, + const struct bkey *search, + unsigned write_point, + struct bkey *alloc) +{ + struct open_bucket *ret, *ret_task = NULL; + + list_for_each_entry_reverse(ret, &c->data_buckets, list) + if (!bkey_cmp(&ret->key, search)) + goto found; + else if (ret->last_write_point == write_point) + ret_task = ret; + + ret = ret_task ?: list_first_entry(&c->data_buckets, + struct open_bucket, list); +found: + if (!ret->sectors_free && KEY_PTRS(alloc)) { + ret->sectors_free = c->sb.bucket_size; + bkey_copy(&ret->key, alloc); + bkey_init(alloc); + } + + if (!ret->sectors_free) + ret = NULL; + + return ret; +} + +/* + * Allocates some space in the cache to write to, sets k to point to the newly + * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the + * end of the newly allocated space). + * + * May allocate fewer sectors than @sectors; KEY_SIZE(k) indicates how many + * sectors were actually allocated. + * + * If wait is true, will not fail. + */ +bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, + unsigned write_point, unsigned write_prio, bool wait) +{ + struct open_bucket *b; + BKEY_PADDED(key) alloc; + unsigned i; + + /* + * We might have to allocate a new bucket, which we can't do with a + * spinlock held. So if we have to allocate, we drop the lock, allocate + * and then retry. KEY_PTRS() indicates whether alloc points to + * allocated bucket(s). + */ + + bkey_init(&alloc.key); + spin_lock(&c->data_bucket_lock); + + while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { + unsigned watermark = write_prio + ?
WATERMARK_MOVINGGC + : WATERMARK_NONE; + + spin_unlock(&c->data_bucket_lock); + + if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait)) + return false; + + spin_lock(&c->data_bucket_lock); + } + + /* + * If we had to allocate, we might race and not need to allocate the + * second time we call find_data_bucket(). If we allocated a bucket but + * didn't use it, drop the refcount bch_bucket_alloc_set() took: + */ + if (KEY_PTRS(&alloc.key)) + __bkey_put(c, &alloc.key); + + for (i = 0; i < KEY_PTRS(&b->key); i++) + EBUG_ON(ptr_stale(c, &b->key, i)); + + /* Set up the pointer to the space we're allocating: */ + + for (i = 0; i < KEY_PTRS(&b->key); i++) + k->ptr[i] = b->key.ptr[i]; + + sectors = min(sectors, b->sectors_free); + + SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); + SET_KEY_SIZE(k, sectors); + SET_KEY_PTRS(k, KEY_PTRS(&b->key)); + + /* + * Move b to the end of the lru, and keep track of what this bucket was + * last used for: + */ + list_move_tail(&b->list, &c->data_buckets); + bkey_copy_key(&b->key, k); + b->last_write_point = write_point; + + b->sectors_free -= sectors; + + for (i = 0; i < KEY_PTRS(&b->key); i++) { + SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); + + atomic_long_add(sectors, + &PTR_CACHE(c, &b->key, i)->sectors_written); + } + + if (b->sectors_free < c->sb.block_size) + b->sectors_free = 0; + + /* + * k takes refcounts on the buckets it points to until it's inserted + * into the btree, but if we're done with this bucket we just transfer + * get_data_bucket()'s refcount. + */ + if (b->sectors_free) + for (i = 0; i < KEY_PTRS(&b->key); i++) + atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); + + spin_unlock(&c->data_bucket_lock); + return true; +} + /* Init */ +void bch_open_buckets_free(struct cache_set *c) +{ + struct open_bucket *b; + + while (!list_empty(&c->data_buckets)) { + b = list_first_entry(&c->data_buckets, + struct open_bucket, list); + list_del(&b->list); + kfree(b); + } +} + +int bch_open_buckets_alloc(struct cache_set *c) +{ + int i; + + spin_lock_init(&c->data_bucket_lock); + + for (i = 0; i < 6; i++) { + struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); + if (!b) + return -ENOMEM; + + list_add(&b->list, &c->data_buckets); + } + + return 0; +} + int bch_cache_allocator_start(struct cache *ca) { struct task_struct *k = kthread_run(bch_allocator_thread, diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 20fe96c121d9..e32f6fd91755 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -1170,6 +1170,8 @@ int __bch_bucket_alloc_set(struct cache_set *, unsigned, struct bkey *, int, bool); int bch_bucket_alloc_set(struct cache_set *, unsigned, struct bkey *, int, bool); +bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned, + unsigned, unsigned, bool); __printf(2, 3) bool bch_cache_set_error(struct cache_set *, const char *, ...); @@ -1210,6 +1212,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *); void bch_btree_cache_free(struct cache_set *); int bch_btree_cache_alloc(struct cache_set *); void bch_moving_init_cache_set(struct cache_set *); +int bch_open_buckets_alloc(struct cache_set *); +void bch_open_buckets_free(struct cache_set *); int bch_cache_allocator_start(struct cache *ca); int bch_cache_allocator_init(struct cache *ca); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 05c7c216f65e..cf7850a7592c 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -255,186 +255,6 @@ static void 
bch_data_insert_keys(struct closure *cl) closure_return(cl); } -struct open_bucket { - struct list_head list; - struct task_struct *last; - unsigned sectors_free; - BKEY_PADDED(key); -}; - -void bch_open_buckets_free(struct cache_set *c) -{ - struct open_bucket *b; - - while (!list_empty(&c->data_buckets)) { - b = list_first_entry(&c->data_buckets, - struct open_bucket, list); - list_del(&b->list); - kfree(b); - } -} - -int bch_open_buckets_alloc(struct cache_set *c) -{ - int i; - - spin_lock_init(&c->data_bucket_lock); - - for (i = 0; i < 6; i++) { - struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); - if (!b) - return -ENOMEM; - - list_add(&b->list, &c->data_buckets); - } - - return 0; -} - -/* - * We keep multiple buckets open for writes, and try to segregate different - * write streams for better cache utilization: first we look for a bucket where - * the last write to it was sequential with the current write, and failing that - * we look for a bucket that was last used by the same task. - * - * The ideas is if you've got multiple tasks pulling data into the cache at the - * same time, you'll get better cache utilization if you try to segregate their - * data and preserve locality. - * - * For example, say you've starting Firefox at the same time you're copying a - * bunch of files. Firefox will likely end up being fairly hot and stay in the - * cache awhile, but the data you copied might not be; if you wrote all that - * data to the same buckets it'd get invalidated at the same time. - * - * Both of those tasks will be doing fairly random IO so we can't rely on - * detecting sequential IO to segregate their data, but going off of the task - * should be a sane heuristic. - */ -static struct open_bucket *pick_data_bucket(struct cache_set *c, - const struct bkey *search, - struct task_struct *task, - struct bkey *alloc) -{ - struct open_bucket *ret, *ret_task = NULL; - - list_for_each_entry_reverse(ret, &c->data_buckets, list) - if (!bkey_cmp(&ret->key, search)) - goto found; - else if (ret->last == task) - ret_task = ret; - - ret = ret_task ?: list_first_entry(&c->data_buckets, - struct open_bucket, list); -found: - if (!ret->sectors_free && KEY_PTRS(alloc)) { - ret->sectors_free = c->sb.bucket_size; - bkey_copy(&ret->key, alloc); - bkey_init(alloc); - } - - if (!ret->sectors_free) - ret = NULL; - - return ret; -} - -/* - * Allocates some space in the cache to write to, and k to point to the newly - * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the - * end of the newly allocated space). - * - * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many - * sectors were actually allocated. - * - * If s->writeback is true, will not fail. - */ -static bool bch_alloc_sectors(struct data_insert_op *op, - struct bkey *k, unsigned sectors) -{ - struct cache_set *c = op->c; - struct open_bucket *b; - BKEY_PADDED(key) alloc; - unsigned i; - - /* - * We might have to allocate a new bucket, which we can't do with a - * spinlock held. So if we have to allocate, we drop the lock, allocate - * and then retry. KEY_PTRS() indicates whether alloc points to - * allocated bucket(s). - */ - - bkey_init(&alloc.key); - spin_lock(&c->data_bucket_lock); - - while (!(b = pick_data_bucket(c, k, op->task, &alloc.key))) { - unsigned watermark = op->write_prio - ? 
WATERMARK_MOVINGGC - : WATERMARK_NONE; - - spin_unlock(&c->data_bucket_lock); - - if (bch_bucket_alloc_set(c, watermark, &alloc.key, - 1, op->writeback)) - return false; - - spin_lock(&c->data_bucket_lock); - } - - /* - * If we had to allocate, we might race and not need to allocate the - * second time we call find_data_bucket(). If we allocated a bucket but - * didn't use it, drop the refcount bch_bucket_alloc_set() took: - */ - if (KEY_PTRS(&alloc.key)) - __bkey_put(c, &alloc.key); - - for (i = 0; i < KEY_PTRS(&b->key); i++) - EBUG_ON(ptr_stale(c, &b->key, i)); - - /* Set up the pointer to the space we're allocating: */ - - for (i = 0; i < KEY_PTRS(&b->key); i++) - k->ptr[i] = b->key.ptr[i]; - - sectors = min(sectors, b->sectors_free); - - SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); - SET_KEY_SIZE(k, sectors); - SET_KEY_PTRS(k, KEY_PTRS(&b->key)); - - /* - * Move b to the end of the lru, and keep track of what this bucket was - * last used for: - */ - list_move_tail(&b->list, &c->data_buckets); - bkey_copy_key(&b->key, k); - b->last = op->task; - - b->sectors_free -= sectors; - - for (i = 0; i < KEY_PTRS(&b->key); i++) { - SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); - - atomic_long_add(sectors, - &PTR_CACHE(c, &b->key, i)->sectors_written); - } - - if (b->sectors_free < c->sb.block_size) - b->sectors_free = 0; - - /* - * k takes refcounts on the buckets it points to until it's inserted - * into the btree, but if we're done with this bucket we just transfer - * get_data_bucket()'s refcount. - */ - if (b->sectors_free) - for (i = 0; i < KEY_PTRS(&b->key); i++) - atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); - - spin_unlock(&c->data_bucket_lock); - return true; -} - static void bch_data_invalidate(struct closure *cl) { struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); @@ -545,7 +365,9 @@ static void bch_data_insert_start(struct closure *cl) SET_KEY_INODE(k, op->inode); SET_KEY_OFFSET(k, bio->bi_sector); - if (!bch_alloc_sectors(op, k, bio_sectors(bio))) + if (!bch_alloc_sectors(op->c, k, bio_sectors(bio), + op->write_point, op->write_prio, + op->writeback)) goto err; n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); @@ -968,7 +790,7 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d) s->iop.c = d->c; s->d = d; s->op.lock = -1; - s->iop.task = current; + s->iop.write_point = hash_long((unsigned long) current, 16); s->orig_bio = bio; s->write = (bio->bi_rw & REQ_WRITE) != 0; s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 54d7de27356f..2cd65bf073c2 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -6,10 +6,10 @@ struct data_insert_op { struct closure cl; struct cache_set *c; - struct task_struct *task; struct bio *bio; unsigned inode; + uint16_t write_point; uint16_t write_prio; short error; @@ -31,9 +31,6 @@ struct data_insert_op { unsigned bch_get_congested(struct cache_set *); void bch_data_insert(struct closure *cl); -void bch_open_buckets_free(struct cache_set *); -int bch_open_buckets_alloc(struct cache_set *); - void bch_cached_dev_request_init(struct cached_dev *dc); void bch_flash_dev_request_init(struct bcache_device *d); -- cgit v1.2.3 From 81ab4190ac17df41686a37c97f701623276b652a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 31 Oct 2013 15:46:42 -0700 Subject: bcache: Pull on disk data structures out into a separate header Now, the on disk data structures are in a header that can 
be exported to userspace - and having them all centralized is nice too. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 244 +---------------------------- drivers/md/bcache/bset.c | 4 +- drivers/md/bcache/bset.h | 31 ---- drivers/md/bcache/btree.c | 2 +- drivers/md/bcache/journal.c | 4 +- drivers/md/bcache/journal.h | 37 ----- drivers/md/bcache/request.c | 9 +- drivers/md/bcache/super.c | 13 +- drivers/md/bcache/util.h | 10 -- include/uapi/linux/bcache.h | 373 ++++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 387 insertions(+), 340 deletions(-) create mode 100644 include/uapi/linux/bcache.h (limited to 'drivers') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index e32f6fd91755..045cb99f1ca6 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -177,6 +177,7 @@ #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ +#include #include #include #include @@ -210,168 +211,6 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); #define GC_MARK_METADATA 2 BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14); -struct bkey { - uint64_t high; - uint64_t low; - uint64_t ptr[]; -}; - -/* Enough for a key with 6 pointers */ -#define BKEY_PAD 8 - -#define BKEY_PADDED(key) \ - union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; } - -/* Version 0: Cache device - * Version 1: Backing device - * Version 2: Seed pointer into btree node checksum - * Version 3: Cache device with new UUID format - * Version 4: Backing device with data offset - */ -#define BCACHE_SB_VERSION_CDEV 0 -#define BCACHE_SB_VERSION_BDEV 1 -#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 -#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 -#define BCACHE_SB_MAX_VERSION 4 - -#define SB_SECTOR 8 -#define SB_SIZE 4096 -#define SB_LABEL_SIZE 32 -#define SB_JOURNAL_BUCKETS 256U -/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ -#define MAX_CACHES_PER_SET 8 - -#define BDEV_DATA_START_DEFAULT 16 /* sectors */ - -struct cache_sb { - uint64_t csum; - uint64_t offset; /* sector where this sb was written */ - uint64_t version; - - uint8_t magic[16]; - - uint8_t uuid[16]; - union { - uint8_t set_uuid[16]; - uint64_t set_magic; - }; - uint8_t label[SB_LABEL_SIZE]; - - uint64_t flags; - uint64_t seq; - uint64_t pad[8]; - - union { - struct { - /* Cache devices */ - uint64_t nbuckets; /* device size */ - - uint16_t block_size; /* sectors */ - uint16_t bucket_size; /* sectors */ - - uint16_t nr_in_set; - uint16_t nr_this_dev; - }; - struct { - /* Backing devices */ - uint64_t data_offset; - - /* - * block_size from the cache device section is still used by - * backing devices, so don't add anything here until we fix - * things to not need it for backing devices anymore - */ - }; - }; - - uint32_t last_mount; /* time_t */ - - uint16_t first_bucket; - union { - uint16_t njournal_buckets; - uint16_t keys; - }; - uint64_t d[SB_JOURNAL_BUCKETS]; /* journal buckets */ -}; - -BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); -BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); -BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); -#define CACHE_REPLACEMENT_LRU 0U -#define CACHE_REPLACEMENT_FIFO 1U -#define CACHE_REPLACEMENT_RANDOM 2U - -BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); -#define CACHE_MODE_WRITETHROUGH 0U -#define CACHE_MODE_WRITEBACK 1U -#define CACHE_MODE_WRITEAROUND 2U -#define CACHE_MODE_NONE 3U -BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); -#define BDEV_STATE_NONE 0U -#define BDEV_STATE_CLEAN 1U -#define BDEV_STATE_DIRTY 2U -#define 
BDEV_STATE_STALE 3U - -/* Version 1: Seed pointer into btree node checksum - */ -#define BCACHE_BSET_VERSION 1 - -/* - * This is the on disk format for btree nodes - a btree node on disk is a list - * of these; within each set the keys are sorted - */ -struct bset { - uint64_t csum; - uint64_t magic; - uint64_t seq; - uint32_t version; - uint32_t keys; - - union { - struct bkey start[0]; - uint64_t d[0]; - }; -}; - -/* - * On disk format for priorities and gens - see super.c near prio_write() for - * more. - */ -struct prio_set { - uint64_t csum; - uint64_t magic; - uint64_t seq; - uint32_t version; - uint32_t pad; - - uint64_t next_bucket; - - struct bucket_disk { - uint16_t prio; - uint8_t gen; - } __attribute((packed)) data[]; -}; - -struct uuid_entry { - union { - struct { - uint8_t uuid[16]; - uint8_t label[32]; - uint32_t first_reg; - uint32_t last_reg; - uint32_t invalidated; - - uint32_t flags; - /* Size of flash only volumes */ - uint64_t sectors; - }; - - uint8_t pad[128]; - }; -}; - -BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); - #include "journal.h" #include "stats.h" struct search; @@ -868,12 +707,6 @@ static inline bool key_merging_disabled(struct cache_set *c) #endif } -static inline bool SB_IS_BDEV(const struct cache_sb *sb) -{ - return sb->version == BCACHE_SB_VERSION_BDEV - || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; -} - struct bbio { unsigned submit_time_us; union { @@ -927,59 +760,6 @@ static inline unsigned local_clock_us(void) #define prio_buckets(c) \ DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c)) -#define JSET_MAGIC 0x245235c1a3625032ULL -#define PSET_MAGIC 0x6750e15f87337f91ULL -#define BSET_MAGIC 0x90135c78b99e07f5ULL - -#define jset_magic(c) ((c)->sb.set_magic ^ JSET_MAGIC) -#define pset_magic(c) ((c)->sb.set_magic ^ PSET_MAGIC) -#define bset_magic(c) ((c)->sb.set_magic ^ BSET_MAGIC) - -/* Bkey fields: all units are in sectors */ - -#define KEY_FIELD(name, field, offset, size) \ - BITMASK(name, struct bkey, field, offset, size) - -#define PTR_FIELD(name, offset, size) \ - static inline uint64_t name(const struct bkey *k, unsigned i) \ - { return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); } \ - \ - static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\ - { \ - k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset); \ - k->ptr[i] |= v << offset; \ - } - -KEY_FIELD(KEY_PTRS, high, 60, 3) -KEY_FIELD(HEADER_SIZE, high, 58, 2) -KEY_FIELD(KEY_CSUM, high, 56, 2) -KEY_FIELD(KEY_PINNED, high, 55, 1) -KEY_FIELD(KEY_DIRTY, high, 36, 1) - -KEY_FIELD(KEY_SIZE, high, 20, 16) -KEY_FIELD(KEY_INODE, high, 0, 20) - -/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */ - -static inline uint64_t KEY_OFFSET(const struct bkey *k) -{ - return k->low; -} - -static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v) -{ - k->low = v; -} - -PTR_FIELD(PTR_DEV, 51, 12) -PTR_FIELD(PTR_OFFSET, 8, 43) -PTR_FIELD(PTR_GEN, 0, 8) - -#define PTR_CHECK_DEV ((1 << 12) - 1) - -#define PTR(gen, offset, dev) \ - ((((uint64_t) dev) << 51) | ((uint64_t) offset) << 8 | gen) - static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) { return s >> c->bucket_bits; @@ -1018,31 +798,11 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c, /* Btree key macros */ -/* - * The high bit being set is a relic from when we used it to do binary - * searches - it told you where a key started. It's not used anymore, - * and can probably be safely dropped. 
- */ -#define KEY(dev, sector, len) \ -((struct bkey) { \ - .high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev), \ - .low = (sector) \ -}) - static inline void bkey_init(struct bkey *k) { - *k = KEY(0, 0, 0); + *k = ZERO_KEY; } -#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) -#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) - -#define MAX_KEY_INODE (~(~0 << 20)) -#define MAX_KEY_OFFSET (((uint64_t) ~0) >> 1) -#define MAX_KEY KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0) - -#define ZERO_KEY KEY(0, 0, 0) - /* * This is used for various on disk data structures - cache_sb, prio_set, bset, * jset: The checksum is _always_ the first 8 bytes of these structs diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index f7b5525ddafa..7b8713c66050 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -684,7 +684,7 @@ void bch_bset_init_next(struct btree *b) } else get_random_bytes(&i->seq, sizeof(uint64_t)); - i->magic = bset_magic(b->c); + i->magic = bset_magic(&b->c->sb); i->version = 0; i->keys = 0; @@ -1034,7 +1034,7 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, * memcpy() */ - out->magic = bset_magic(b->c); + out->magic = bset_magic(&b->c->sb); out->seq = b->sets[0].data->seq; out->version = b->sets[0].data->version; swap(out, b->sets[0].data); diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 8a9305685b7e..5cd90565dfe2 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -193,37 +193,6 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l, : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); } -static inline size_t bkey_u64s(const struct bkey *k) -{ - BUG_ON(KEY_CSUM(k) > 1); - return 2 + KEY_PTRS(k) + (KEY_CSUM(k) ? 1 : 0); -} - -static inline size_t bkey_bytes(const struct bkey *k) -{ - return bkey_u64s(k) * sizeof(uint64_t); -} - -static inline void bkey_copy(struct bkey *dest, const struct bkey *src) -{ - memcpy(dest, src, bkey_bytes(src)); -} - -static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src) -{ - if (!src) - src = &KEY(0, 0, 0); - - SET_KEY_INODE(dest, KEY_INODE(src)); - SET_KEY_OFFSET(dest, KEY_OFFSET(src)); -} - -static inline struct bkey *bkey_next(const struct bkey *k) -{ - uint64_t *d = (void *) k; - return (struct bkey *) (d + bkey_u64s(k)); -} - /* Keylists */ struct keylist { diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index f5aa4adadf1d..aba787d954e5 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -231,7 +231,7 @@ static void bch_btree_node_read_done(struct btree *b) goto err; err = "bad magic"; - if (i->magic != bset_magic(b->c)) + if (i->magic != bset_magic(&b->c->sb)) goto err; err = "bad checksum"; diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 86de64a6bf26..ecdaa671bd50 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -74,7 +74,7 @@ reread: left = ca->sb.bucket_size - offset; struct list_head *where; size_t blocks, bytes = set_bytes(j); - if (j->magic != jset_magic(ca->set)) + if (j->magic != jset_magic(&ca->sb)) return ret; if (bytes > left << 9) @@ -596,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl) for_each_cache(ca, c, i) w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; - w->data->magic = jset_magic(c); + w->data->magic = jset_magic(&c->sb); w->data->version = BCACHE_JSET_VERSION; w->data->last_seq = last_seq(&c->journal); w->data->csum = csum_set(w->data); diff --git 
a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index 5e9edb9ef376..a6472fda94b2 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -75,43 +75,6 @@ * nodes that are pinning the oldest journal entries first. */ -#define BCACHE_JSET_VERSION_UUIDv1 1 -/* Always latest UUID format */ -#define BCACHE_JSET_VERSION_UUID 1 -#define BCACHE_JSET_VERSION 1 - -/* - * On disk format for a journal entry: - * seq is monotonically increasing; every journal entry has its own unique - * sequence number. - * - * last_seq is the oldest journal entry that still has keys the btree hasn't - * flushed to disk yet. - * - * version is for on disk format changes. - */ -struct jset { - uint64_t csum; - uint64_t magic; - uint64_t seq; - uint32_t version; - uint32_t keys; - - uint64_t last_seq; - - BKEY_PADDED(uuid_bucket); - BKEY_PADDED(btree_root); - uint16_t btree_level; - uint16_t pad[3]; - - uint64_t prio_bucket[MAX_CACHES_PER_SET]; - - union { - struct bkey start[0]; - uint64_t d[0]; - }; -}; - /* * Only used for holding the journal entries we read in btree_journal_read() * during cache_registration diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index cf7850a7592c..932300f18973 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -264,16 +264,17 @@ static void bch_data_invalidate(struct closure *cl) bio_sectors(bio), (uint64_t) bio->bi_sector); while (bio_sectors(bio)) { - unsigned len = min(bio_sectors(bio), 1U << 14); + unsigned sectors = min(bio_sectors(bio), + 1U << (KEY_SIZE_BITS - 1)); if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) goto out; - bio->bi_sector += len; - bio->bi_size -= len << 9; + bio->bi_sector += sectors; + bio->bi_size -= sectors << 9; bch_keylist_add(&op->insert_keys, - &KEY(op->inode, bio->bi_sector, len)); + &KEY(op->inode, bio->bi_sector, sectors)); } op->insert_data_done = true; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a314c771263f..c67d19a8913d 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -45,15 +45,6 @@ const char * const bch_cache_modes[] = { NULL }; -struct uuid_entry_v0 { - uint8_t uuid[16]; - uint8_t label[32]; - uint32_t first_reg; - uint32_t last_reg; - uint32_t invalidated; - uint32_t pad; -}; - static struct kobject *bcache_kobj; struct mutex bch_register_lock; LIST_HEAD(bch_cache_sets); @@ -562,7 +553,7 @@ void bch_prio_write(struct cache *ca) } p->next_bucket = ca->prio_buckets[i + 1]; - p->magic = pset_magic(ca); + p->magic = pset_magic(&ca->sb); p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true); @@ -613,7 +604,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8)) pr_warn("bad csum reading priorities"); - if (p->magic != pset_magic(ca)) + if (p->magic != pset_magic(&ca->sb)) pr_warn("bad magic reading priorities"); bucket = p->next_bucket; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index ea345c6896f4..38ae7a4ce928 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -27,16 +27,6 @@ struct closure; #endif -#define BITMASK(name, type, field, offset, size) \ -static inline uint64_t name(const type *k) \ -{ return (k->field >> offset) & ~(((uint64_t) ~0) << size); } \ - \ -static inline void SET_##name(type *k, uint64_t v) \ -{ \ - k->field &= ~(~((uint64_t) ~0 << size) << offset); \ - k->field |= v << offset; \ -} - #define DECLARE_HEAP(type, name) \ struct { \ 
size_t size, used; \ diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h new file mode 100644 index 000000000000..164a7e263988 --- /dev/null +++ b/include/uapi/linux/bcache.h @@ -0,0 +1,373 @@ +#ifndef _LINUX_BCACHE_H +#define _LINUX_BCACHE_H + +/* + * Bcache on disk data structures + */ + +#include <asm/types.h> + +#define BITMASK(name, type, field, offset, size) \ +static inline __u64 name(const type *k) \ +{ return (k->field >> offset) & ~(~0ULL << size); } \ + \ +static inline void SET_##name(type *k, __u64 v) \ +{ \ + k->field &= ~(~(~0ULL << size) << offset); \ + k->field |= (v & ~(~0ULL << size)) << offset; \ +} + +/* Btree keys - all units are in sectors */ + +struct bkey { + __u64 high; + __u64 low; + __u64 ptr[]; +}; + +#define KEY_FIELD(name, field, offset, size) \ + BITMASK(name, struct bkey, field, offset, size) + +#define PTR_FIELD(name, offset, size) \ +static inline __u64 name(const struct bkey *k, unsigned i) \ +{ return (k->ptr[i] >> offset) & ~(~0ULL << size); } \ + \ +static inline void SET_##name(struct bkey *k, unsigned i, __u64 v) \ +{ \ + k->ptr[i] &= ~(~(~0ULL << size) << offset); \ + k->ptr[i] |= (v & ~(~0ULL << size)) << offset; \ +} + +#define KEY_SIZE_BITS 16 + +KEY_FIELD(KEY_PTRS, high, 60, 3) +KEY_FIELD(HEADER_SIZE, high, 58, 2) +KEY_FIELD(KEY_CSUM, high, 56, 2) +KEY_FIELD(KEY_PINNED, high, 55, 1) +KEY_FIELD(KEY_DIRTY, high, 36, 1) + +KEY_FIELD(KEY_SIZE, high, 20, KEY_SIZE_BITS) +KEY_FIELD(KEY_INODE, high, 0, 20) + +/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */ + +static inline __u64 KEY_OFFSET(const struct bkey *k) +{ + return k->low; +} + +static inline void SET_KEY_OFFSET(struct bkey *k, __u64 v) +{ + k->low = v; +} + +/* + * The high bit being set is a relic from when we used it to do binary + * searches - it told you where a key started. It's not used anymore, + * and can probably be safely dropped.
+ */ +#define KEY(inode, offset, size) \ +((struct bkey) { \ + .high = (1ULL << 63) | ((__u64) (size) << 20) | (inode), \ + .low = (offset) \ +}) + +#define ZERO_KEY KEY(0, 0, 0) + +#define MAX_KEY_INODE (~(~0 << 20)) +#define MAX_KEY_OFFSET (~0ULL >> 1) +#define MAX_KEY KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0) + +#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) +#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) + +#define PTR_DEV_BITS 12 + +PTR_FIELD(PTR_DEV, 51, PTR_DEV_BITS) +PTR_FIELD(PTR_OFFSET, 8, 43) +PTR_FIELD(PTR_GEN, 0, 8) + +#define PTR_CHECK_DEV ((1 << PTR_DEV_BITS) - 1) + +#define PTR(gen, offset, dev) \ + ((((__u64) dev) << 51) | ((__u64) offset) << 8 | gen) + +/* Bkey utility code */ + +static inline unsigned long bkey_u64s(const struct bkey *k) +{ + return (sizeof(struct bkey) / sizeof(__u64)) + KEY_PTRS(k); +} + +static inline unsigned long bkey_bytes(const struct bkey *k) +{ + return bkey_u64s(k) * sizeof(__u64); +} + +#define bkey_copy(_dest, _src) memcpy(_dest, _src, bkey_bytes(_src)) + +static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src) +{ + SET_KEY_INODE(dest, KEY_INODE(src)); + SET_KEY_OFFSET(dest, KEY_OFFSET(src)); +} + +static inline struct bkey *bkey_next(const struct bkey *k) +{ + __u64 *d = (void *) k; + return (struct bkey *) (d + bkey_u64s(k)); +} + +static inline struct bkey *bkey_last(const struct bkey *k, unsigned nr_keys) +{ + __u64 *d = (void *) k; + return (struct bkey *) (d + nr_keys); +} +/* Enough for a key with 6 pointers */ +#define BKEY_PAD 8 + +#define BKEY_PADDED(key) \ + union { struct bkey key; __u64 key ## _pad[BKEY_PAD]; } + +/* Superblock */ + +/* Version 0: Cache device + * Version 1: Backing device + * Version 2: Seed pointer into btree node checksum + * Version 3: Cache device with new UUID format + * Version 4: Backing device with data offset + */ +#define BCACHE_SB_VERSION_CDEV 0 +#define BCACHE_SB_VERSION_BDEV 1 +#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 +#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 +#define BCACHE_SB_MAX_VERSION 4 + +#define SB_SECTOR 8 +#define SB_SIZE 4096 +#define SB_LABEL_SIZE 32 +#define SB_JOURNAL_BUCKETS 256U +/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ +#define MAX_CACHES_PER_SET 8 + +#define BDEV_DATA_START_DEFAULT 16 /* sectors */ + +struct cache_sb { + __u64 csum; + __u64 offset; /* sector where this sb was written */ + __u64 version; + + __u8 magic[16]; + + __u8 uuid[16]; + union { + __u8 set_uuid[16]; + __u64 set_magic; + }; + __u8 label[SB_LABEL_SIZE]; + + __u64 flags; + __u64 seq; + __u64 pad[8]; + + union { + struct { + /* Cache devices */ + __u64 nbuckets; /* device size */ + + __u16 block_size; /* sectors */ + __u16 bucket_size; /* sectors */ + + __u16 nr_in_set; + __u16 nr_this_dev; + }; + struct { + /* Backing devices */ + __u64 data_offset; + + /* + * block_size from the cache device section is still used by + * backing devices, so don't add anything here until we fix + * things to not need it for backing devices anymore + */ + }; + }; + + __u32 last_mount; /* time_t */ + + __u16 first_bucket; + union { + __u16 njournal_buckets; + __u16 keys; + }; + __u64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ +}; + +static inline _Bool SB_IS_BDEV(const struct cache_sb *sb) +{ + return sb->version == BCACHE_SB_VERSION_BDEV + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; +} + +BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); +BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); +BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); +#define 
CACHE_REPLACEMENT_LRU 0U +#define CACHE_REPLACEMENT_FIFO 1U +#define CACHE_REPLACEMENT_RANDOM 2U + +BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); +#define CACHE_MODE_WRITETHROUGH 0U +#define CACHE_MODE_WRITEBACK 1U +#define CACHE_MODE_WRITEAROUND 2U +#define CACHE_MODE_NONE 3U +BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); +#define BDEV_STATE_NONE 0U +#define BDEV_STATE_CLEAN 1U +#define BDEV_STATE_DIRTY 2U +#define BDEV_STATE_STALE 3U + +/* + * Magic numbers + * + * The various other data structures have their own magic numbers, which are + * xored with the first part of the cache set's UUID + */ + +#define JSET_MAGIC 0x245235c1a3625032ULL +#define PSET_MAGIC 0x6750e15f87337f91ULL +#define BSET_MAGIC 0x90135c78b99e07f5ULL + +static inline __u64 jset_magic(struct cache_sb *sb) +{ + return sb->set_magic ^ JSET_MAGIC; +} + +static inline __u64 pset_magic(struct cache_sb *sb) +{ + return sb->set_magic ^ PSET_MAGIC; +} + +static inline __u64 bset_magic(struct cache_sb *sb) +{ + return sb->set_magic ^ BSET_MAGIC; +} + +/* + * Journal + * + * On disk format for a journal entry: + * seq is monotonically increasing; every journal entry has its own unique + * sequence number. + * + * last_seq is the oldest journal entry that still has keys the btree hasn't + * flushed to disk yet. + * + * version is for on disk format changes. + */ + +#define BCACHE_JSET_VERSION_UUIDv1 1 +#define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */ +#define BCACHE_JSET_VERSION 1 + +struct jset { + __u64 csum; + __u64 magic; + __u64 seq; + __u32 version; + __u32 keys; + + __u64 last_seq; + + BKEY_PADDED(uuid_bucket); + BKEY_PADDED(btree_root); + __u16 btree_level; + __u16 pad[3]; + + __u64 prio_bucket[MAX_CACHES_PER_SET]; + + union { + struct bkey start[0]; + __u64 d[0]; + }; +}; + +/* Bucket prios/gens */ + +struct prio_set { + __u64 csum; + __u64 magic; + __u64 seq; + __u32 version; + __u32 pad; + + __u64 next_bucket; + + struct bucket_disk { + __u16 prio; + __u8 gen; + } __attribute((packed)) data[]; +}; + +/* UUIDS - per backing device/flash only volume metadata */ + +struct uuid_entry { + union { + struct { + __u8 uuid[16]; + __u8 label[32]; + __u32 first_reg; + __u32 last_reg; + __u32 invalidated; + + __u32 flags; + /* Size of flash only volumes */ + __u64 sectors; + }; + + __u8 pad[128]; + }; +}; + +BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); + +/* Btree nodes */ + +/* Version 1: Seed pointer into btree node checksum + */ +#define BCACHE_BSET_CSUM 1 +#define BCACHE_BSET_VERSION 1 + +/* + * Btree nodes + * + * On disk a btree node is a list/log of these; within each set the keys are + * sorted + */ +struct bset { + __u64 csum; + __u64 magic; + __u64 seq; + __u32 version; + __u32 keys; + + union { + struct bkey start[0]; + __u64 d[0]; + }; +}; + +/* OBSOLETE */ + +/* UUIDS - per backing device/flash only volume metadata */ + +struct uuid_entry_v0 { + __u8 uuid[16]; + __u8 label[32]; + __u32 first_reg; + __u32 last_reg; + __u32 invalidated; + __u32 pad; +}; + +#endif /* _LINUX_BCACHE_H */ -- cgit v1.2.3 From e58ff155034791ed3a5563d24a50fae0a8c1617c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 18:14:44 -0700 Subject: bcache: Fix bch_ptr_bad() Previously, bch_ptr_bad() could return false when there was a pointer to a nonexistent device... it only filtered out keys with PTR_CHECK_DEV pointers.
This behaviour was intended for multiple cache device support; for that, just because the device for one of the pointers has gone away doesn't mean we want to filter out the rest of the pointers. But we don't yet explicitly filter/check individual pointers, so without that this behaviour was wrong - a corrupt bkey with a bad device pointer could cause us to deref a bad pointer. Doh. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bset.c | 67 ++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 34 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 7b8713c66050..f32216c75948 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -116,47 +116,46 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k) bch_ptr_invalid(b, k)) return true; - if (KEY_PTRS(k) && PTR_DEV(k, 0) == PTR_CHECK_DEV) - return true; + for (i = 0; i < KEY_PTRS(k); i++) { + if (!ptr_available(b->c, k, i)) + return true; - for (i = 0; i < KEY_PTRS(k); i++) - if (ptr_available(b->c, k, i)) { - g = PTR_BUCKET(b->c, k, i); - stale = ptr_stale(b->c, k, i); + g = PTR_BUCKET(b->c, k, i); + stale = ptr_stale(b->c, k, i); - btree_bug_on(stale > 96, b, - "key too stale: %i, need_gc %u", - stale, b->c->need_gc); + btree_bug_on(stale > 96, b, + "key too stale: %i, need_gc %u", + stale, b->c->need_gc); - btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), - b, "stale dirty pointer"); + btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), + b, "stale dirty pointer"); - if (stale) - return true; + if (stale) + return true; #ifdef CONFIG_BCACHE_EDEBUG - if (!mutex_trylock(&b->c->bucket_lock)) - continue; - - if (b->level) { - if (KEY_DIRTY(k) || - g->prio != BTREE_PRIO || - (b->c->gc_mark_valid && - GC_MARK(g) != GC_MARK_METADATA)) - goto bug; - - } else { - if (g->prio == BTREE_PRIO) - goto bug; - - if (KEY_DIRTY(k) && - b->c->gc_mark_valid && - GC_MARK(g) != GC_MARK_DIRTY) - goto bug; - } - mutex_unlock(&b->c->bucket_lock); -#endif + if (!mutex_trylock(&b->c->bucket_lock)) + continue; + + if (b->level) { + if (KEY_DIRTY(k) || + g->prio != BTREE_PRIO || + (b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_METADATA)) + goto bug; + + } else { + if (g->prio == BTREE_PRIO) + goto bug; + + if (KEY_DIRTY(k) && + b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_DIRTY) + goto bug; } + mutex_unlock(&b->c->bucket_lock); +#endif + } return false; #ifdef CONFIG_BCACHE_EDEBUG -- cgit v1.2.3 From 280481d06c8a683d9aaa26125476222e76b733c5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Oct 2013 16:36:03 -0700 Subject: bcache: Debug code improvements A couple of changes: * Consolidate bch_check_keys() and bch_check_key_order(), and move the checks that only check_key_order() could do to bch_btree_iter_next(). * Get rid of CONFIG_BCACHE_EDEBUG - now, all that code is compiled in when CONFIG_BCACHE_DEBUG is enabled, and there's now a sysfs file to flip on the EDEBUG checks at runtime (see the sketch below). * Dropped an old not terribly useful check in rw_unlock(), and refactored/improved some of the other debug code.
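The second change is the one that alters behaviour: the expensive checks are now always compiled in under CONFIG_BCACHE_DEBUG and gated by a per-cache-set flag that sysfs can flip at runtime, so a production build pays nothing until the flag is set. A condensed, self-contained sketch of that gating pattern (CONFIG_MY_DEBUG, my_cache_set and maybe_check_keys are illustrative stand-ins, not the actual bcache symbols):

	#include <stdio.h>

	struct my_cache_set {
		unsigned expensive_debug_checks:1;	/* toggled via a sysfs file */
	};

	#ifdef CONFIG_MY_DEBUG
	#define expensive_debug_checks(c)	((c)->expensive_debug_checks)
	#else
	#define expensive_debug_checks(c)	0	/* branch folds away entirely */
	#endif

	static void __check_keys(struct my_cache_set *c)
	{
		/* stands in for the expensive key-order verification */
		printf("running expensive checks for cache set %p\n", (void *) c);
	}

	/* Callers pay one well-predicted branch unless the flag is set: */
	#define maybe_check_keys(c)				\
	do {							\
		if (expensive_debug_checks(c))			\
			__check_keys(c);			\
	} while (0)

With the real equivalents of these macros (the debug.h hunk below), writing to the sysfs file turns the full key-ordering and accounting checks on and off without a rebuild.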
Signed-off-by: Kent Overstreet --- drivers/md/bcache/Kconfig | 11 +--- drivers/md/bcache/alloc.c | 5 +- drivers/md/bcache/bcache.h | 10 +--- drivers/md/bcache/bset.c | 112 ++++++++++++++++++++----------------- drivers/md/bcache/bset.h | 3 + drivers/md/bcache/btree.c | 8 ++- drivers/md/bcache/btree.h | 8 --- drivers/md/bcache/debug.c | 136 +++++++++++++++++++-------------------------- drivers/md/bcache/debug.h | 46 +++++++-------- drivers/md/bcache/sysfs.c | 5 ++ drivers/md/bcache/util.h | 4 +- 11 files changed, 162 insertions(+), 186 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index f950c9d29f3e..2638417b19aa 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -13,15 +13,8 @@ config BCACHE_DEBUG ---help--- Don't select this option unless you're a developer - Enables extra debugging tools (primarily a fuzz tester) - -config BCACHE_EDEBUG - bool "Extended runtime checks" - depends on BCACHE - ---help--- - Don't select this option unless you're a developer - - Enables extra runtime checks which significantly affect performance + Enables extra debugging tools, allows expensive runtime checks to be + turned on. config BCACHE_CLOSURES_DEBUG bool "Debug closures" diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 4970ddc6a7f6..ed5920b20c61 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -398,8 +398,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait) out: wake_up_process(ca->alloc_thread); -#ifdef CONFIG_BCACHE_EDEBUG - { + if (expensive_debug_checks(ca->set)) { size_t iter; long i; @@ -413,7 +412,7 @@ out: fifo_for_each(i, &ca->unused, iter) BUG_ON(i == r); } -#endif + b = ca->buckets + r; BUG_ON(atomic_read(&b->pin) != 1); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 045cb99f1ca6..d03bc6f66493 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -690,6 +690,7 @@ struct cache_set { unsigned short journal_delay_ms; unsigned verify:1; unsigned key_merging_disabled:1; + unsigned expensive_debug_checks:1; unsigned gc_always_rewrite:1; unsigned shrinker_disabled:1; unsigned copy_gc_enabled:1; @@ -698,15 +699,6 @@ struct cache_set { struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; }; -static inline bool key_merging_disabled(struct cache_set *c) -{ -#ifdef CONFIG_BCACHE_DEBUG - return c->key_merging_disabled; -#else - return 0; -#endif -} - struct bbio { unsigned submit_time_us; union { diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index f32216c75948..6bffde478926 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -106,6 +106,43 @@ bad: return true; } +static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k, + unsigned ptr) +{ + struct bucket *g = PTR_BUCKET(b->c, k, ptr); + char buf[80]; + + if (mutex_trylock(&b->c->bucket_lock)) { + if (b->level) { + if (KEY_DIRTY(k) || + g->prio != BTREE_PRIO || + (b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_METADATA)) + goto err; + + } else { + if (g->prio == BTREE_PRIO) + goto err; + + if (KEY_DIRTY(k) && + b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_DIRTY) + goto err; + } + mutex_unlock(&b->c->bucket_lock); + } + + return false; +err: + mutex_unlock(&b->c->bucket_lock); + bch_bkey_to_text(buf, sizeof(buf), k); + btree_bug(b, +"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", + buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), + g->prio, g->gen, 
g->last_gc, GC_MARK(g), g->gc_gen); + return true; +} + bool bch_ptr_bad(struct btree *b, const struct bkey *k) { struct bucket *g; @@ -133,46 +170,12 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k) if (stale) return true; -#ifdef CONFIG_BCACHE_EDEBUG - if (!mutex_trylock(&b->c->bucket_lock)) - continue; - - if (b->level) { - if (KEY_DIRTY(k) || - g->prio != BTREE_PRIO || - (b->c->gc_mark_valid && - GC_MARK(g) != GC_MARK_METADATA)) - goto bug; - - } else { - if (g->prio == BTREE_PRIO) - goto bug; - - if (KEY_DIRTY(k) && - b->c->gc_mark_valid && - GC_MARK(g) != GC_MARK_DIRTY) - goto bug; - } - mutex_unlock(&b->c->bucket_lock); -#endif + if (expensive_debug_checks(b->c) && + ptr_bad_expensive_checks(b, k, i)) + return true; } return false; -#ifdef CONFIG_BCACHE_EDEBUG -bug: - mutex_unlock(&b->c->bucket_lock); - - { - char buf[80]; - - bch_bkey_to_text(buf, sizeof(buf), k); - btree_bug(b, -"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", - buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), - g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); - } - return true; -#endif } /* Key/pointer manipulation */ @@ -821,16 +824,16 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, } else i = bset_search_write_set(b, t, search); -#ifdef CONFIG_BCACHE_EDEBUG - BUG_ON(bset_written(b, t) && - i.l != t->data->start && - bkey_cmp(tree_to_prev_bkey(t, - inorder_to_tree(bkey_to_cacheline(t, i.l), t)), - search) > 0); + if (expensive_debug_checks(b->c)) { + BUG_ON(bset_written(b, t) && + i.l != t->data->start && + bkey_cmp(tree_to_prev_bkey(t, + inorder_to_tree(bkey_to_cacheline(t, i.l), t)), + search) > 0); - BUG_ON(i.r != end(t->data) && - bkey_cmp(i.r, search) <= 0); -#endif + BUG_ON(i.r != end(t->data) && + bkey_cmp(i.r, search) <= 0); + } while (likely(i.l != i.r) && bkey_cmp(i.l, search) <= 0) @@ -871,12 +874,16 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, } struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, - struct bkey *search, struct bset_tree *start) + struct bkey *search, struct bset_tree *start) { struct bkey *ret = NULL; iter->size = ARRAY_SIZE(iter->data); iter->used = 0; +#ifdef CONFIG_BCACHE_DEBUG + iter->b = b; +#endif + for (; start <= &b->sets[b->nsets]; start++) { ret = bch_bset_search(b, start, search); bch_btree_iter_push(iter, ret, end(start->data)); @@ -891,6 +898,8 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter) struct bkey *ret = NULL; if (!btree_iter_end(iter)) { + bch_btree_iter_next_check(iter); + ret = iter->data->k; iter->data->k = bkey_next(iter->data->k); @@ -1002,7 +1011,6 @@ static void btree_mergesort(struct btree *b, struct bset *out, out->keys = last ? 
(uint64_t *) bkey_next(last) - out->d : 0; pr_debug("sorted %i keys", out->keys); - bch_check_key_order(b, out); } static void __btree_sort(struct btree *b, struct btree_iter *iter, @@ -1063,15 +1071,15 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, void bch_btree_sort_partial(struct btree *b, unsigned start) { - size_t oldsize = 0, order = b->page_order, keys = 0; + size_t order = b->page_order, keys = 0; struct btree_iter iter; + int oldsize = bch_count_data(b); + __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); BUG_ON(b->sets[b->nsets].data == write_block(b) && (b->sets[b->nsets].size || b->nsets)); - if (b->written) - oldsize = bch_count_data(b); if (start) { unsigned i; @@ -1087,7 +1095,7 @@ void bch_btree_sort_partial(struct btree *b, unsigned start) __btree_sort(b, &iter, start, order, false); - EBUG_ON(b->written && bch_count_data(b) != oldsize); + EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize); } void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 5cd90565dfe2..a043a92d4dc9 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -148,6 +148,9 @@ struct btree_iter { size_t size, used; +#ifdef CONFIG_BCACHE_DEBUG + struct btree *b; +#endif struct btree_iter_set { struct bkey *k, *end; } data[MAX_BSETS]; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index aba787d954e5..fa4d0b1f6d75 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -216,6 +216,10 @@ static void bch_btree_node_read_done(struct btree *b) iter->size = b->c->sb.bucket_size / b->c->sb.block_size; iter->used = 0; +#ifdef CONFIG_BCACHE_DEBUG + iter->b = b; +#endif + if (!i->seq) goto err; @@ -454,7 +458,7 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) BUG_ON(b->written >= btree_blocks(b)); BUG_ON(b->written && !i->keys); BUG_ON(b->sets->data->seq != i->seq); - bch_check_key_order(b, i); + bch_check_keys(b, "writing"); cancel_delayed_work(&b->work); @@ -1917,7 +1921,7 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, struct bkey *replace_key) { bool ret = false; - unsigned oldsize = bch_count_data(b); + int oldsize = bch_count_data(b); while (!bch_keylist_empty(insert_keys)) { struct bset *i = write_block(b); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 8fc1e8925399..27e90b189112 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -259,14 +259,6 @@ static inline void rw_lock(bool w, struct btree *b, int level) static inline void rw_unlock(bool w, struct btree *b) { -#ifdef CONFIG_BCACHE_EDEBUG - unsigned i; - - if (w && b->key.ptr[0]) - for (i = 0; i <= b->nsets; i++) - bch_check_key_order(b, b->sets[i].data); -#endif - if (w) b->seq++; (w ? up_write : up_read)(&b->lock); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index d9ccb3169aa2..e99e6b8852b2 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -76,29 +76,17 @@ int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) return out - buf; } -int bch_btree_to_text(char *buf, size_t size, const struct btree *b) -{ - return scnprintf(buf, size, "%zu level %i/%i", - PTR_BUCKET_NR(b->c, &b->key, 0), - b->level, b->c->root ? b->c->root->level : -1); -} - -#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) - -static bool skipped_backwards(struct btree *b, struct bkey *k) -{ - return bkey_cmp(k, (!b->level) - ? 
&START_KEY(bkey_next(k)) - : bkey_next(k)) > 0; -} +#ifdef CONFIG_BCACHE_DEBUG static void dump_bset(struct btree *b, struct bset *i) { - struct bkey *k; + struct bkey *k, *next; unsigned j; char buf[80]; - for (k = i->start; k < end(i); k = bkey_next(k)) { + for (k = i->start; k < end(i); k = next) { + next = bkey_next(k); + bch_bkey_to_text(buf, sizeof(buf), k); printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), (uint64_t *) k - i->d, i->keys, buf); @@ -114,15 +102,21 @@ static void dump_bset(struct btree *b, struct bset *i) printk(" %s\n", bch_ptr_status(b->c, k)); - if (bkey_next(k) < end(i) && - skipped_backwards(b, k)) + if (next < end(i) && + bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0) printk(KERN_ERR "Key skipped backwards\n"); } } -#endif +static void bch_dump_bucket(struct btree *b) +{ + unsigned i; -#ifdef CONFIG_BCACHE_DEBUG + console_lock(); + for (i = 0; i <= b->nsets; i++) + dump_bset(b, b->sets[i].data); + console_unlock(); +} void bch_btree_verify(struct btree *b, struct bset *new) { @@ -211,11 +205,7 @@ out_put: bio_put(check); } -#endif - -#ifdef CONFIG_BCACHE_EDEBUG - -unsigned bch_count_data(struct btree *b) +int __bch_count_data(struct btree *b) { unsigned ret = 0; struct btree_iter iter; @@ -227,72 +217,60 @@ unsigned bch_count_data(struct btree *b) return ret; } -static void vdump_bucket_and_panic(struct btree *b, const char *fmt, - va_list args) -{ - unsigned i; - char buf[80]; - - console_lock(); - - for (i = 0; i <= b->nsets; i++) - dump_bset(b, b->sets[i].data); - - vprintk(fmt, args); - - console_unlock(); - - bch_btree_to_text(buf, sizeof(buf), b); - panic("at %s\n", buf); -} - -void bch_check_key_order_msg(struct btree *b, struct bset *i, - const char *fmt, ...) -{ - struct bkey *k; - - if (!i->keys) - return; - - for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k)) - if (skipped_backwards(b, k)) { - va_list args; - va_start(args, fmt); - - vdump_bucket_and_panic(b, fmt, args); - va_end(args); - } -} - -void bch_check_keys(struct btree *b, const char *fmt, ...) +void __bch_check_keys(struct btree *b, const char *fmt, ...) { va_list args; struct bkey *k, *p = NULL; struct btree_iter iter; - - if (b->level) - return; + const char *err; for_each_key(b, k, &iter) { - if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) { - printk(KERN_ERR "Keys out of order:\n"); - goto bug; - } - - if (bch_ptr_invalid(b, k)) - continue; - - if (p && bkey_cmp(p, &START_KEY(k)) > 0) { - printk(KERN_ERR "Overlapping keys:\n"); - goto bug; + if (!b->level) { + err = "Keys out of order"; + if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) + goto bug; + + if (bch_ptr_invalid(b, k)) + continue; + + err = "Overlapping keys"; + if (p && bkey_cmp(p, &START_KEY(k)) > 0) + goto bug; + } else { + if (bch_ptr_bad(b, k)) + continue; + + err = "Duplicate keys"; + if (p && !bkey_cmp(p, k)) + goto bug; } p = k; } + + err = "Key larger than btree node key"; + if (p && bkey_cmp(p, &b->key) > 0) + goto bug; + return; bug: + bch_dump_bucket(b); + va_start(args, fmt); - vdump_bucket_and_panic(b, fmt, args); + vprintk(fmt, args); va_end(args); + + panic("bcache error: %s:\n", err); +} + +void bch_btree_iter_next_check(struct btree_iter *iter) +{ + struct bkey *k = iter->data->k, *next = bkey_next(k); + + if (next < iter->data->end && + bkey_cmp(k, iter->b->level ? 
next : &START_KEY(next)) > 0) { + bch_dump_bucket(iter->b); + panic("Key skipped backwards\n"); + } } #endif diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index 0f4b3440512c..7914ba0ff316 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -4,40 +4,42 @@ /* Btree/bkey debug printing */ int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); -int bch_btree_to_text(char *buf, size_t size, const struct btree *b); - -#ifdef CONFIG_BCACHE_EDEBUG - -unsigned bch_count_data(struct btree *); -void bch_check_key_order_msg(struct btree *, struct bset *, const char *, ...); -void bch_check_keys(struct btree *, const char *, ...); - -#define bch_check_key_order(b, i) \ - bch_check_key_order_msg(b, i, "keys out of order") -#define EBUG_ON(cond) BUG_ON(cond) - -#else /* EDEBUG */ - -#define bch_count_data(b) 0 -#define bch_check_key_order(b, i) do {} while (0) -#define bch_check_key_order_msg(b, i, ...) do {} while (0) -#define bch_check_keys(b, ...) do {} while (0) -#define EBUG_ON(cond) do {} while (0) - -#endif #ifdef CONFIG_BCACHE_DEBUG void bch_btree_verify(struct btree *, struct bset *); void bch_data_verify(struct cached_dev *, struct bio *); +int __bch_count_data(struct btree *); +void __bch_check_keys(struct btree *, const char *, ...); +void bch_btree_iter_next_check(struct btree_iter *); + +#define EBUG_ON(cond) BUG_ON(cond) +#define expensive_debug_checks(c) ((c)->expensive_debug_checks) +#define key_merging_disabled(c) ((c)->key_merging_disabled) #else /* DEBUG */ static inline void bch_btree_verify(struct btree *b, struct bset *i) {} -static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}; +static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} +static inline int __bch_count_data(struct btree *b) { return -1; } +static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {} +static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} + +#define EBUG_ON(cond) do { if (cond); } while (0) +#define expensive_debug_checks(c) 0 +#define key_merging_disabled(c) 0 #endif +#define bch_count_data(b) \ + (expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1) + +#define bch_check_keys(b, ...) 
\ +do { \ + if (expensive_debug_checks((b)->c)) \ + __bch_check_keys(b, __VA_ARGS__); \ +} while (0) + #ifdef CONFIG_DEBUG_FS void bch_debug_init_cache_set(struct cache_set *); #else diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index ab286b9b5e40..9687771ec6f3 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -102,6 +102,7 @@ rw_attribute(io_error_halflife); rw_attribute(verify); rw_attribute(key_merging_disabled); rw_attribute(gc_always_rewrite); +rw_attribute(expensive_debug_checks); rw_attribute(freelist_percent); rw_attribute(cache_replacement_policy); rw_attribute(btree_shrinker_disabled); @@ -517,6 +518,8 @@ lock_root: sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); sysfs_printf(verify, "%i", c->verify); sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); + sysfs_printf(expensive_debug_checks, + "%i", c->expensive_debug_checks); sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); @@ -599,6 +602,7 @@ STORE(__bch_cache_set) sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); sysfs_strtoul(verify, c->verify); sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); + sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks); sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); @@ -674,6 +678,7 @@ static struct attribute *bch_cache_set_internal_files[] = { #ifdef CONFIG_BCACHE_DEBUG &sysfs_verify, &sysfs_key_merging_disabled, + &sysfs_expensive_debug_checks, #endif &sysfs_gc_always_rewrite, &sysfs_btree_shrinker_disabled, diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 38ae7a4ce928..8ce5aab55962 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -15,12 +15,12 @@ struct closure; -#ifdef CONFIG_BCACHE_EDEBUG +#ifdef CONFIG_BCACHE_DEBUG #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) -#else /* EDEBUG */ +#else /* DEBUG */ #define atomic_dec_bug(v) atomic_dec(v) #define atomic_inc_bug(v, i) atomic_inc(v) -- cgit v1.2.3 From 3a3b6a4e075188342b58d4b6560f5540af64cac0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 16:46:42 -0700 Subject: bcache: Don't bother with bucket refcount for btree node allocations The bucket refcount (dropped with bkey_put()) is only needed to prevent the newly allocated bucket from being garbage collected until we've added a pointer to it somewhere. But for btree node allocations, the fact that we have btree nodes locked is enough to guard against races with garbage collection. Eventually the per bucket refcount is going to be replaced with something specific to bch_alloc_sectors(). 
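The refcount being dropped here is the per-bucket pin (the diff below touches it as PTR_BUCKET(c, k, i)->pin): allocation takes a reference so that garbage collection cannot reclaim a freshly allocated bucket before a key pointing into it lands in the btree, and bkey_put() releases it once the pointer is durable. A minimal sketch of that idea, using illustrative names rather than bcache's actual structures:

	#include <stdatomic.h>

	/* Garbage collection treats any bucket with a nonzero pin as off limits. */
	struct bucket {
		atomic_int pin;
	};

	static void bucket_pin(struct bucket *g)
	{
		atomic_fetch_add_explicit(&g->pin, 1, memory_order_relaxed);
	}

	/* Released once a key referencing the bucket has been inserted - or,
	 * as this patch argues, never taken at all for btree node buckets,
	 * where holding the node locks already excludes GC. */
	static void bucket_unpin(struct bucket *g)
	{
		atomic_fetch_sub_explicit(&g->pin, 1, memory_order_relaxed);
	}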
Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 4 ++-- drivers/md/bcache/btree.c | 28 +++++----------------------- drivers/md/bcache/btree.h | 2 +- drivers/md/bcache/super.c | 2 +- 4 files changed, 9 insertions(+), 27 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index ed5920b20c61..2b46bf1d7e40 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -472,7 +472,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, return 0; err: bch_bucket_free(c, k); - __bkey_put(c, k); + bkey_put(c, k); return -1; } @@ -588,7 +588,7 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, * didn't use it, drop the refcount bch_bucket_alloc_set() took: */ if (KEY_PTRS(&alloc.key)) - __bkey_put(c, &alloc.key); + bkey_put(c, &alloc.key); for (i = 0; i < KEY_PTRS(&b->key); i++) EBUG_ON(ptr_stale(c, &b->key, i)); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index fa4d0b1f6d75..7dff73ba1b71 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -180,7 +180,7 @@ static inline bool should_split(struct btree *b) /* Btree key manipulation */ -void __bkey_put(struct cache_set *c, struct bkey *k) +void bkey_put(struct cache_set *c, struct bkey *k) { unsigned i; @@ -189,12 +189,6 @@ void __bkey_put(struct cache_set *c, struct bkey *k) atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); } -static void bkey_put(struct cache_set *c, struct bkey *k, int level) -{ - if ((level && KEY_OFFSET(k)) || !level) - __bkey_put(c, k); -} - /* Btree IO */ static uint64_t btree_csum_set(struct btree *b, struct bset *i) @@ -1068,6 +1062,7 @@ retry: if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) goto err; + bkey_put(c, &k.key); SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); b = mca_alloc(c, &k.key, level); @@ -1077,7 +1072,6 @@ retry: if (!b) { cache_bug(c, "Tried to allocate bucket that was in btree cache"); - __bkey_put(c, &k.key); goto retry; } @@ -1090,7 +1084,6 @@ retry: return b; err_free: bch_bucket_free(c, &k.key); - __bkey_put(c, &k.key); err: mutex_unlock(&c->bucket_lock); @@ -1217,7 +1210,6 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k) if (!IS_ERR_OR_NULL(n)) { swap(b, n); - __bkey_put(b->c, &b->key); memcpy(k->ptr, b->key.ptr, sizeof(uint64_t) * KEY_PTRS(&b->key)); @@ -1932,19 +1924,12 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, break; if (bkey_cmp(k, &b->key) <= 0) { - bkey_put(b->c, k, b->level); + if (!b->level) + bkey_put(b->c, k); ret |= btree_insert_key(b, op, k, replace_key); bch_keylist_pop_front(insert_keys); } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { -#if 0 - if (replace_key) { - bkey_put(b->c, k, b->level); - bch_keylist_pop_front(insert_keys); - op->insert_collision = true; - break; - } -#endif BKEY_PADDED(key) temp; bkey_copy(&temp.key, insert_keys->keys); @@ -2071,11 +2056,9 @@ static int btree_split(struct btree *b, struct btree_op *op, return 0; err_free2: - __bkey_put(n2->c, &n2->key); btree_node_free(n2); rw_unlock(true, n2); err_free1: - __bkey_put(n1->c, &n1->key); btree_node_free(n1); rw_unlock(true, n1); err: @@ -2225,7 +2208,7 @@ int bch_btree_insert(struct cache_set *c, struct keylist *keys, pr_err("error %i", ret); while ((k = bch_keylist_pop(keys))) - bkey_put(c, k, 0); + bkey_put(c, k); } else if (op.op.insert_collision) ret = -ESRCH; @@ -2251,7 +2234,6 @@ void bch_btree_set_root(struct btree *b) mutex_unlock(&b->c->bucket_lock); b->c->root = 
b; - __bkey_put(b->c, &b->key); bch_journal_meta(b->c, &cl); closure_sync(&cl); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 27e90b189112..d4b705eeec24 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -216,7 +216,7 @@ static inline struct bkey *bch_btree_iter_init(struct btree *b, return __bch_btree_iter_init(b, iter, search, b->sets); } -void __bkey_put(struct cache_set *c, struct bkey *k); +void bkey_put(struct cache_set *c, struct bkey *k); /* Looping macros */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c67d19a8913d..05f8ccb9f8ca 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -426,7 +426,7 @@ static int __uuid_write(struct cache_set *c) closure_sync(&cl); bkey_copy(&c->uuid_bucket, &k.key); - __bkey_put(c, &k.key); + bkey_put(c, &k.key); return 0; } -- cgit v1.2.3 From d5cc66e95744065f96024add4bf7d7e019be54ac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 23:06:40 -0700 Subject: bcache: bch_(btree|extent)_ptr_invalid() Trying to treat btree pointers and leaf node pointers the same way was a mistake - going to start being more explicit about the type of key/pointer we're dealing with. This is the first part of that refactoring; this patch shouldn't change any actual behaviour. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bset.c | 49 ++++++++++++++++++++++++++++++++++------------- drivers/md/bcache/bset.h | 4 +++- drivers/md/bcache/btree.h | 13 ++++++++----- drivers/md/bcache/super.c | 4 ++-- 4 files changed, 49 insertions(+), 21 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 6bffde478926..b615348c45fc 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -73,19 +73,9 @@ void bch_keylist_pop_front(struct keylist *l) /* Pointer validation */ -bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) +static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) { unsigned i; - char buf[80]; - - if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))) - goto bad; - - if (!level && KEY_SIZE(k) > KEY_OFFSET(k)) - goto bad; - - if (!KEY_SIZE(k)) - return true; for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i)) { @@ -96,13 +86,46 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) if (KEY_SIZE(k) + r > c->sb.bucket_size || bucket < ca->sb.first_bucket || bucket >= ca->sb.nbuckets) - goto bad; + return true; } + return false; +} + +bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k) +{ + char buf[80]; + + if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)) + goto bad; + + if (__ptr_invalid(c, k)) + goto bad; + + return false; +bad: + bch_bkey_to_text(buf, sizeof(buf), k); + cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k)); + return true; +} + +bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k) +{ + char buf[80]; + + if (!KEY_SIZE(k)) + return true; + + if (KEY_SIZE(k) > KEY_OFFSET(k)) + goto bad; + + if (__ptr_invalid(c, k)) + goto bad; + return false; bad: bch_bkey_to_text(buf, sizeof(buf), k); - cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k)); + cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k)); return true; } diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index a043a92d4dc9..e67386001814 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -277,7 +277,9 @@ static inline bool 
bch_cut_back(const struct bkey *where, struct bkey *k) } const char *bch_ptr_status(struct cache_set *, const struct bkey *); -bool __bch_ptr_invalid(struct cache_set *, int level, const struct bkey *); +bool bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); +bool bch_extent_ptr_invalid(struct cache_set *, const struct bkey *); + bool bch_ptr_bad(struct btree *, const struct bkey *); static inline uint8_t gen_after(uint8_t a, uint8_t b) diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index d4b705eeec24..e11bb8571d24 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -204,11 +204,6 @@ static inline void set_gc_sectors(struct cache_set *c) atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8); } -static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) -{ - return __bch_ptr_invalid(b->c, b->level, k); -} - static inline struct bkey *bch_btree_iter_init(struct btree *b, struct btree_iter *iter, struct bkey *search) @@ -216,6 +211,14 @@ static inline struct bkey *bch_btree_iter_init(struct btree *b, return __bch_btree_iter_init(b, iter, search, b->sets); } +static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) +{ + if (b->level) + return bch_btree_ptr_invalid(b->c, k); + else + return bch_extent_ptr_invalid(b->c, k); +} + void bkey_put(struct cache_set *c, struct bkey *k); /* Looping macros */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 05f8ccb9f8ca..7ab926d94d81 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -373,7 +373,7 @@ static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) { struct bkey *k = &j->uuid_bucket; - if (__bch_ptr_invalid(c, 1, k)) + if (bch_btree_ptr_invalid(c, k)) return "bad uuid pointer"; bkey_copy(&c->uuid_bucket, k); @@ -1522,7 +1522,7 @@ static void run_cache_set(struct cache_set *c) k = &j->btree_root; err = "bad btree root"; - if (__bch_ptr_invalid(c, j->btree_level + 1, k)) + if (bch_btree_ptr_invalid(c, k)) goto err; err = "error reading btree root"; -- cgit v1.2.3 From 0eacac22034ca21c73fe49e800d0b938b2047250 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 1 Jul 2013 19:29:05 -0700 Subject: bcache: PRECEDING_KEY() btree_insert_key() was open coding this; this is just refactoring. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bset.h | 18 ++++++++++++++++++ drivers/md/bcache/btree.c | 9 ++------- 2 files changed, 20 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index e67386001814..bc9975526aa1 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -353,12 +353,30 @@ void bch_bset_fix_lookup_table(struct btree *, struct bkey *); struct bkey *__bch_bset_search(struct btree *, struct bset_tree *, const struct bkey *); +/* + * Returns the first key that is strictly greater than search + */ static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t, const struct bkey *search) { return search ?
__bch_bset_search(b, t, search) : t->data->start; } +#define PRECEDING_KEY(_k) \ +({ \ + struct bkey *_ret = NULL; \ + \ + if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ + _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ + \ + if (!_ret->low) \ + _ret->high--; \ + _ret->low--; \ + } \ + \ + _ret; \ +}) + bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *); void bch_btree_sort_lazy(struct btree *); void bch_btree_sort_into(struct btree *, struct btree *); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 7dff73ba1b71..773b0e929ff4 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1844,19 +1844,14 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, if (!b->level) { struct btree_iter iter; - struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0); /* * bset_search() returns the first key that is strictly greater * than the search key - but for back merging, we want to find - * the first key that is greater than or equal to KEY_START(k) - - * unless KEY_START(k) is 0. + * the previous key. */ - if (KEY_OFFSET(&search)) - SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1); - prev = NULL; - m = bch_btree_iter_init(b, &iter, &search); + m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k))); if (fix_overlapping_extents(b, k, &iter, replace_key)) { op->insert_collision = true; -- cgit v1.2.3 From f269af5a078302712de8ee70d273eba2eb4485ca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 23 Jul 2013 20:48:29 -0700 Subject: bcache: Add btree_node_write_sync() More refactoring - mostly making the interfaces more explicit about what we actually want to do. Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 773b0e929ff4..7d6204c41840 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -474,6 +474,15 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) bch_bset_init_next(b); } +static void bch_btree_node_write_sync(struct btree *b) +{ + struct closure cl; + + closure_init_stack(&cl); + bch_btree_node_write(b, &cl); + closure_sync(&cl); +} + static void btree_node_write_work(struct work_struct *w) { struct btree *b = container_of(to_delayed_work(w), struct btree, work); @@ -655,10 +664,8 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush) return -ENOMEM; } - if (btree_node_dirty(b)) { - bch_btree_node_write(b, &cl); - closure_sync(&cl); - } + if (btree_node_dirty(b)) + bch_btree_node_write_sync(b); /* wait for any in flight btree write */ closure_wait_event(&b->io.wait, &cl, @@ -1411,9 +1418,6 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, struct btree *n = NULL; unsigned keys = 0; int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); - struct closure cl; - - closure_init_stack(&cl); if (b->level || stale > 10) n = btree_node_alloc_replacement(b); @@ -1424,12 +1428,10 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, if (b->level) ret = btree_gc_recurse(b, op, writes, gc); - if (!b->written || btree_node_dirty(b)) { - bch_btree_node_write(b, n ? 
&cl : NULL); - } + if (!b->written || btree_node_dirty(b)) + bch_btree_node_write_sync(b); if (!IS_ERR_OR_NULL(n)) { - closure_sync(&cl); bch_btree_set_root(b); btree_node_free(n); rw_unlock(true, b); @@ -2104,15 +2106,10 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { - if (!b->level) { + if (!b->level) bch_btree_leaf_dirty(b, journal_ref); - } else { - struct closure cl; - - closure_init_stack(&cl); - bch_btree_node_write(b, &cl); - closure_sync(&cl); - } + else + bch_btree_node_write_sync(b); } } } while (!bch_keylist_empty(&split_keys)); -- cgit v1.2.3 From 8835c1234dd9a838993a2d5cb7572f57992ebbee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 23:18:05 -0700 Subject: bcache: Add make_btree_freeing_key() Refactoring, prep work for incremental garbage collection. Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 7d6204c41840..a3f8ca4ee6e0 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1107,6 +1107,22 @@ static struct btree *btree_node_alloc_replacement(struct btree *b) return n; } +static void make_btree_freeing_key(struct btree *b, struct bkey *k) +{ + unsigned i; + + bkey_copy(k, &b->key); + bkey_copy_key(k, &ZERO_KEY); + + for (i = 0; i < KEY_PTRS(k); i++) { + uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1; + + SET_PTR_GEN(k, i, g); + } + + atomic_inc(&b->c->prio_blocked); +} + /* Garbage collection */ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) @@ -2030,20 +2046,9 @@ static int btree_split(struct btree *b, struct btree_op *op, closure_sync(&cl); bch_btree_set_root(n1); } else { - unsigned i; - - bkey_copy(parent_keys->top, &b->key); - bkey_copy_key(parent_keys->top, &ZERO_KEY); - - for (i = 0; i < KEY_PTRS(&b->key); i++) { - uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1; - - SET_PTR_GEN(parent_keys->top, i, g); - } - - bch_keylist_push(parent_keys); closure_sync(&cl); - atomic_inc(&b->c->prio_blocked); + make_btree_freeing_key(b, parent_keys->top); + bch_keylist_push(parent_keys); } rw_unlock(true, n1); -- cgit v1.2.3 From a1f0358b2bf69be216cb6e4ea40fe7ae4d38b8a6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 19:07:00 -0700 Subject: bcache: Incremental gc Big garbage collection rewrite; now, garbage collection uses the same mechanisms as used elsewhere for inserting/updating btree node pointers, instead of rewriting interior btree nodes in place. This makes the code significantly cleaner and less fragile, and means we can now make garbage collection incremental - it doesn't have to hold a write lock on the root of the btree for the entire duration of garbage collection. This means that there's less of a latency hit for doing garbage collection, which means we can gc more frequently (and do a better job of reclaiming from the cache), and we can coalesce across more btree nodes (improving our space efficiency). 
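"Incremental" here means a pass may stop partway: the walk records its position in c->gc_done, returns -EINTR when it has invalidated its iterator (or -EAGAIN when it needs to reschedule), and the outer loop in bch_btree_gc() simply retries until a pass runs to completion. A self-contained sketch of that resumable-cursor shape, with illustrative types rather than bcache's actual structures:

	#include <errno.h>
	#include <stddef.h>

	struct gc_state {
		size_t cursor;			/* plays the role of c->gc_done */
	};

	/* Do at most 'budget' buckets of work, then yield with the cursor intact. */
	static int gc_pass(size_t nbuckets, struct gc_state *s, size_t budget)
	{
		while (s->cursor < nbuckets) {
			if (budget-- == 0)
				return -EINTR;	/* caller retries; cursor persists */
			/* mark/sweep work for bucket s->cursor would go here */
			s->cursor++;
		}
		return 0;			/* pass complete */
	}

	static void gc_run(size_t nbuckets)
	{
		struct gc_state s = { 0 };

		/* Mirrors the do { ... } while (ret) retry loop in the diff below. */
		while (gc_pass(nbuckets, &s, 64))
			;	/* bcache drops locks and may reschedule here */
	}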
Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 1 - drivers/md/bcache/btree.c | 388 ++++++++++++++++++++++++++------------------- drivers/md/bcache/btree.h | 2 +- drivers/md/bcache/sysfs.c | 2 - 4 files changed, 226 insertions(+), 167 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d03bc6f66493..d6970a651e42 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -477,7 +477,6 @@ struct gc_stat { size_t nkeys; uint64_t data; /* sectors */ - uint64_t dirty; /* sectors */ unsigned in_use; /* percent */ }; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index a3f8ca4ee6e0..7d283d217438 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1176,12 +1176,10 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) -static int btree_gc_mark_node(struct btree *b, unsigned *keys, - struct gc_stat *gc) +static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) { uint8_t stale = 0; - unsigned last_dev = -1; - struct bcache_device *d = NULL; + unsigned keys = 0, good_keys = 0; struct bkey *k; struct btree_iter iter; struct bset_tree *t; @@ -1189,27 +1187,17 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys, gc->nodes++; for_each_key_filter(b, k, &iter, bch_ptr_invalid) { - if (last_dev != KEY_INODE(k)) { - last_dev = KEY_INODE(k); - - d = KEY_INODE(k) < b->c->nr_uuids - ? b->c->devices[last_dev] - : NULL; - } - stale = max(stale, btree_mark_key(b, k)); + keys++; if (bch_ptr_bad(b, k)) continue; - *keys += bkey_u64s(k); - gc->key_bytes += bkey_u64s(k); gc->nkeys++; + good_keys++; gc->data += KEY_SIZE(k); - if (KEY_DIRTY(k)) - gc->dirty += KEY_SIZE(k); } for (t = b->sets; t <= &b->sets[b->nsets]; t++) @@ -1218,79 +1206,74 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys, bkey_cmp(&b->key, &t->end) < 0, b, "found short btree key in gc"); - return stale; -} + if (b->c->gc_always_rewrite) + return true; -static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k) -{ - /* - * We block priorities from being written for the duration of garbage - * collection, so we can't sleep in btree_alloc() -> - * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it - * our closure. - */ - struct btree *n = btree_node_alloc_replacement(b); - - if (!IS_ERR_OR_NULL(n)) { - swap(b, n); + if (stale > 10) + return true; - memcpy(k->ptr, b->key.ptr, - sizeof(uint64_t) * KEY_PTRS(&b->key)); - - btree_node_free(n); - up_write(&n->lock); - } + if ((keys - good_keys) * 2 > keys) + return true; - return b; + return false; } -/* - * Leaving this at 2 until we've got incremental garbage collection done; it - * could be higher (and has been tested with 4) except that garbage collection - * could take much longer, adversely affecting latency. 
- */ -#define GC_MERGE_NODES 2U +#define GC_MERGE_NODES 4U struct gc_merge_info { struct btree *b; - struct bkey *k; unsigned keys; }; -static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc, - struct gc_merge_info *r) +static int bch_btree_insert_node(struct btree *, struct btree_op *, + struct keylist *, atomic_t *, struct bkey *); + +static int btree_gc_coalesce(struct btree *b, struct btree_op *op, + struct keylist *keylist, struct gc_stat *gc, + struct gc_merge_info *r) { - unsigned nodes = 0, keys = 0, blocks; - int i; + unsigned i, nodes = 0, keys = 0, blocks; + struct btree *new_nodes[GC_MERGE_NODES]; struct closure cl; + struct bkey *k; + memset(new_nodes, 0, sizeof(new_nodes)); closure_init_stack(&cl); - while (nodes < GC_MERGE_NODES && r[nodes].b) + while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) keys += r[nodes++].keys; blocks = btree_default_blocks(b->c) * 2 / 3; if (nodes < 2 || __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) - return; - - for (i = nodes - 1; i >= 0; --i) { - if (r[i].b->written) - r[i].b = btree_gc_alloc(r[i].b, r[i].k); + return 0; - if (r[i].b->written) - return; + for (i = 0; i < nodes; i++) { + new_nodes[i] = btree_node_alloc_replacement(r[i].b); + if (IS_ERR_OR_NULL(new_nodes[i])) + goto out_nocoalesce; } for (i = nodes - 1; i > 0; --i) { - struct bset *n1 = r[i].b->sets->data; - struct bset *n2 = r[i - 1].b->sets->data; + struct bset *n1 = new_nodes[i]->sets->data; + struct bset *n2 = new_nodes[i - 1]->sets->data; struct bkey *k, *last = NULL; keys = 0; - if (i == 1) { + if (i > 1) { + for (k = n2->start; + k < end(n2); + k = bkey_next(k)) { + if (__set_blocks(n1, n1->keys + keys + + bkey_u64s(k), b->c) > blocks) + break; + + last = k; + keys += bkey_u64s(k); + } + } else { /* * Last node we're not getting rid of - we're getting * rid of the node at r[0]. 
Have to try and fit all of @@ -1299,37 +1282,27 @@ static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc, * length keys (shouldn't be possible in practice, * though) */ - if (__set_blocks(n1, n1->keys + r->keys, - b->c) > btree_blocks(r[i].b)) - return; + if (__set_blocks(n1, n1->keys + n2->keys, + b->c) > btree_blocks(new_nodes[i])) + goto out_nocoalesce; keys = n2->keys; + /* Take the key of the node we're getting rid of */ last = &r->b->key; - } else - for (k = n2->start; - k < end(n2); - k = bkey_next(k)) { - if (__set_blocks(n1, n1->keys + keys + - bkey_u64s(k), b->c) > blocks) - break; - - last = k; - keys += bkey_u64s(k); - } + } BUG_ON(__set_blocks(n1, n1->keys + keys, - b->c) > btree_blocks(r[i].b)); + b->c) > btree_blocks(new_nodes[i])); - if (last) { - bkey_copy_key(&r[i].b->key, last); - bkey_copy_key(r[i].k, last); - } + if (last) + bkey_copy_key(&new_nodes[i]->key, last); memcpy(end(n1), n2->start, (void *) node(n2, keys) - (void *) n2->start); n1->keys += keys; + r[i].keys = n1->keys; memmove(n2->start, node(n2, keys), @@ -1337,93 +1310,175 @@ static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc, n2->keys -= keys; - r[i].keys = n1->keys; - r[i - 1].keys = n2->keys; + if (bch_keylist_realloc(keylist, + KEY_PTRS(&new_nodes[i]->key), b->c)) + goto out_nocoalesce; + + bch_btree_node_write(new_nodes[i], &cl); + bch_keylist_add(keylist, &new_nodes[i]->key); } - btree_node_free(r->b); - up_write(&r->b->lock); + for (i = 0; i < nodes; i++) { + if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c)) + goto out_nocoalesce; - trace_bcache_btree_gc_coalesce(nodes); + make_btree_freeing_key(r[i].b, keylist->top); + bch_keylist_push(keylist); + } + + /* We emptied out this node */ + BUG_ON(new_nodes[0]->sets->data->keys); + btree_node_free(new_nodes[0]); + rw_unlock(true, new_nodes[0]); + closure_sync(&cl); + + for (i = 0; i < nodes; i++) { + btree_node_free(r[i].b); + rw_unlock(true, r[i].b); + + r[i].b = new_nodes[i]; + } + + bch_btree_insert_node(b, op, keylist, NULL, NULL); + BUG_ON(!bch_keylist_empty(keylist)); + + memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); + r[nodes - 1].b = ERR_PTR(-EINTR); + + trace_bcache_btree_gc_coalesce(nodes); gc->nodes--; - nodes--; - memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); - memset(&r[nodes], 0, sizeof(struct gc_merge_info)); + /* Invalidated our iterator */ + return -EINTR; + +out_nocoalesce: + closure_sync(&cl); + + while ((k = bch_keylist_pop(keylist))) + if (!bkey_cmp(k, &ZERO_KEY)) + atomic_dec(&b->c->prio_blocked); + + for (i = 0; i < nodes; i++) + if (!IS_ERR_OR_NULL(new_nodes[i])) { + btree_node_free(new_nodes[i]); + rw_unlock(true, new_nodes[i]); + } + return 0; } -static int btree_gc_recurse(struct btree *b, struct btree_op *op, - struct closure *writes, struct gc_stat *gc) +static unsigned btree_gc_count_keys(struct btree *b) { - void write(struct btree *r) - { - if (!r->written || btree_node_dirty(r)) - bch_btree_node_write(r, writes); + struct bkey *k; + struct btree_iter iter; + unsigned ret = 0; - up_write(&r->lock); - } + for_each_key_filter(b, k, &iter, bch_ptr_bad) + ret += bkey_u64s(k); - int ret = 0, stale; + return ret; +} + +static int btree_gc_recurse(struct btree *b, struct btree_op *op, + struct closure *writes, struct gc_stat *gc) +{ unsigned i; + int ret = 0; + bool should_rewrite; + struct btree *n; + struct bkey *k; + struct keylist keys; + struct btree_iter iter; struct gc_merge_info r[GC_MERGE_NODES]; + struct gc_merge_info *last = r + GC_MERGE_NODES - 1; - memset(r, 0, 
sizeof(r)); + bch_keylist_init(&keys); + bch_btree_iter_init(b, &iter, &b->c->gc_done); - while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { - r->b = bch_btree_node_get(b->c, r->k, b->level - 1, true); + for (i = 0; i < GC_MERGE_NODES; i++) + r[i].b = ERR_PTR(-EINTR); - if (IS_ERR(r->b)) { - ret = PTR_ERR(r->b); - break; + while (1) { + k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); + if (k) { + r->b = bch_btree_node_get(b->c, k, b->level - 1, true); + if (IS_ERR(r->b)) { + ret = PTR_ERR(r->b); + break; + } + + r->keys = btree_gc_count_keys(r->b); + + ret = btree_gc_coalesce(b, op, &keys, gc, r); + if (ret) + break; } - r->keys = 0; - stale = btree_gc_mark_node(r->b, &r->keys, gc); + if (!last->b) + break; - if (!b->written && - (r->b->level || stale > 10 || - b->c->gc_always_rewrite)) - r->b = btree_gc_alloc(r->b, r->k); + if (!IS_ERR(last->b)) { + should_rewrite = btree_gc_mark_node(last->b, gc); + if (should_rewrite) { + n = btree_node_alloc_replacement(last->b); - if (r->b->level) - ret = btree_gc_recurse(r->b, op, writes, gc); + if (!IS_ERR_OR_NULL(n)) { + bch_btree_node_write_sync(n); + bch_keylist_add(&keys, &n->key); - if (ret) { - write(r->b); - break; - } + make_btree_freeing_key(last->b, + keys.top); + bch_keylist_push(&keys); + + btree_node_free(last->b); - bkey_copy_key(&b->c->gc_done, r->k); + bch_btree_insert_node(b, op, &keys, + NULL, NULL); + BUG_ON(!bch_keylist_empty(&keys)); - if (!b->written) - btree_gc_coalesce(b, gc, r); + rw_unlock(true, last->b); + last->b = n; + + /* Invalidated our iterator */ + ret = -EINTR; + break; + } + } - if (r[GC_MERGE_NODES - 1].b) - write(r[GC_MERGE_NODES - 1].b); + if (last->b->level) { + ret = btree_gc_recurse(last->b, op, writes, gc); + if (ret) + break; + } - memmove(&r[1], &r[0], - sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); + bkey_copy_key(&b->c->gc_done, &last->b->key); + + /* + * Must flush leaf nodes before gc ends, since replace + * operations aren't journalled + */ + if (btree_node_dirty(last->b)) + bch_btree_node_write(last->b, writes); + rw_unlock(true, last->b); + } + + memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); + r->b = NULL; - /* When we've got incremental GC working, we'll want to do - * if (should_resched()) - * return -EAGAIN; - */ - cond_resched(); -#if 0 if (need_resched()) { ret = -EAGAIN; break; } -#endif } - for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) - write(r[i].b); + for (i = 0; i < GC_MERGE_NODES; i++) + if (!IS_ERR_OR_NULL(r[i].b)) { + if (btree_node_dirty(r[i].b)) + bch_btree_node_write(r[i].b, writes); + rw_unlock(true, r[i].b); + } - /* Might have freed some children, must remove their keys */ - if (!b->written) - bch_btree_sort(b); + bch_keylist_free(&keys); return ret; } @@ -1432,27 +1487,31 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, struct closure *writes, struct gc_stat *gc) { struct btree *n = NULL; - unsigned keys = 0; - int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); + int ret = 0; + bool should_rewrite; - if (b->level || stale > 10) + should_rewrite = btree_gc_mark_node(b, gc); + if (should_rewrite) { n = btree_node_alloc_replacement(b); - if (!IS_ERR_OR_NULL(n)) - swap(b, n); - - if (b->level) - ret = btree_gc_recurse(b, op, writes, gc); + if (!IS_ERR_OR_NULL(n)) { + bch_btree_node_write_sync(n); + bch_btree_set_root(n); + btree_node_free(b); + rw_unlock(true, n); - if (!b->written || btree_node_dirty(b)) - bch_btree_node_write_sync(b); + return -EINTR; + } + } - if (!IS_ERR_OR_NULL(n)) { - bch_btree_set_root(b); - 
btree_node_free(n); - rw_unlock(true, b); + if (b->level) { + ret = btree_gc_recurse(b, op, writes, gc); + if (ret) + return ret; } + bkey_copy_key(&b->c->gc_done, &b->key); + return ret; } @@ -1550,29 +1609,20 @@ static void bch_btree_gc(struct cache_set *c) btree_gc_start(c); - atomic_inc(&c->prio_blocked); - - ret = btree_root(gc_root, c, &op, &writes, &stats); - closure_sync(&writes); - - if (ret) { - pr_warn("gc failed!"); - return; - } + do { + ret = btree_root(gc_root, c, &op, &writes, &stats); + closure_sync(&writes); - /* Possibly wait for new UUIDs or whatever to hit disk */ - bch_journal_meta(c, &writes); - closure_sync(&writes); + if (ret && ret != -EAGAIN) + pr_warn("gc failed!"); + } while (ret); available = bch_btree_gc_finish(c); - - atomic_dec(&c->prio_blocked); wake_up_allocators(c); bch_time_stats_update(&c->btree_gc_time, start_time); stats.key_bytes *= sizeof(uint64_t); - stats.dirty <<= 9; stats.data <<= 9; stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); @@ -1585,14 +1635,28 @@ static void bch_btree_gc(struct cache_set *c) static int bch_gc_thread(void *arg) { struct cache_set *c = arg; + struct cache *ca; + unsigned i; while (1) { +again: bch_btree_gc(c); set_current_state(TASK_INTERRUPTIBLE); if (kthread_should_stop()) break; + mutex_lock(&c->bucket_lock); + + for_each_cache(ca, c, i) + if (ca->invalidate_needs_gc) { + mutex_unlock(&c->bucket_lock); + set_current_state(TASK_RUNNING); + goto again; + } + + mutex_unlock(&c->bucket_lock); + try_to_freeze(); schedule(); } @@ -2083,8 +2147,6 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, bch_keylist_init(&split_keys); - BUG_ON(b->level); - do { BUG_ON(b->level && replace_key); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index e11bb8571d24..b5a46affe8eb 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -201,7 +201,7 @@ static inline bool bkey_written(struct btree *b, struct bkey *k) static inline void set_gc_sectors(struct cache_set *c) { - atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8); + atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); } static inline struct bkey *bch_btree_iter_init(struct btree *b, diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 9687771ec6f3..c5f73e34d016 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -489,7 +489,6 @@ lock_root: sysfs_print(btree_used_percent, btree_used(c)); sysfs_print(btree_nodes, c->gc_stats.nodes); - sysfs_hprint(dirty_data, c->gc_stats.dirty); sysfs_hprint(average_key_size, average_key_size(c)); sysfs_print(cache_read_races, @@ -642,7 +641,6 @@ static struct attribute *bch_cache_set_files[] = { &sysfs_cache_available_percent, &sysfs_average_key_size, - &sysfs_dirty_data, &sysfs_errors, &sysfs_io_error_limit, -- cgit v1.2.3 From bc9389eefe479b7b7b323c2729b61a7155d2d0ea Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 19:07:35 -0700 Subject: bcache: Avoid deadlocking in garbage collection Not a complete fix - we could still deadlock if btree_insert_node() has to split... 
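
The idea behind the wait flag added below, as a standalone sketch (hypothetical names, not the bcache API): the bucket allocator can only make progress once GC frees buckets, so an allocation made from the GC path itself must fail with -EAGAIN instead of sleeping.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct node { int level; };

static bool buckets_available;  /* flips once "GC" has freed something */

static struct node *try_alloc(void)
{
        static struct node n;
        return buckets_available ? &n : NULL;
}

/*
 * wait == true : foreground path, may sleep until the allocator can
 *                make progress.
 * wait == false: the GC path itself -- sleeping here would be waiting
 *                on ourselves, so fail with -EAGAIN instead.
 */
static struct node *node_alloc(bool wait, int *err)
{
        struct node *n;

        while (!(n = try_alloc())) {
                if (!wait) {
                        *err = -EAGAIN;
                        return NULL;
                }
                buckets_available = true;       /* stand-in for sleeping */
        }

        *err = 0;
        return n;
}

int main(void)
{
        int err;

        if (!node_alloc(false, &err))
                printf("gc path: alloc failed (%d), coalesce skipped\n", err);
        if (node_alloc(true, &err))
                printf("foreground path: alloc succeeded after waiting\n");
        return 0;
}

In the patch proper this is the new bool argument to bch_btree_node_alloc(): false on the gc_coalesce and gc_root rewrite paths, true for btree_split() and the initial root allocation.
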
Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 21 +++++++++++---------- drivers/md/bcache/btree.h | 2 +- drivers/md/bcache/super.c | 2 +- 3 files changed, 13 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 7d283d217438..bf8f5174b253 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1059,14 +1059,14 @@ static void btree_node_free(struct btree *b) mutex_unlock(&b->c->bucket_lock); } -struct btree *bch_btree_node_alloc(struct cache_set *c, int level) +struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait) { BKEY_PADDED(key) k; struct btree *b = ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); retry: - if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) + if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait)) goto err; bkey_put(c, &k.key); @@ -1098,9 +1098,9 @@ err: return b; } -static struct btree *btree_node_alloc_replacement(struct btree *b) +static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait) { - struct btree *n = bch_btree_node_alloc(b->c, b->level); + struct btree *n = bch_btree_node_alloc(b->c, b->level, wait); if (!IS_ERR_OR_NULL(n)) bch_btree_sort_into(b, n); @@ -1250,7 +1250,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, return 0; for (i = 0; i < nodes; i++) { - new_nodes[i] = btree_node_alloc_replacement(r[i].b); + new_nodes[i] = btree_node_alloc_replacement(r[i].b, false); if (IS_ERR_OR_NULL(new_nodes[i])) goto out_nocoalesce; } @@ -1420,7 +1420,8 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, if (!IS_ERR(last->b)) { should_rewrite = btree_gc_mark_node(last->b, gc); if (should_rewrite) { - n = btree_node_alloc_replacement(last->b); + n = btree_node_alloc_replacement(last->b, + false); if (!IS_ERR_OR_NULL(n)) { bch_btree_node_write_sync(n); @@ -1492,7 +1493,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, should_rewrite = btree_gc_mark_node(b, gc); if (should_rewrite) { - n = btree_node_alloc_replacement(b); + n = btree_node_alloc_replacement(b, false); if (!IS_ERR_OR_NULL(n)) { bch_btree_node_write_sync(n); @@ -2038,7 +2039,7 @@ static int btree_split(struct btree *b, struct btree_op *op, closure_init_stack(&cl); - n1 = btree_node_alloc_replacement(b); + n1 = btree_node_alloc_replacement(b, true); if (IS_ERR(n1)) goto err; @@ -2049,12 +2050,12 @@ static int btree_split(struct btree *b, struct btree_op *op, trace_bcache_btree_node_split(b, n1->sets[0].data->keys); - n2 = bch_btree_node_alloc(b->c, b->level); + n2 = bch_btree_node_alloc(b->c, b->level, true); if (IS_ERR(n2)) goto err_free1; if (!b->parent) { - n3 = bch_btree_node_alloc(b->c, b->level + 1); + n3 = bch_btree_node_alloc(b->c, b->level + 1, true); if (IS_ERR(n3)) goto err_free2; } diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index b5a46affe8eb..767e75570896 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -271,7 +271,7 @@ void bch_btree_node_read(struct btree *); void bch_btree_node_write(struct btree *, struct closure *); void bch_btree_set_root(struct btree *); -struct btree *bch_btree_node_alloc(struct cache_set *, int); +struct btree *bch_btree_node_alloc(struct cache_set *, int, bool); struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); int bch_btree_insert_check_key(struct btree *, struct btree_op *, diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 
7ab926d94d81..e21200e98da6 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1601,7 +1601,7 @@ static void run_cache_set(struct cache_set *c) goto err; err = "cannot allocate new btree root"; - c->root = bch_btree_node_alloc(c, 0); + c->root = bch_btree_node_alloc(c, 0, true); if (IS_ERR_OR_NULL(c->root)) goto err; -- cgit v1.2.3 From 50310164bcd789eb3690f45a9baf8a507bf93358 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 17:18:59 -0700 Subject: bcache: Kill bch_next_recurse_key() This dates from before the btree iterator, and now it's finally gone Signed-off-by: Kent Overstreet --- drivers/md/bcache/bset.c | 8 -------- drivers/md/bcache/bset.h | 1 - drivers/md/bcache/btree.c | 23 +++++++++++------------ 3 files changed, 11 insertions(+), 21 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index b615348c45fc..b0fe0dc59ee6 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -952,14 +952,6 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, return ret; } -struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search) -{ - struct btree_iter iter; - - bch_btree_iter_init(b, &iter, search); - return bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); -} - /* Mergesort */ static void sort_key_next(struct btree_iter *iter, diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index bc9975526aa1..1d3c24f9fa0e 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -303,7 +303,6 @@ static inline bool ptr_available(struct cache_set *c, const struct bkey *k, typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *); -struct bkey *bch_next_recurse_key(struct btree *, struct bkey *); struct bkey *bch_btree_iter_next(struct btree_iter *); struct bkey *bch_btree_iter_next_filter(struct btree_iter *, struct btree *, ptr_filter_fn); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index bf8f5174b253..adc5bb0d8e92 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1680,9 +1680,9 @@ int bch_gc_thread_start(struct cache_set *c) static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, unsigned long **seen) { - int ret; + int ret = 0; unsigned i; - struct bkey *k; + struct bkey *k, *p = NULL; struct bucket *g; struct btree_iter iter; @@ -1709,19 +1709,18 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, } if (b->level) { - k = bch_next_recurse_key(b, &ZERO_KEY); + bch_btree_iter_init(b, &iter, NULL); - while (k) { - struct bkey *p = bch_next_recurse_key(b, k); - if (p) - btree_node_prefetch(b->c, p, b->level - 1); + do { + k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); + if (k) + btree_node_prefetch(b->c, k, b->level - 1); - ret = btree(check_recurse, k, b, op, seen); - if (ret) - return ret; + if (p) + ret = btree(check_recurse, p, b, op, seen); - k = p; - } + p = k; + } while (p && !ret); } return 0; -- cgit v1.2.3 From 8aee122071a69ca6fa3314da7713bdf0b61dc07c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Jul 2013 22:34:40 -0700 Subject: bcache: Kill sequential_merge option It never really made sense to expose this, so just kill it. 
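
For context, the detection this option used to gate works like this: recent I/O end positions live in a small hash (iohash() over bi_sector), and a bio that starts where an earlier one ended inherits and extends that stream's running byte count, which then feeds the sequential_cutoff check. A self-contained sketch of the mechanism, with hypothetical names and a fixed-size table standing in for the hlist/LRU machinery:

#include <stdint.h>
#include <stdio.h>

#define RECENT_IO 8

struct io {
        uint64_t last;          /* sector where the previous bio ended */
        unsigned sequential;    /* running size of the stream, in bytes */
};

static struct io recent[RECENT_IO];

static unsigned detect_sequential(uint64_t sector, unsigned bytes,
                                  unsigned sectors)
{
        struct io *i = &recent[sector % RECENT_IO];

        if (i->last != sector)
                i->sequential = 0;      /* start of a new stream */

        i->sequential += bytes;
        i->last = sector + sectors;

        /* re-hash under the new end so the next bio in the stream finds it */
        recent[i->last % RECENT_IO] = *i;

        return i->sequential;
}

int main(void)
{
        /* two back-to-back 4KiB bios: the second sees 8KiB of sequential IO */
        detect_sequential(0, 4096, 8);
        printf("sequential bytes: %u\n", detect_sequential(8, 4096, 8));
        return 0;
}

The real code additionally keeps the entries on an LRU and expires them (the msecs_to_jiffies(5000) in the hunk below), which the sketch omits.
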
Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 1 - drivers/md/bcache/request.c | 43 ++++++++++++++++++------------------------- drivers/md/bcache/super.c | 1 - drivers/md/bcache/sysfs.c | 4 ---- 4 files changed, 18 insertions(+), 31 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d6970a651e42..322735547eab 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -364,7 +364,6 @@ struct cached_dev { unsigned sequential_cutoff; unsigned readahead; - unsigned sequential_merge:1; unsigned verify:1; unsigned partial_stripes_expensive:1; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 932300f18973..f645da61189a 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -510,6 +510,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) unsigned mode = cache_mode(dc, bio); unsigned sectors, congested = bch_get_congested(c); struct task_struct *task = current; + struct io *i; if (atomic_read(&dc->disk.detaching) || c->gc_stats.in_use > CUTOFF_CACHE_ADD || @@ -536,38 +537,30 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) (bio->bi_rw & REQ_SYNC)) goto rescale; - if (dc->sequential_merge) { - struct io *i; + spin_lock(&dc->io_lock); - spin_lock(&dc->io_lock); + hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) + if (i->last == bio->bi_sector && + time_before(jiffies, i->jiffies)) + goto found; - hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) - if (i->last == bio->bi_sector && - time_before(jiffies, i->jiffies)) - goto found; + i = list_first_entry(&dc->io_lru, struct io, lru); - i = list_first_entry(&dc->io_lru, struct io, lru); - - add_sequential(task); - i->sequential = 0; + add_sequential(task); + i->sequential = 0; found: - if (i->sequential + bio->bi_size > i->sequential) - i->sequential += bio->bi_size; - - i->last = bio_end_sector(bio); - i->jiffies = jiffies + msecs_to_jiffies(5000); - task->sequential_io = i->sequential; + if (i->sequential + bio->bi_size > i->sequential) + i->sequential += bio->bi_size; - hlist_del(&i->hash); - hlist_add_head(&i->hash, iohash(dc, i->last)); - list_move_tail(&i->lru, &dc->io_lru); + i->last = bio_end_sector(bio); + i->jiffies = jiffies + msecs_to_jiffies(5000); + task->sequential_io = i->sequential; - spin_unlock(&dc->io_lock); - } else { - task->sequential_io = bio->bi_size; + hlist_del(&i->hash); + hlist_add_head(&i->hash, iohash(dc, i->last)); + list_move_tail(&i->lru, &dc->io_lru); - add_sequential(task); - } + spin_unlock(&dc->io_lock); sectors = max(task->sequential_io, task->sequential_io_avg) >> 9; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e21200e98da6..041dd9d1d882 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1079,7 +1079,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) spin_lock_init(&dc->io_lock); bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); - dc->sequential_merge = true; dc->sequential_cutoff = 4 << 20; for (io = dc->io; io < dc->io + RECENT_IO; io++) { diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index c5f73e34d016..4b672449ffaf 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -72,7 +72,6 @@ rw_attribute(congested_read_threshold_us); rw_attribute(congested_write_threshold_us); rw_attribute(sequential_cutoff); -rw_attribute(sequential_merge); rw_attribute(data_csum); 
rw_attribute(cache_mode); rw_attribute(writeback_metadata); @@ -161,7 +160,6 @@ SHOW(__bch_cached_dev) sysfs_hprint(stripe_size, dc->disk.stripe_size << 9); var_printf(partial_stripes_expensive, "%u"); - var_printf(sequential_merge, "%i"); var_hprint(sequential_cutoff); var_hprint(readahead); @@ -207,7 +205,6 @@ STORE(__cached_dev) dc->writeback_rate_p_term_inverse, 1, INT_MAX); d_strtoul(writeback_rate_d_smooth); - d_strtoul(sequential_merge); d_strtoi_h(sequential_cutoff); d_strtoi_h(readahead); @@ -319,7 +316,6 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_stripe_size, &sysfs_partial_stripes_expensive, &sysfs_sequential_cutoff, - &sysfs_sequential_merge, &sysfs_clear_stats, &sysfs_running, &sysfs_state, -- cgit v1.2.3 From 65d22e911bfc4f46cda4751f1b1926b43c316c14 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jul 2013 00:03:54 -0700 Subject: bcache: Move spinlock into struct time_stats Minor cleanup. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 2 -- drivers/md/bcache/bset.c | 7 +------ drivers/md/bcache/btree.c | 3 --- drivers/md/bcache/super.c | 8 ++++++-- drivers/md/bcache/util.c | 12 +++++++++--- drivers/md/bcache/util.h | 1 + 6 files changed, 17 insertions(+), 16 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 322735547eab..816d07958fac 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -666,11 +666,9 @@ struct cache_set { unsigned congested_read_threshold_us; unsigned congested_write_threshold_us; - spinlock_t sort_time_lock; struct time_stats sort_time; struct time_stats btree_gc_time; struct time_stats btree_split_time; - spinlock_t btree_read_time_lock; struct time_stats btree_read_time; struct time_stats try_harder_time; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index b0fe0dc59ee6..14573391206b 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1077,11 +1077,8 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, if (b->written) bset_build_written_tree(b); - if (!start) { - spin_lock(&b->c->sort_time_lock); + if (!start) bch_time_stats_update(&b->c->sort_time, start_time); - spin_unlock(&b->c->sort_time_lock); - } } void bch_btree_sort_partial(struct btree *b, unsigned start) @@ -1128,9 +1125,7 @@ void bch_btree_sort_into(struct btree *b, struct btree *new) btree_mergesort(b, new->sets->data, &iter, false, true); - spin_lock(&b->c->sort_time_lock); bch_time_stats_update(&b->c->sort_time, start_time); - spin_unlock(&b->c->sort_time_lock); bkey_copy_key(&new->key, &b->key); new->sets->size = 0; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index adc5bb0d8e92..1a7530cd1407 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -317,10 +317,7 @@ void bch_btree_node_read(struct btree *b) goto err; bch_btree_node_read_done(b); - - spin_lock(&b->c->btree_read_time_lock); bch_time_stats_update(&b->c->btree_read_time, start_time); - spin_unlock(&b->c->btree_read_time_lock); return; err: diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 041dd9d1d882..4813ef67cef5 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1435,9 +1435,13 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) init_waitqueue_head(&c->try_wait); init_waitqueue_head(&c->bucket_wait); closure_init_unlocked(&c->uuid_write); - spin_lock_init(&c->sort_time_lock); mutex_init(&c->sort_lock); - spin_lock_init(&c->btree_read_time_lock); + 
+ spin_lock_init(&c->sort_time.lock); + spin_lock_init(&c->btree_gc_time.lock); + spin_lock_init(&c->btree_split_time.lock); + spin_lock_init(&c->btree_read_time.lock); + spin_lock_init(&c->try_harder_time.lock); bch_moving_init_cache_set(c); diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 420dad545c7d..462214eeacbe 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -168,10 +168,14 @@ int bch_parse_uuid(const char *s, char *uuid) void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) { - uint64_t now = local_clock(); - uint64_t duration = time_after64(now, start_time) + uint64_t now, duration, last; + + spin_lock(&stats->lock); + + now = local_clock(); + duration = time_after64(now, start_time) ? now - start_time : 0; - uint64_t last = time_after64(now, stats->last) + last = time_after64(now, stats->last) ? now - stats->last : 0; stats->max_duration = max(stats->max_duration, duration); @@ -188,6 +192,8 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) } stats->last = now ?: 1; + + spin_unlock(&stats->lock); } /** diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 8ce5aab55962..362c4b3f8b4a 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -378,6 +378,7 @@ ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[ ssize_t bch_read_string_list(const char *buf, const char * const list[]); struct time_stats { + spinlock_t lock; /* * all fields are in nanoseconds, averages are ewmas stored left shifted * by 8 -- cgit v1.2.3 From 17e21a9f248d3d330acdfb2405c23b8d84c9c23a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Jul 2013 12:32:38 -0700 Subject: bcache: Have btree_split() insert into parent directly The flow control in btree_insert_node() was... fragile... before, this'll use more stack (but since our btrees are never more than depth 1, that shouldn't matter) and it should be significantly clearer and less fragile. 
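
The shape of the new flow, as a toy model (standalone C with hypothetical names; the real code moves keys through a keylist and writes the nodes out, which this omits): a node that would overflow splits and then inserts the resulting key(s) into its parent by direct recursion, so the recursion depth is bounded by the tree depth rather than by flow-control state threaded through the caller.

#include <stdio.h>

/* Toy node that "splits" when it would exceed MAX_KEYS. Hypothetical,
 * nothing like the real data structures. */
#define MAX_KEYS 4

struct node {
        struct node *parent;
        int nkeys;
        int level;
};

static int insert_node(struct node *b, int nr_new);

/* Split: shed half the keys to an (imaginary) sibling, then insert the
 * sibling's key into the parent by direct recursion -- the pattern this
 * patch adopts in place of the old loop around a shared keylist. */
static int split(struct node *b, int nr_new)
{
        b->nkeys = (b->nkeys + nr_new) / 2;

        if (!b->parent) {
                printf("root split: depth grows above level %d\n", b->level);
                return 0;
        }

        return insert_node(b->parent, 1);       /* one key per new sibling */
}

static int insert_node(struct node *b, int nr_new)
{
        if (b->nkeys + nr_new > MAX_KEYS)
                return split(b, nr_new);

        b->nkeys += nr_new;
        return 0;
}

int main(void)
{
        struct node root = { .parent = NULL, .nkeys = 2, .level = 1 };
        struct node leaf = { .parent = &root, .nkeys = 4, .level = 0 };

        insert_node(&leaf, 1);  /* leaf splits; the sibling's key lands in root */
        printf("root now holds %d keys\n", root.nkeys);
        return 0;
}
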
Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 85 ++++++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 46 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 1a7530cd1407..6def7c9a1228 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2025,15 +2025,16 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, static int btree_split(struct btree *b, struct btree_op *op, struct keylist *insert_keys, - struct keylist *parent_keys, struct bkey *replace_key) { bool split; struct btree *n1, *n2 = NULL, *n3 = NULL; uint64_t start_time = local_clock(); struct closure cl; + struct keylist parent_keys; closure_init_stack(&cl); + bch_keylist_init(&parent_keys); n1 = btree_node_alloc_replacement(b, true); if (IS_ERR(n1)) @@ -2078,7 +2079,7 @@ static int btree_split(struct btree *b, struct btree_op *op, bkey_copy_key(&n2->key, &b->key); - bch_keylist_add(parent_keys, &n2->key); + bch_keylist_add(&parent_keys, &n2->key); bch_btree_node_write(n2, &cl); rw_unlock(true, n2); } else { @@ -2087,33 +2088,39 @@ static int btree_split(struct btree *b, struct btree_op *op, bch_btree_insert_keys(n1, op, insert_keys, replace_key); } - bch_keylist_add(parent_keys, &n1->key); + bch_keylist_add(&parent_keys, &n1->key); bch_btree_node_write(n1, &cl); if (n3) { /* Depth increases, make a new root */ - bkey_copy_key(&n3->key, &MAX_KEY); - bch_btree_insert_keys(n3, op, parent_keys, NULL); + bch_btree_insert_keys(n3, op, &parent_keys, NULL); bch_btree_node_write(n3, &cl); closure_sync(&cl); bch_btree_set_root(n3); rw_unlock(true, n3); + + btree_node_free(b); } else if (!b->parent) { /* Root filled up but didn't need to be split */ - - bch_keylist_reset(parent_keys); closure_sync(&cl); bch_btree_set_root(n1); + + btree_node_free(b); } else { + /* Split a non root node */ closure_sync(&cl); - make_btree_freeing_key(b, parent_keys->top); - bch_keylist_push(parent_keys); + make_btree_freeing_key(b, parent_keys.top); + bch_keylist_push(&parent_keys); + + btree_node_free(b); + + bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL); + BUG_ON(!bch_keylist_empty(&parent_keys)); } rw_unlock(true, n1); - btree_node_free(b); bch_time_stats_update(&b->c->btree_split_time, start_time); @@ -2139,46 +2146,32 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, atomic_t *journal_ref, struct bkey *replace_key) { - int ret = 0; - struct keylist split_keys; - - bch_keylist_init(&split_keys); + BUG_ON(b->level && replace_key); - do { - BUG_ON(b->level && replace_key); - - if (should_split(b)) { - if (current->bio_list) { - op->lock = b->c->root->level + 1; - ret = -EAGAIN; - } else if (op->lock <= b->c->root->level) { - op->lock = b->c->root->level + 1; - ret = -EINTR; - } else { - struct btree *parent = b->parent; - - ret = btree_split(b, op, insert_keys, - &split_keys, replace_key); - insert_keys = &split_keys; - replace_key = NULL; - b = parent; - if (!ret) - ret = -EINTR; - } + if (should_split(b)) { + if (current->bio_list) { + op->lock = b->c->root->level + 1; + return -EAGAIN; + } else if (op->lock <= b->c->root->level) { + op->lock = b->c->root->level + 1; + return -EINTR; } else { - BUG_ON(write_block(b) != b->sets[b->nsets].data); - - if (bch_btree_insert_keys(b, op, insert_keys, - replace_key)) { - if (!b->level) - bch_btree_leaf_dirty(b, journal_ref); - else - bch_btree_node_write_sync(b); - } + /* Invalidated all iterators */ + return btree_split(b, op, 
insert_keys, replace_key) ?: + -EINTR; } - } while (!bch_keylist_empty(&split_keys)); + } else { + BUG_ON(write_block(b) != b->sets[b->nsets].data); - return ret; + if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { + if (!b->level) + bch_btree_leaf_dirty(b, journal_ref); + else + bch_btree_node_write_sync(b); + } + + return 0; + } } int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, -- cgit v1.2.3 From 48a915a87f0bd98c3d68d029acf223a2e5116f07 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 31 Oct 2013 15:43:22 -0700 Subject: bcache: Better full stripe scanning The old scanning-by-stripe code burned too much CPU, this should be better. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 5 ++- drivers/md/bcache/btree.c | 19 +++++---- drivers/md/bcache/super.c | 17 +++++++- drivers/md/bcache/writeback.c | 94 ++++++++++++++++++++++++++----------------- drivers/md/bcache/writeback.h | 21 ++++++---- include/trace/events/bcache.h | 29 +++++++++++++ 6 files changed, 128 insertions(+), 57 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 816d07958fac..ab0b2150fed6 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -237,7 +237,7 @@ struct keybuf { struct rb_root keys; -#define KEYBUF_NR 100 +#define KEYBUF_NR 500 DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR); }; @@ -273,9 +273,10 @@ struct bcache_device { atomic_t detaching; int flush_done; - uint64_t nr_stripes; + unsigned nr_stripes; unsigned stripe_size; atomic_t *stripe_sectors_dirty; + unsigned long *full_dirty_stripes; unsigned long sectors_dirty_last; long sectors_dirty_derivative; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 6def7c9a1228..5e2765aadce1 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2378,6 +2378,7 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, struct refill { struct btree_op op; + unsigned nr_found; struct keybuf *buf; struct bkey *end; keybuf_pred_fn *pred; @@ -2414,6 +2415,8 @@ static int refill_keybuf_fn(struct btree_op *op, struct btree *b, if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) array_free(&buf->freelist, w); + else + refill->nr_found++; if (array_freelist_empty(&buf->freelist)) ret = MAP_DONE; @@ -2434,18 +2437,18 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, cond_resched(); bch_btree_op_init(&refill.op, -1); - refill.buf = buf; - refill.end = end; - refill.pred = pred; + refill.nr_found = 0; + refill.buf = buf; + refill.end = end; + refill.pred = pred; bch_btree_map_keys(&refill.op, c, &buf->last_scanned, refill_keybuf_fn, MAP_END_KEY); - pr_debug("found %s keys from %llu:%llu to %llu:%llu", - RB_EMPTY_ROOT(&buf->keys) ? "no" : - array_freelist_empty(&buf->freelist) ? 
"some" : "a few", - KEY_INODE(&start), KEY_OFFSET(&start), - KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned)); + trace_bcache_keyscan(refill.nr_found, + KEY_INODE(&start), KEY_OFFSET(&start), + KEY_INODE(&buf->last_scanned), + KEY_OFFSET(&buf->last_scanned)); spin_lock(&buf->lock); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 4813ef67cef5..43fcfe38be11 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -738,6 +738,10 @@ static void bcache_device_free(struct bcache_device *d) mempool_destroy(d->unaligned_bvec); if (d->bio_split) bioset_free(d->bio_split); + if (is_vmalloc_addr(d->full_dirty_stripes)) + vfree(d->full_dirty_stripes); + else + kfree(d->full_dirty_stripes); if (is_vmalloc_addr(d->stripe_sectors_dirty)) vfree(d->stripe_sectors_dirty); else @@ -757,8 +761,12 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size); - if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) + if (!d->nr_stripes || + d->nr_stripes > INT_MAX || + d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) { + pr_err("nr_stripes too large"); return -ENOMEM; + } n = d->nr_stripes * sizeof(atomic_t); d->stripe_sectors_dirty = n < PAGE_SIZE << 6 @@ -767,6 +775,13 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, if (!d->stripe_sectors_dirty) return -ENOMEM; + n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long); + d->full_dirty_stripes = n < PAGE_SIZE << 6 + ? kzalloc(n, GFP_KERNEL) + : vzalloc(n); + if (!d->full_dirty_stripes) + return -ENOMEM; + if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, sizeof(struct bio_vec) * BIO_MAX_PAGES)) || diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index ab0f6b449111..22e21dc9a037 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -292,14 +292,12 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, uint64_t offset, int nr_sectors) { struct bcache_device *d = c->devices[inode]; - unsigned stripe_offset; - uint64_t stripe = offset; + unsigned stripe_offset, stripe, sectors_dirty; if (!d) return; - do_div(stripe, d->stripe_size); - + stripe = offset_to_stripe(d, offset); stripe_offset = offset & (d->stripe_size - 1); while (nr_sectors) { @@ -309,7 +307,16 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, if (nr_sectors < 0) s = -s; - atomic_add(s, d->stripe_sectors_dirty + stripe); + if (stripe >= d->nr_stripes) + return; + + sectors_dirty = atomic_add_return(s, + d->stripe_sectors_dirty + stripe); + if (sectors_dirty == d->stripe_size) + set_bit(stripe, d->full_dirty_stripes); + else + clear_bit(stripe, d->full_dirty_stripes); + nr_sectors -= s; stripe_offset = 0; stripe++; @@ -321,59 +328,70 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k) return KEY_DIRTY(k); } -static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) +static void refill_full_stripes(struct cached_dev *dc) { - uint64_t stripe = KEY_START(k); - unsigned nr_sectors = KEY_SIZE(k); - struct cached_dev *dc = container_of(buf, struct cached_dev, - writeback_keys); + struct keybuf *buf = &dc->writeback_keys; + unsigned start_stripe, stripe, next_stripe; + bool wrapped = false; + + stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned)); - if (!KEY_DIRTY(k)) - return false; + if (stripe >= dc->disk.nr_stripes) + stripe = 0; 
- do_div(stripe, dc->disk.stripe_size); + start_stripe = stripe; while (1) { - if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) == - dc->disk.stripe_size) - return true; + stripe = find_next_bit(dc->disk.full_dirty_stripes, + dc->disk.nr_stripes, stripe); - if (nr_sectors <= dc->disk.stripe_size) - return false; + if (stripe == dc->disk.nr_stripes) + goto next; - nr_sectors -= dc->disk.stripe_size; - stripe++; + next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes, + dc->disk.nr_stripes, stripe); + + buf->last_scanned = KEY(dc->disk.id, + stripe * dc->disk.stripe_size, 0); + + bch_refill_keybuf(dc->disk.c, buf, + &KEY(dc->disk.id, + next_stripe * dc->disk.stripe_size, 0), + dirty_pred); + + if (array_freelist_empty(&buf->freelist)) + return; + + stripe = next_stripe; +next: + if (wrapped && stripe > start_stripe) + return; + + if (stripe == dc->disk.nr_stripes) { + stripe = 0; + wrapped = true; + } } } static bool refill_dirty(struct cached_dev *dc) { struct keybuf *buf = &dc->writeback_keys; - bool searched_from_start = false; struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); + bool searched_from_start = false; + + if (dc->partial_stripes_expensive) { + refill_full_stripes(dc); + if (array_freelist_empty(&buf->freelist)) + return false; + } if (bkey_cmp(&buf->last_scanned, &end) >= 0) { buf->last_scanned = KEY(dc->disk.id, 0, 0); searched_from_start = true; } - if (dc->partial_stripes_expensive) { - uint64_t i; - - for (i = 0; i < dc->disk.nr_stripes; i++) - if (atomic_read(dc->disk.stripe_sectors_dirty + i) == - dc->disk.stripe_size) - goto full_stripes; - - goto normal_refill; -full_stripes: - searched_from_start = false; /* not searching entire btree */ - bch_refill_keybuf(dc->disk.c, buf, &end, - dirty_full_stripe_pred); - } else { -normal_refill: - bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); - } + bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start; } diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 60516bfa6052..fe7d9d56492b 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -14,22 +14,27 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } -static inline bool bcache_dev_stripe_dirty(struct bcache_device *d, +static inline unsigned offset_to_stripe(struct bcache_device *d, + uint64_t offset) +{ + do_div(offset, d->stripe_size); + return offset; +} + +static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, uint64_t offset, unsigned nr_sectors) { - uint64_t stripe = offset; - - do_div(stripe, d->stripe_size); + unsigned stripe = offset_to_stripe(&dc->disk, offset); while (1) { - if (atomic_read(d->stripe_sectors_dirty + stripe)) + if (atomic_read(dc->disk.stripe_sectors_dirty + stripe)) return true; - if (nr_sectors <= d->stripe_size) + if (nr_sectors <= dc->disk.stripe_size) return false; - nr_sectors -= d->stripe_size; + nr_sectors -= dc->disk.stripe_size; stripe++; } } @@ -45,7 +50,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, return false; if (dc->partial_stripes_expensive && - bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector, + bcache_dev_stripe_dirty(dc, bio->bi_sector, bio_sectors(bio))) return true; diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 32c89b33c391..e2b9576d00e2 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -368,6 +368,35 @@ DEFINE_EVENT(btree_node, 
bcache_btree_set_root, TP_ARGS(b) ); +TRACE_EVENT(bcache_keyscan, + TP_PROTO(unsigned nr_found, + unsigned start_inode, uint64_t start_offset, + unsigned end_inode, uint64_t end_offset), + TP_ARGS(nr_found, + start_inode, start_offset, + end_inode, end_offset), + + TP_STRUCT__entry( + __field(__u32, nr_found ) + __field(__u32, start_inode ) + __field(__u64, start_offset ) + __field(__u32, end_inode ) + __field(__u64, end_offset ) + ), + + TP_fast_assign( + __entry->nr_found = nr_found; + __entry->start_inode = start_inode; + __entry->start_offset = start_offset; + __entry->end_inode = end_inode; + __entry->end_offset = end_offset; + ), + + TP_printk("found %u keys from %u:%llu to %u:%llu", __entry->nr_found, + __entry->start_inode, __entry->start_offset, + __entry->end_inode, __entry->end_offset) +); + /* Allocator */ TRACE_EVENT(bcache_alloc_invalidate, -- cgit v1.2.3 From c4d951ddb66fe1d087447b0ba65c4fa4446f1083 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Aug 2013 17:49:09 -0700 Subject: bcache: Fix sysfs splat on shutdown with flash only devs Whoops. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 10 ++++------ drivers/md/bcache/request.c | 2 +- drivers/md/bcache/super.c | 41 ++++++++++++++++++++--------------------- drivers/md/bcache/sysfs.c | 2 +- drivers/md/bcache/writeback.c | 6 +++--- drivers/md/bcache/writeback.h | 2 +- 6 files changed, 30 insertions(+), 33 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index ab0b2150fed6..97ef126b68bb 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -266,12 +266,10 @@ struct bcache_device { struct gendisk *disk; - /* If nonzero, we're closing */ - atomic_t closing; - - /* If nonzero, we're detaching/unregistering from cache set */ - atomic_t detaching; - int flush_done; + unsigned long flags; +#define BCACHE_DEV_CLOSING 0 +#define BCACHE_DEV_DETACHING 1 +#define BCACHE_DEV_UNLINK_DONE 2 unsigned nr_stripes; unsigned stripe_size; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index f645da61189a..9f5a1386f77a 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -512,7 +512,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) struct task_struct *task = current; struct io *i; - if (atomic_read(&dc->disk.detaching) || + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || c->gc_stats.in_use > CUTOFF_CACHE_ADD || (bio->bi_rw & REQ_DISCARD)) goto skip; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 43fcfe38be11..fa1d53087f88 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -621,7 +621,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) static int open_dev(struct block_device *b, fmode_t mode) { struct bcache_device *d = b->bd_disk->private_data; - if (atomic_read(&d->closing)) + if (test_bit(BCACHE_DEV_CLOSING, &d->flags)) return -ENXIO; closure_get(&d->cl); @@ -650,20 +650,24 @@ static const struct block_device_operations bcache_ops = { void bcache_device_stop(struct bcache_device *d) { - if (!atomic_xchg(&d->closing, 1)) + if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags)) closure_queue(&d->cl); } static void bcache_device_unlink(struct bcache_device *d) { - unsigned i; - struct cache *ca; + lockdep_assert_held(&bch_register_lock); - sysfs_remove_link(&d->c->kobj, d->name); - sysfs_remove_link(&d->kobj, "cache"); + if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) { + unsigned i; + 
struct cache *ca; - for_each_cache(ca, d->c, i) - bd_unlink_disk_holder(ca->bdev, d->disk); + sysfs_remove_link(&d->c->kobj, d->name); + sysfs_remove_link(&d->kobj, "cache"); + + for_each_cache(ca, d->c, i) + bd_unlink_disk_holder(ca->bdev, d->disk); + } } static void bcache_device_link(struct bcache_device *d, struct cache_set *c, @@ -687,19 +691,16 @@ static void bcache_device_detach(struct bcache_device *d) { lockdep_assert_held(&bch_register_lock); - if (atomic_read(&d->detaching)) { + if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) { struct uuid_entry *u = d->c->uuids + d->id; SET_UUID_FLASH_ONLY(u, 0); memcpy(u->uuid, invalid_uuid, 16); u->invalidated = cpu_to_le32(get_seconds()); bch_uuid_write(d->c); - - atomic_set(&d->detaching, 0); } - if (!d->flush_done) - bcache_device_unlink(d); + bcache_device_unlink(d); d->c->devices[d->id] = NULL; closure_put(&d->c->caching); @@ -879,7 +880,7 @@ static void cached_dev_detach_finish(struct work_struct *w) struct closure cl; closure_init_stack(&cl); - BUG_ON(!atomic_read(&dc->disk.detaching)); + BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); BUG_ON(atomic_read(&dc->count)); mutex_lock(&bch_register_lock); @@ -893,6 +894,8 @@ static void cached_dev_detach_finish(struct work_struct *w) bcache_device_detach(&dc->disk); list_move(&dc->list, &uncached_devices); + clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags); + mutex_unlock(&bch_register_lock); pr_info("Caching disabled for %s", bdevname(dc->bdev, buf)); @@ -905,10 +908,10 @@ void bch_cached_dev_detach(struct cached_dev *dc) { lockdep_assert_held(&bch_register_lock); - if (atomic_read(&dc->disk.closing)) + if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) return; - if (atomic_xchg(&dc->disk.detaching, 1)) + if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) return; /* @@ -1064,11 +1067,7 @@ static void cached_dev_flush(struct closure *cl) struct bcache_device *d = &dc->disk; mutex_lock(&bch_register_lock); - d->flush_done = 1; - - if (d->c) - bcache_device_unlink(d); - + bcache_device_unlink(d); mutex_unlock(&bch_register_lock); bch_cache_accounting_destroy(&dc->accounting); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 4b672449ffaf..194d43782ea4 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -370,7 +370,7 @@ STORE(__bch_flash_dev) } if (attr == &sysfs_unregister) { - atomic_set(&d->detaching, 1); + set_bit(BCACHE_DEV_DETACHING, &d->flags); bcache_device_stop(d); } diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 22e21dc9a037..99053b1251be 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -89,7 +89,7 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) { uint64_t ret; - if (atomic_read(&dc->disk.detaching) || + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || !dc->writeback_percent) return 0; @@ -404,7 +404,7 @@ static int bch_writeback_thread(void *arg) while (!kthread_should_stop()) { down_write(&dc->writeback_lock); if (!atomic_read(&dc->has_dirty) || - (!atomic_read(&dc->disk.detaching) && + (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && !dc->writeback_running)) { up_write(&dc->writeback_lock); set_current_state(TASK_INTERRUPTIBLE); @@ -437,7 +437,7 @@ static int bch_writeback_thread(void *arg) while (delay && !kthread_should_stop() && - !atomic_read(&dc->disk.detaching)) + !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) delay = schedule_timeout_interruptible(delay); } } diff --git a/drivers/md/bcache/writeback.h 
b/drivers/md/bcache/writeback.h index fe7d9d56492b..c9ddcf4614b9 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -45,7 +45,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, unsigned in_use = dc->disk.c->gc_stats.in_use; if (cache_mode != CACHE_MODE_WRITEBACK || - atomic_read(&dc->disk.detaching) || + test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || in_use > CUTOFF_WRITEBACK_SYNC) return false; -- cgit v1.2.3 From 28935ab5163c49ca5c199d67335e5e3c72c50853 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jul 2013 01:12:02 -0700 Subject: bcache: Use ida for bcache block dev minor Signed-off-by: Kent Overstreet --- drivers/md/bcache/super.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index fa1d53087f88..70708ab0b8f9 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -50,7 +51,8 @@ struct mutex bch_register_lock; LIST_HEAD(bch_cache_sets); static LIST_HEAD(uncached_devices); -static int bcache_major, bcache_minor; +static int bcache_major; +static DEFINE_IDA(bcache_minor); static wait_queue_head_t unregister_wait; struct workqueue_struct *bcache_wq; @@ -731,8 +733,10 @@ static void bcache_device_free(struct bcache_device *d) del_gendisk(d->disk); if (d->disk && d->disk->queue) blk_cleanup_queue(d->disk->queue); - if (d->disk) + if (d->disk) { + ida_simple_remove(&bcache_minor, d->disk->first_minor); put_disk(d->disk); + } bio_split_pool_free(&d->bio_split_hook); if (d->unaligned_bvec) @@ -756,6 +760,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, { struct request_queue *q; size_t n; + int minor; if (!d->stripe_size) d->stripe_size = 1 << 31; @@ -783,22 +788,31 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, if (!d->full_dirty_stripes) return -ENOMEM; + minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL); + if (minor < 0) + return minor; + if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, sizeof(struct bio_vec) * BIO_MAX_PAGES)) || bio_split_pool_init(&d->bio_split_hook) || - !(d->disk = alloc_disk(1)) || - !(q = blk_alloc_queue(GFP_KERNEL))) + !(d->disk = alloc_disk(1))) { + ida_simple_remove(&bcache_minor, minor); return -ENOMEM; + } set_capacity(d->disk, sectors); - snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); + snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor); d->disk->major = bcache_major; - d->disk->first_minor = bcache_minor++; + d->disk->first_minor = minor; d->disk->fops = &bcache_ops; d->disk->private_data = d; + q = blk_alloc_queue(GFP_KERNEL); + if (!q) + return -ENOMEM; + blk_queue_make_request(q, NULL); d->disk->queue = q; q->queuedata = d; -- cgit v1.2.3 From 098fb25498214069e6bbf908515f2952dd7654d0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Aug 2013 18:36:45 -0700 Subject: bcache: Delete some slower inline asm Never saw a profile of bset_search_tree() where it wasn't bottlenecked on memory until I got my new Haswell machine, but when I tried it there it was suddenly burning 20% of the cpu in the inner loop on shrd... Turns out, the version of shrd that takes 64 bit operands has a 9 cycle latency. hah. 
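
The portable fallback that remains is worth a note: it computes the low 64 bits of the 128-bit value (high:low) shifted right, and writing the high half's contribution as (high << 1) << (63U - shift) keeps both shift counts in [0, 63], so shift == 0 stays well defined where a naive high << (64 - shift) would not be. A standalone check of that reading (needs GCC/Clang for unsigned __int128):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Low 64 bits of the 128-bit value (high:low) >> shift, using only
 * shift counts in [0, 63]. */
static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift)
{
        low >>= shift;
        low |= (high << 1) << (63U - shift);
        return low;
}

int main(void)
{
        uint64_t high = 0x0123456789abcdefULL;
        uint64_t low  = 0xfedcba9876543210ULL;
        unsigned __int128 v = ((unsigned __int128)high << 64) | low;
        unsigned s;

        for (s = 0; s < 64; s++)
                assert(shrd128(high, low, s) == (uint64_t)(v >> s));

        puts("shrd128 matches a real 128-bit shift for shifts 0..63");
        return 0;
}

With the asm gone the compiler is free to pick whatever shift sequence is fastest for the target, which is the point of the patch.
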
Signed-off-by: Kent Overstreet --- drivers/md/bcache/bset.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 14573391206b..7d388b8bb50e 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -481,16 +481,8 @@ static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline) static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift) { -#ifdef CONFIG_X86_64 - asm("shrd %[shift],%[high],%[low]" - : [low] "+Rm" (low) - : [high] "R" (high), - [shift] "ci" (shift) - : "cc"); -#else low >>= shift; low |= (high << 1) << (63U - shift); -#endif return low; } -- cgit v1.2.3 From 5ceaaad7047745c1c02150c39d3fb623b7948d48 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 14:27:42 -0700 Subject: bcache: Bypass torture test More testing ftw! Also, now verify mode doesn't break if you read dirty data. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/debug.c | 15 ++++++++------- drivers/md/bcache/debug.h | 2 ++ drivers/md/bcache/request.c | 14 +++++++++++++- drivers/md/bcache/sysfs.c | 4 ++++ 5 files changed, 28 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 97ef126b68bb..4beb55a0ff30 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -364,6 +364,7 @@ struct cached_dev { unsigned readahead; unsigned verify:1; + unsigned bypass_torture_test:1; unsigned partial_stripes_expensive:1; unsigned writeback_metadata:1; diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index e99e6b8852b2..264fcfbd6290 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -189,13 +189,14 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) void *p1 = kmap_atomic(bv->bv_page); void *p2 = page_address(check->bi_io_vec[i].bv_page); - if (memcmp(p1 + bv->bv_offset, - p2 + bv->bv_offset, - bv->bv_len)) - printk(KERN_ERR - "bcache (%s): verify failed at sector %llu\n", - bdevname(dc->bdev, name), - (uint64_t) bio->bi_sector); + cache_set_err_on(memcmp(p1 + bv->bv_offset, + p2 + bv->bv_offset, + bv->bv_len), + dc->disk.c, + "verify failed at dev %s sector %llu", + bdevname(dc->bdev, name), + (uint64_t) bio->bi_sector); + kunmap_atomic(p1); } diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index 7914ba0ff316..2ede60e31874 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -16,6 +16,7 @@ void bch_btree_iter_next_check(struct btree_iter *); #define EBUG_ON(cond) BUG_ON(cond) #define expensive_debug_checks(c) ((c)->expensive_debug_checks) #define key_merging_disabled(c) ((c)->key_merging_disabled) +#define bypass_torture_test(d) ((d)->bypass_torture_test) #else /* DEBUG */ @@ -28,6 +29,7 @@ static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} #define EBUG_ON(cond) do { if (cond); } while (0) #define expensive_debug_checks(c) 0 #define key_merging_disabled(c) 0 +#define bypass_torture_test(d) 0 #endif diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 9f5a1386f77a..fbcc851ed5a5 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -528,6 +528,13 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) goto skip; } + if (bypass_torture_test(dc)) { + if ((get_random_int() & 3) == 3) + goto skip; + else + goto rescale; + } + if (!congested && !dc->sequential_cutoff) goto rescale; @@ 
-601,6 +608,7 @@ struct search { unsigned recoverable:1; unsigned unaligned_bvec:1; unsigned write:1; + unsigned read_dirty_data:1; unsigned long start_time; @@ -669,6 +677,9 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; + if (KEY_DIRTY(k)) + s->read_dirty_data = true; + n = bch_bio_split(bio, min_t(uint64_t, INT_MAX, KEY_OFFSET(k) - bio->bi_sector), GFP_NOIO, s->d->bio_split); @@ -894,7 +905,8 @@ static void cached_dev_read_done(struct closure *cl) s->cache_miss = NULL; } - if (verify(dc, &s->bio.bio) && s->recoverable && !s->unaligned_bvec) + if (verify(dc, &s->bio.bio) && s->recoverable && + !s->unaligned_bvec && !s->read_dirty_data) bch_data_verify(dc, s->orig_bio); bio_complete(s); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 194d43782ea4..80d4c2bee18a 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -99,6 +99,7 @@ rw_attribute(errors); rw_attribute(io_error_limit); rw_attribute(io_error_halflife); rw_attribute(verify); +rw_attribute(bypass_torture_test); rw_attribute(key_merging_disabled); rw_attribute(gc_always_rewrite); rw_attribute(expensive_debug_checks); @@ -123,6 +124,7 @@ SHOW(__bch_cached_dev) sysfs_printf(data_csum, "%i", dc->disk.data_csum); var_printf(verify, "%i"); + var_printf(bypass_torture_test, "%i"); var_printf(writeback_metadata, "%i"); var_printf(writeback_running, "%i"); var_print(writeback_delay); @@ -191,6 +193,7 @@ STORE(__cached_dev) sysfs_strtoul(data_csum, dc->disk.data_csum); d_strtoul(verify); + d_strtoul(bypass_torture_test); d_strtoul(writeback_metadata); d_strtoul(writeback_running); d_strtoul(writeback_delay); @@ -323,6 +326,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_readahead, #ifdef CONFIG_BCACHE_DEBUG &sysfs_verify, + &sysfs_bypass_torture_test, #endif NULL }; -- cgit v1.2.3 From c86949486d41d9e7d7681fc72923555114fd702f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 Sep 2013 21:41:34 -0700 Subject: bcache: defensively handle format strings Just to be safe, call the error reporting function with "%s" to avoid any possible future format string leak. Signed-off-by: Kees Cook Signed-off-by: Kent Overstreet --- drivers/md/bcache/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 70708ab0b8f9..dec15cd2d797 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1670,7 +1670,7 @@ static void run_cache_set(struct cache_set *c) err: closure_sync(&cl); /* XXX: test this, it's broken */ - bch_cache_set_error(c, err); + bch_cache_set_error(c, "%s", err); } static bool can_attach_cache(struct cache *ca, struct cache_set *c) -- cgit v1.2.3 From 1cf7e9c68fe84248174e998922b39e508375e7c1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 1 Nov 2013 10:52:52 -0600 Subject: virtio_blk: blk-mq support Switch virtio-blk from the dual support for old-style requests and bios to use the block-multiqueue. 
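
The core of the conversion is the ->queue_rq() contract visible in the diff below: submit under the vq lock; if the virtqueue is full, stop the hardware queue, kick the device, and return BLK_MQ_RQ_QUEUE_BUSY so blk-mq holds on to the request; the completion path then restarts stopped queues via blk_mq_start_stopped_hw_queues(). A toy model of that backpressure loop, standalone C with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

#define RING_SIZE 2

enum { QUEUE_OK, QUEUE_BUSY };

static int in_flight;
static bool hw_queue_stopped;

/* ->queue_rq() shape: try to post to the bounded ring; when it is
 * full, stop the queue and report BUSY so the core retries later. */
static int queue_rq(int req)
{
        if (in_flight == RING_SIZE) {
                hw_queue_stopped = true;
                printf("req %d: ring full, hw queue stopped\n", req);
                return QUEUE_BUSY;
        }
        in_flight++;
        printf("req %d: queued, %d in flight\n", req, in_flight);
        return QUEUE_OK;
}

/* Completion side: free a slot, then restart the stopped queue --
 * the blk_mq_start_stopped_hw_queues() call in virtblk_done(). */
static void complete_one(void)
{
        in_flight--;
        if (hw_queue_stopped) {
                hw_queue_stopped = false;
                printf("completion: hw queue restarted\n");
        }
}

int main(void)
{
        queue_rq(1);
        queue_rq(2);
        if (queue_rq(3) == QUEUE_BUSY) {
                complete_one();
                queue_rq(3);    /* blk-mq requeues on our behalf */
        }
        return 0;
}
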
Acked-by: Asias He Signed-off-by: Jens Axboe Signed-off-by: Christoph Hellwig --- drivers/block/virtio_blk.c | 322 +++++++++------------------------------------ 1 file changed, 65 insertions(+), 257 deletions(-) (limited to 'drivers') diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 5cdf88b7ad9e..7455fe24bbbe 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -11,12 +11,11 @@ #include #include #include +#include +#include #define PART_BITS 4 -static bool use_bio; -module_param(use_bio, bool, S_IRUGO); - static int major; static DEFINE_IDA(vd_index_ida); @@ -26,13 +25,11 @@ struct virtio_blk { struct virtio_device *vdev; struct virtqueue *vq; - wait_queue_head_t queue_wait; + spinlock_t vq_lock; /* The disk structure for the kernel. */ struct gendisk *disk; - mempool_t *pool; - /* Process context for config space updates */ struct work_struct config_work; @@ -47,31 +44,17 @@ struct virtio_blk /* Ida index - used to track minor number allocations. */ int index; - - /* Scatterlist: can be too big for stack. */ - struct scatterlist sg[/*sg_elems*/]; }; struct virtblk_req { struct request *req; - struct bio *bio; struct virtio_blk_outhdr out_hdr; struct virtio_scsi_inhdr in_hdr; - struct work_struct work; - struct virtio_blk *vblk; - int flags; u8 status; struct scatterlist sg[]; }; -enum { - VBLK_IS_FLUSH = 1, - VBLK_REQ_FLUSH = 2, - VBLK_REQ_DATA = 4, - VBLK_REQ_FUA = 8, -}; - static inline int virtblk_result(struct virtblk_req *vbr) { switch (vbr->status) { @@ -84,22 +67,6 @@ static inline int virtblk_result(struct virtblk_req *vbr) } } -static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk, - gfp_t gfp_mask) -{ - struct virtblk_req *vbr; - - vbr = mempool_alloc(vblk->pool, gfp_mask); - if (!vbr) - return NULL; - - vbr->vblk = vblk; - if (use_bio) - sg_init_table(vbr->sg, vblk->sg_elems); - - return vbr; -} - static int __virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr, struct scatterlist *data_sg, @@ -143,83 +110,8 @@ static int __virtblk_add_req(struct virtqueue *vq, return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); } -static void virtblk_add_req(struct virtblk_req *vbr, bool have_data) -{ - struct virtio_blk *vblk = vbr->vblk; - DEFINE_WAIT(wait); - int ret; - - spin_lock_irq(vblk->disk->queue->queue_lock); - while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg, - have_data)) < 0)) { - prepare_to_wait_exclusive(&vblk->queue_wait, &wait, - TASK_UNINTERRUPTIBLE); - - spin_unlock_irq(vblk->disk->queue->queue_lock); - io_schedule(); - spin_lock_irq(vblk->disk->queue->queue_lock); - - finish_wait(&vblk->queue_wait, &wait); - } - - virtqueue_kick(vblk->vq); - spin_unlock_irq(vblk->disk->queue->queue_lock); -} - -static void virtblk_bio_send_flush(struct virtblk_req *vbr) -{ - vbr->flags |= VBLK_IS_FLUSH; - vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; - vbr->out_hdr.sector = 0; - vbr->out_hdr.ioprio = 0; - - virtblk_add_req(vbr, false); -} - -static void virtblk_bio_send_data(struct virtblk_req *vbr) -{ - struct virtio_blk *vblk = vbr->vblk; - struct bio *bio = vbr->bio; - bool have_data; - - vbr->flags &= ~VBLK_IS_FLUSH; - vbr->out_hdr.type = 0; - vbr->out_hdr.sector = bio->bi_sector; - vbr->out_hdr.ioprio = bio_prio(bio); - - if (blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg)) { - have_data = true; - if (bio->bi_rw & REQ_WRITE) - vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; - else - vbr->out_hdr.type |= VIRTIO_BLK_T_IN; - } else - have_data = false; - - virtblk_add_req(vbr, have_data); -} 
@@ -143,83 +110,8 @@ static int __virtblk_add_req(struct virtqueue *vq,
 	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
 }
 
-static void virtblk_add_req(struct virtblk_req *vbr, bool have_data)
-{
-	struct virtio_blk *vblk = vbr->vblk;
-	DEFINE_WAIT(wait);
-	int ret;
-
-	spin_lock_irq(vblk->disk->queue->queue_lock);
-	while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg,
-						 have_data)) < 0)) {
-		prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
-					  TASK_UNINTERRUPTIBLE);
-
-		spin_unlock_irq(vblk->disk->queue->queue_lock);
-		io_schedule();
-		spin_lock_irq(vblk->disk->queue->queue_lock);
-
-		finish_wait(&vblk->queue_wait, &wait);
-	}
-
-	virtqueue_kick(vblk->vq);
-	spin_unlock_irq(vblk->disk->queue->queue_lock);
-}
-
-static void virtblk_bio_send_flush(struct virtblk_req *vbr)
-{
-	vbr->flags |= VBLK_IS_FLUSH;
-	vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
-	vbr->out_hdr.sector = 0;
-	vbr->out_hdr.ioprio = 0;
-
-	virtblk_add_req(vbr, false);
-}
-
-static void virtblk_bio_send_data(struct virtblk_req *vbr)
-{
-	struct virtio_blk *vblk = vbr->vblk;
-	struct bio *bio = vbr->bio;
-	bool have_data;
-
-	vbr->flags &= ~VBLK_IS_FLUSH;
-	vbr->out_hdr.type = 0;
-	vbr->out_hdr.sector = bio->bi_sector;
-	vbr->out_hdr.ioprio = bio_prio(bio);
-
-	if (blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg)) {
-		have_data = true;
-		if (bio->bi_rw & REQ_WRITE)
-			vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
-		else
-			vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
-	} else
-		have_data = false;
-
-	virtblk_add_req(vbr, have_data);
-}
-
-static void virtblk_bio_send_data_work(struct work_struct *work)
-{
-	struct virtblk_req *vbr;
-
-	vbr = container_of(work, struct virtblk_req, work);
-
-	virtblk_bio_send_data(vbr);
-}
-
-static void virtblk_bio_send_flush_work(struct work_struct *work)
-{
-	struct virtblk_req *vbr;
-
-	vbr = container_of(work, struct virtblk_req, work);
-
-	virtblk_bio_send_flush(vbr);
-}
-
 static inline void virtblk_request_done(struct virtblk_req *vbr)
 {
-	struct virtio_blk *vblk = vbr->vblk;
 	struct request *req = vbr->req;
 	int error = virtblk_result(vbr);
 
@@ -231,90 +123,43 @@ static inline void virtblk_request_done(struct virtblk_req *vbr)
 		req->errors = (error != 0);
 	}
 
-	__blk_end_request_all(req, error);
-	mempool_free(vbr, vblk->pool);
-}
-
-static inline void virtblk_bio_flush_done(struct virtblk_req *vbr)
-{
-	struct virtio_blk *vblk = vbr->vblk;
-
-	if (vbr->flags & VBLK_REQ_DATA) {
-		/* Send out the actual write data */
-		INIT_WORK(&vbr->work, virtblk_bio_send_data_work);
-		queue_work(virtblk_wq, &vbr->work);
-	} else {
-		bio_endio(vbr->bio, virtblk_result(vbr));
-		mempool_free(vbr, vblk->pool);
-	}
-}
-
-static inline void virtblk_bio_data_done(struct virtblk_req *vbr)
-{
-	struct virtio_blk *vblk = vbr->vblk;
-
-	if (unlikely(vbr->flags & VBLK_REQ_FUA)) {
-		/* Send out a flush before end the bio */
-		vbr->flags &= ~VBLK_REQ_DATA;
-		INIT_WORK(&vbr->work, virtblk_bio_send_flush_work);
-		queue_work(virtblk_wq, &vbr->work);
-	} else {
-		bio_endio(vbr->bio, virtblk_result(vbr));
-		mempool_free(vbr, vblk->pool);
-	}
-}
-
-static inline void virtblk_bio_done(struct virtblk_req *vbr)
-{
-	if (unlikely(vbr->flags & VBLK_IS_FLUSH))
-		virtblk_bio_flush_done(vbr);
-	else
-		virtblk_bio_data_done(vbr);
+	blk_mq_end_io(req, error);
 }
 
 static void virtblk_done(struct virtqueue *vq)
 {
 	struct virtio_blk *vblk = vq->vdev->priv;
-	bool bio_done = false, req_done = false;
+	bool req_done = false;
 	struct virtblk_req *vbr;
 	unsigned long flags;
 	unsigned int len;
 
-	spin_lock_irqsave(vblk->disk->queue->queue_lock, flags);
+	spin_lock_irqsave(&vblk->vq_lock, flags);
 	do {
 		virtqueue_disable_cb(vq);
 		while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
-			if (vbr->bio) {
-				virtblk_bio_done(vbr);
-				bio_done = true;
-			} else {
-				virtblk_request_done(vbr);
-				req_done = true;
-			}
+			virtblk_request_done(vbr);
+			req_done = true;
 		}
 	} while (!virtqueue_enable_cb(vq));
+	spin_unlock_irqrestore(&vblk->vq_lock, flags);
+
 	/* In case queue is stopped waiting for more buffers. */
 	if (req_done)
-		blk_start_queue(vblk->disk->queue);
-	spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags);
-
-	if (bio_done)
-		wake_up(&vblk->queue_wait);
+		blk_mq_start_stopped_hw_queues(vblk->disk->queue);
 }
 
-static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
-		   struct request *req)
+static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
 {
+	struct virtio_blk *vblk = hctx->queue->queuedata;
+	struct virtblk_req *vbr = req->special;
+	unsigned long flags;
 	unsigned int num;
-	struct virtblk_req *vbr;
+	const bool last = (req->cmd_flags & REQ_END) != 0;
 
-	vbr = virtblk_alloc_req(vblk, GFP_ATOMIC);
-	if (!vbr)
-		/* When another request finishes we'll try again. */
-		return false;
+	BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
 
 	vbr->req = req;
-	vbr->bio = NULL;
 	if (req->cmd_flags & REQ_FLUSH) {
 		vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
 		vbr->out_hdr.sector = 0;
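The rewritten virtblk_done() above keeps the standard virtio completion idiom while narrowing the locking from the whole request_queue lock to the new vq_lock: callbacks are suppressed while buffers are drained, and virtqueue_enable_cb() returns false if the device completed more buffers in the window before the callback was re-armed, so the drain repeats rather than missing a completion. Distilled to just that idiom — handle_buf() is a hypothetical stand-in for virtblk_request_done():

static void handle_buf(struct virtblk_req *vbr, unsigned int len); /* hypothetical */

static void sketch_done(struct virtqueue *vq, spinlock_t *vq_lock)
{
	struct virtblk_req *vbr;
	unsigned long flags;
	unsigned int len;

	spin_lock_irqsave(vq_lock, flags);
	do {
		virtqueue_disable_cb(vq);	/* no re-entry while draining */
		while ((vbr = virtqueue_get_buf(vq, &len)) != NULL)
			handle_buf(vbr, len);
	} while (!virtqueue_enable_cb(vq));	/* false: raced, drain again */
	spin_unlock_irqrestore(vq_lock, flags);
}

Note that blk_mq_start_stopped_hw_queues() is now called after the lock is dropped; restarting dispatch does not touch the virtqueue, so moving it out of the critical section shortens the window in which submission and completion contend for vq_lock.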
@@ -342,7 +187,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 		}
 	}
 
-	num = blk_rq_map_sg(q, vbr->req, vblk->sg);
+	num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg);
 	if (num) {
 		if (rq_data_dir(vbr->req) == WRITE)
 			vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
@@ -350,63 +195,18 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 			vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
 	}
 
-	if (__virtblk_add_req(vblk->vq, vbr, vblk->sg, num) < 0) {
-		mempool_free(vbr, vblk->pool);
-		return false;
-	}
-
-	return true;
-}
-
-static void virtblk_request(struct request_queue *q)
-{
-	struct virtio_blk *vblk = q->queuedata;
-	struct request *req;
-	unsigned int issued = 0;
-
-	while ((req = blk_peek_request(q)) != NULL) {
-		BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
-
-		/* If this request fails, stop queue and wait for something to
-		   finish to restart it. */
-		if (!do_req(q, vblk, req)) {
-			blk_stop_queue(q);
-			break;
-		}
-		blk_start_request(req);
-		issued++;
-	}
-
-	if (issued)
+	spin_lock_irqsave(&vblk->vq_lock, flags);
+	if (__virtblk_add_req(vblk->vq, vbr, vbr->sg, num) < 0) {
+		spin_unlock_irqrestore(&vblk->vq_lock, flags);
+		blk_mq_stop_hw_queue(hctx);
 		virtqueue_kick(vblk->vq);
-}
-
-static void virtblk_make_request(struct request_queue *q, struct bio *bio)
-{
-	struct virtio_blk *vblk = q->queuedata;
-	struct virtblk_req *vbr;
-
-	BUG_ON(bio->bi_phys_segments + 2 > vblk->sg_elems);
-
-	vbr = virtblk_alloc_req(vblk, GFP_NOIO);
-	if (!vbr) {
-		bio_endio(bio, -ENOMEM);
-		return;
+		return BLK_MQ_RQ_QUEUE_BUSY;
 	}
+	spin_unlock_irqrestore(&vblk->vq_lock, flags);
 
-	vbr->bio = bio;
-	vbr->flags = 0;
-	if (bio->bi_rw & REQ_FLUSH)
-		vbr->flags |= VBLK_REQ_FLUSH;
-	if (bio->bi_rw & REQ_FUA)
-		vbr->flags |= VBLK_REQ_FUA;
-	if (bio->bi_size)
-		vbr->flags |= VBLK_REQ_DATA;
-
-	if (unlikely(vbr->flags & VBLK_REQ_FLUSH))
-		virtblk_bio_send_flush(vbr);
-	else
-		virtblk_bio_send_data(vbr);
+	if (last)
+		virtqueue_kick(vblk->vq);
+	return BLK_MQ_RQ_QUEUE_OK;
 }
 
 /* return id (s/n) string for *disk to *id_str
@@ -680,12 +480,35 @@ static const struct device_attribute dev_attr_cache_type_rw =
 	__ATTR(cache_type, S_IRUGO|S_IWUSR,
 	       virtblk_cache_type_show, virtblk_cache_type_store);
 
+static struct blk_mq_ops virtio_mq_ops = {
+	.queue_rq	= virtio_queue_rq,
+	.map_queue	= blk_mq_map_queue,
+	.alloc_hctx	= blk_mq_alloc_single_hw_queue,
+	.free_hctx	= blk_mq_free_single_hw_queue,
+};
+
+static struct blk_mq_reg virtio_mq_reg = {
+	.ops		= &virtio_mq_ops,
+	.nr_hw_queues	= 1,
+	.queue_depth	= 64,
+	.numa_node	= NUMA_NO_NODE,
+	.flags		= BLK_MQ_F_SHOULD_MERGE,
+};
+
+static void virtblk_init_vbr(void *data, struct blk_mq_hw_ctx *hctx,
+			     struct request *rq, unsigned int nr)
+{
+	struct virtio_blk *vblk = data;
+	struct virtblk_req *vbr = rq->special;
+
+	sg_init_table(vbr->sg, vblk->sg_elems);
+}
+
 static int virtblk_probe(struct virtio_device *vdev)
 {
 	struct virtio_blk *vblk;
 	struct request_queue *q;
 	int err, index;
-	int pool_size;
 
 	u64 cap;
 	u32 v, blk_size, sg_elems, opt_io_size;
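Before the probe changes, a note on the dispatch contract the code above relies on. Returning BLK_MQ_RQ_QUEUE_OK from ->queue_rq() transfers ownership of the request to the driver; BLK_MQ_RQ_QUEUE_BUSY hands it back to the core for requeueing, with blk_mq_stop_hw_queue() pausing dispatch until a later completion restarts it. The skeleton below sketches that contract as of this kernel; my_dev, my_submit() and my_notify() are hypothetical placeholders, and only the blk-mq calls and return codes are taken from the patch.

static int sketch_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
{
	struct my_dev *dev = hctx->queue->queuedata;	/* hypothetical device */
	unsigned long flags;

	spin_lock_irqsave(&dev->lock, flags);
	if (my_submit(dev, req) < 0) {			/* e.g. ring is full */
		spin_unlock_irqrestore(&dev->lock, flags);
		blk_mq_stop_hw_queue(hctx);	/* pause dispatch... */
		return BLK_MQ_RQ_QUEUE_BUSY;	/* ...core requeues req */
	}
	spin_unlock_irqrestore(&dev->lock, flags);

	if (req->cmd_flags & REQ_END)	/* last request of this dispatch batch */
		my_notify(dev);		/* one doorbell per batch, cf. virtqueue_kick() */
	return BLK_MQ_RQ_QUEUE_OK;
}

virtio_queue_rq() additionally kicks the device on the busy path, so requests already sitting in the ring still make progress and eventually raise the completion that restarts the stopped queue.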
@@ -709,17 +532,14 @@ static int virtblk_probe(struct virtio_device *vdev)
 	/* We need an extra sg elements at head and tail. */
 	sg_elems += 2;
 
-	vdev->priv = vblk = kmalloc(sizeof(*vblk) +
-				    sizeof(vblk->sg[0]) * sg_elems, GFP_KERNEL);
+	vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
 	if (!vblk) {
 		err = -ENOMEM;
 		goto out_free_index;
 	}
 
-	init_waitqueue_head(&vblk->queue_wait);
 	vblk->vdev = vdev;
 	vblk->sg_elems = sg_elems;
-	sg_init_table(vblk->sg, vblk->sg_elems);
 
 	mutex_init(&vblk->config_lock);
 	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
@@ -728,31 +548,27 @@ static int virtblk_probe(struct virtio_device *vdev)
 	err = init_vq(vblk);
 	if (err)
 		goto out_free_vblk;
-
-	pool_size = sizeof(struct virtblk_req);
-	if (use_bio)
-		pool_size += sizeof(struct scatterlist) * sg_elems;
-	vblk->pool = mempool_create_kmalloc_pool(1, pool_size);
-	if (!vblk->pool) {
-		err = -ENOMEM;
-		goto out_free_vq;
-	}
+	spin_lock_init(&vblk->vq_lock);
 
 	/* FIXME: How many partitions?  How long is a piece of string? */
 	vblk->disk = alloc_disk(1 << PART_BITS);
 	if (!vblk->disk) {
 		err = -ENOMEM;
-		goto out_mempool;
+		goto out_free_vq;
 	}
 
-	q = vblk->disk->queue = blk_init_queue(virtblk_request, NULL);
+	virtio_mq_reg.cmd_size =
+		sizeof(struct virtblk_req) +
+		sizeof(struct scatterlist) * sg_elems;
+
+	q = vblk->disk->queue = blk_mq_init_queue(&virtio_mq_reg, vblk);
 	if (!q) {
 		err = -ENOMEM;
 		goto out_put_disk;
 	}
 
-	if (use_bio)
-		blk_queue_make_request(q, virtblk_make_request);
+	blk_mq_init_commands(q, virtblk_init_vbr, vblk);
+
 	q->queuedata = vblk;
 
 	virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
@@ -857,8 +673,6 @@ out_del_disk:
 	blk_cleanup_queue(vblk->disk->queue);
 out_put_disk:
 	put_disk(vblk->disk);
-out_mempool:
-	mempool_destroy(vblk->pool);
 out_free_vq:
 	vdev->config->del_vqs(vdev);
 out_free_vblk:
@@ -890,7 +704,6 @@ static void virtblk_remove(struct virtio_device *vdev)
 	refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
 	put_disk(vblk->disk);
-	mempool_destroy(vblk->pool);
 	vdev->config->del_vqs(vdev);
 	kfree(vblk);
 
@@ -914,10 +727,7 @@ static int virtblk_freeze(struct virtio_device *vdev)
 
 	flush_work(&vblk->config_work);
 
-	spin_lock_irq(vblk->disk->queue->queue_lock);
-	blk_stop_queue(vblk->disk->queue);
-	spin_unlock_irq(vblk->disk->queue->queue_lock);
-	blk_sync_queue(vblk->disk->queue);
+	blk_mq_stop_hw_queues(vblk->disk->queue);
 
 	vdev->config->del_vqs(vdev);
 	return 0;
@@ -930,11 +740,9 @@ static int virtblk_restore(struct virtio_device *vdev)
 	vblk->config_enable = true;
 	ret = init_vq(vdev->priv);
-	if (!ret) {
-		spin_lock_irq(vblk->disk->queue->queue_lock);
-		blk_start_queue(vblk->disk->queue);
-		spin_unlock_irq(vblk->disk->queue->queue_lock);
-	}
+	if (!ret)
+		blk_mq_start_stopped_hw_queues(vblk->disk->queue);
+
 	return ret;
 }
 #endif
--
cgit v1.2.3
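The cmd_size arithmetic in the probe path is what makes both the old per-device sg table and the bio-path mempool unnecessary: blk-mq preallocates queue_depth requests at init time, each trailed by cmd_size bytes of driver data, which this kernel version hands back through rq->special. A sketch of the idea under that assumption; example_cmd and example_init_cmd() are illustrative stand-ins for struct virtblk_req and virtblk_init_vbr().

struct example_cmd {			/* stand-in for struct virtblk_req */
	struct request *req;
	struct virtio_blk_outhdr out_hdr;
	u8 status;
	struct scatterlist sg[];	/* flexible tail covered by cmd_size */
};

/* Passed to blk_mq_init_commands(): runs once per preallocated request,
 * so the scatterlist is set up at probe time instead of on every I/O. */
static void example_init_cmd(void *data, struct blk_mq_hw_ctx *hctx,
			     struct request *rq, unsigned int nr)
{
	struct virtio_blk *vblk = data;
	struct example_cmd *cmd = rq->special;

	sg_init_table(cmd->sg, vblk->sg_elems);
}

Because every in-flight request now owns its own scatterlist, submissions no longer serialize on a single shared sg table, which is what allowed the GFP_ATOMIC mempool and the wait-queue retry loop to go away.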