31 files changed, 1113 insertions, 917 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 9a06fe883766..95ad936e6048 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -254,16 +254,6 @@ config DM_THIN_PROVISIONING
        ---help---
          Provides thin provisioning and snapshots that share a data store.
 
-config DM_DEBUG_BLOCK_STACK_TRACING
-	boolean "Keep stack trace of persistent data block lock holders"
-	depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
-	select STACKTRACE
-	---help---
-	  Enable this for messages that may help debug problems with the
-	  block manager locking used by thin provisioning and caching.
-
-	  If unsure, say N.
-
 config DM_CACHE
        tristate "Cache target (EXPERIMENTAL)"
        depends on BLK_DEV_DM
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 2638417b19aa..4d200883c505 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -24,11 +24,3 @@ config BCACHE_CLOSURES_DEBUG
 	Keeps all active closures in a linked list and provides a debugfs
 	interface to list them, which makes it possible to see asynchronous
 	operations that get stuck.
-
-# cgroup code needs to be updated:
-#
-#config CGROUP_BCACHE
-#	bool "Cgroup controls for bcache"
-#	depends on BCACHE && BLK_CGROUP
-#	---help---
-#	TODO
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index c0d37d082443..443d03fbac47 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -78,12 +78,6 @@ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
 	ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b));
 	WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX);
 
-	if (CACHE_SYNC(&ca->set->sb)) {
-		ca->need_save_prio = max(ca->need_save_prio,
-					 bucket_disk_gen(b));
-		WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX);
-	}
-
 	return ret;
 }
 
@@ -120,51 +114,45 @@ void bch_rescale_priorities(struct cache_set *c, int sectors)
 	mutex_unlock(&c->bucket_lock);
 }
 
-/* Allocation */
+/*
+ * Background allocation thread: scans for buckets to be invalidated,
+ * invalidates them, rewrites prios/gens (marking them as invalidated on disk),
+ * then optionally issues discard commands to the newly free buckets, then puts
+ * them on the various freelists.
+ */
 
 static inline bool can_inc_bucket_gen(struct bucket *b)
 {
-	return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX &&
-		bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX;
+	return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX;
 }
 
-bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
+bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
 {
-	BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
-
-	if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) {
-		unsigned i;
-
-		for (i = 0; i < RESERVE_NONE; i++)
-			if (!fifo_full(&ca->free[i]))
-				goto add;
+	BUG_ON(!ca->set->gc_mark_valid);
 
-		return false;
-	}
-add:
-	b->prio = 0;
-
-	if (can_inc_bucket_gen(b) &&
-	    fifo_push(&ca->unused, b - ca->buckets)) {
-		atomic_inc(&b->pin);
-		return true;
-	}
-
-	return false;
-}
-
-static bool can_invalidate_bucket(struct cache *ca, struct bucket *b)
-{
-	return GC_MARK(b) == GC_MARK_RECLAIMABLE &&
+	return (!GC_MARK(b) ||
+		GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
 		!atomic_read(&b->pin) &&
 		can_inc_bucket_gen(b);
 }
 
-static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
+void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
 {
+	lockdep_assert_held(&ca->set->bucket_lock);
+	BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE);
+
+	if (GC_SECTORS_USED(b))
+		trace_bcache_invalidate(ca, b - ca->buckets);
+
 	bch_inc_gen(ca, b);
 	b->prio = INITIAL_PRIO;
 	atomic_inc(&b->pin);
+}
+
+static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
+{
+	__bch_invalidate_one_bucket(ca, b);
+
 	fifo_push(&ca->free_inc, b - ca->buckets);
 }
 
@@ -195,20 +183,7 @@ static void invalidate_buckets_lru(struct cache *ca)
 	ca->heap.used = 0;
 
 	for_each_bucket(b, ca) {
-		/*
-		 * If we fill up the unused list, if we then return before
-		 * adding anything to the free_inc list we'll skip writing
-		 * prios/gens and just go back to allocating from the unused
-		 * list:
-		 */
-		if (fifo_full(&ca->unused))
-			return;
-
-		if (!can_invalidate_bucket(ca, b))
-			continue;
-
-		if (!GC_SECTORS_USED(b) &&
-		    bch_bucket_add_unused(ca, b))
+		if (!bch_can_invalidate_bucket(ca, b))
 			continue;
 
 		if (!heap_full(&ca->heap))
@@ -233,7 +208,7 @@ static void invalidate_buckets_lru(struct cache *ca)
 			return;
 		}
 
-		invalidate_one_bucket(ca, b);
+		bch_invalidate_one_bucket(ca, b);
 	}
 }
 
@@ -249,8 +224,8 @@ static void invalidate_buckets_fifo(struct cache *ca)
 
 		b = ca->buckets + ca->fifo_last_bucket++;
 
-		if (can_invalidate_bucket(ca, b))
-			invalidate_one_bucket(ca, b);
+		if (bch_can_invalidate_bucket(ca, b))
+			bch_invalidate_one_bucket(ca, b);
 
 		if (++checked >= ca->sb.nbuckets) {
 			ca->invalidate_needs_gc = 1;
@@ -274,8 +249,8 @@ static void invalidate_buckets_random(struct cache *ca)
 
 		b = ca->buckets + n;
 
-		if (can_invalidate_bucket(ca, b))
-			invalidate_one_bucket(ca, b);
+		if (bch_can_invalidate_bucket(ca, b))
+			bch_invalidate_one_bucket(ca, b);
 
 		if (++checked >= ca->sb.nbuckets / 2) {
 			ca->invalidate_needs_gc = 1;
@@ -287,8 +262,7 @@ static void invalidate_buckets_random(struct cache *ca)
 
 static void invalidate_buckets(struct cache *ca)
 {
-	if (ca->invalidate_needs_gc)
-		return;
+	BUG_ON(ca->invalidate_needs_gc);
 
 	switch (CACHE_REPLACEMENT(&ca->sb)) {
 	case CACHE_REPLACEMENT_LRU:
@@ -301,8 +275,6 @@ static void invalidate_buckets(struct cache *ca)
 		invalidate_buckets_random(ca);
 		break;
 	}
-
-	trace_bcache_alloc_invalidate(ca);
 }
 
 #define allocator_wait(ca, cond)					\
@@ -350,17 +322,10 @@ static int bch_allocator_thread(void *arg)
 		 * possibly issue discards to them, then we add the bucket to
 		 * the free list:
 		 */
-		while (1) {
+		while (!fifo_empty(&ca->free_inc)) {
 			long bucket;
 
-			if ((!atomic_read(&ca->set->prio_blocked) ||
-			     !CACHE_SYNC(&ca->set->sb)) &&
-			    !fifo_empty(&ca->unused))
-				fifo_pop(&ca->unused, bucket);
-			else if (!fifo_empty(&ca->free_inc))
-				fifo_pop(&ca->free_inc, bucket);
-			else
-				break;
+			fifo_pop(&ca->free_inc, bucket);
 
 			if (ca->discard) {
 				mutex_unlock(&ca->set->bucket_lock);
@@ -371,6 +336,7 @@ static int bch_allocator_thread(void *arg)
 			}
 
 			allocator_wait(ca, bch_allocator_push(ca, bucket));
+			wake_up(&ca->set->btree_cache_wait);
 			wake_up(&ca->set->bucket_wait);
 		}
 
@@ -380,9 +346,9 @@ static int bch_allocator_thread(void *arg)
 		 * them to the free_inc list:
 		 */
 
+retry_invalidate:
 		allocator_wait(ca, ca->set->gc_mark_valid &&
-			       (ca->need_save_prio > 64 ||
-				!ca->invalidate_needs_gc));
+			       !ca->invalidate_needs_gc);
 		invalidate_buckets(ca);
 
 		/*
@@ -390,13 +356,28 @@ static int bch_allocator_thread(void *arg)
 		 * new stuff to them:
 		 */
 		allocator_wait(ca, !atomic_read(&ca->set->prio_blocked));
-		if (CACHE_SYNC(&ca->set->sb) &&
-		    (!fifo_empty(&ca->free_inc) ||
-		     ca->need_save_prio > 64))
+		if (CACHE_SYNC(&ca->set->sb)) {
+			/*
+			 * This could deadlock if an allocation with a btree
+			 * node locked ever blocked - having the btree node
+			 * locked would block garbage collection, but here we're
+			 * waiting on garbage collection before we invalidate
+			 * and free anything.
+			 *
+			 * But this should be safe since the btree code always
+			 * uses btree_check_reserve() before allocating now, and
+			 * if it fails it blocks without btree nodes locked.
+			 */
+			if (!fifo_full(&ca->free_inc))
+				goto retry_invalidate;
+
 			bch_prio_write(ca);
+		}
 	}
 }
 
+/* Allocation */
+
 long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
 {
 	DEFINE_WAIT(w);
@@ -408,8 +389,10 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
 	    fifo_pop(&ca->free[reserve], r))
 		goto out;
 
-	if (!wait)
+	if (!wait) {
+		trace_bcache_alloc_fail(ca, reserve);
 		return -1;
+	}
 
 	do {
 		prepare_to_wait(&ca->set->bucket_wait, &w,
@@ -425,6 +408,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
 out:
 	wake_up_process(ca->alloc_thread);
 
+	trace_bcache_alloc(ca, reserve);
+
 	if (expensive_debug_checks(ca->set)) {
 		size_t iter;
 		long i;
@@ -438,8 +423,6 @@ out:
 				BUG_ON(i == r);
 		fifo_for_each(i, &ca->free_inc, iter)
 			BUG_ON(i == r);
-		fifo_for_each(i, &ca->unused, iter)
-			BUG_ON(i == r);
 	}
 
 	b = ca->buckets + r;
@@ -461,17 +444,19 @@ out:
 	return r;
 }
 
+void __bch_bucket_free(struct cache *ca, struct bucket *b)
+{
+	SET_GC_MARK(b, 0);
+	SET_GC_SECTORS_USED(b, 0);
+}
+
 void bch_bucket_free(struct cache_set *c, struct bkey *k)
 {
 	unsigned i;
 
-	for (i = 0; i < KEY_PTRS(k); i++) {
-		struct bucket *b = PTR_BUCKET(c, k, i);
-
-		SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
-		SET_GC_SECTORS_USED(b, 0);
-		bch_bucket_add_unused(PTR_CACHE(c, k, i), b);
-	}
+	for (i = 0; i < KEY_PTRS(k); i++)
+		__bch_bucket_free(PTR_CACHE(c, k, i),
+				  PTR_BUCKET(c, k, i));
 }
 
 int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
@@ -709,25 +694,3 @@ int bch_cache_allocator_start(struct cache *ca)
 	ca->alloc_thread = k;
 	return 0;
 }
-
-int bch_cache_allocator_init(struct cache *ca)
-{
-	/*
-	 * Reserve:
-	 * Prio/gen writes first
-	 * Then 8 for btree allocations
-	 * Then half for the moving garbage collector
-	 */
-#if 0
-	ca->watermark[WATERMARK_PRIO] = 0;
-
-	ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
-
-	ca->watermark[WATERMARK_MOVINGGC] = 8 +
-		ca->watermark[WATERMARK_METADATA];
-
-	ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
-		ca->watermark[WATERMARK_MOVINGGC];
-#endif
-	return 0;
-}
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index a4c7306ff43d..82c9c5d35251 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -195,9 +195,7 @@ struct bucket {
 	atomic_t	pin;
 	uint16_t	prio;
 	uint8_t		gen;
-	uint8_t		disk_gen;
 	uint8_t		last_gc; /* Most out of date gen in the btree */
-	uint8_t		gc_gen;
 	uint16_t	gc_mark; /* Bitfield used by GC. See below for field */
 };
 
@@ -207,9 +205,9 @@ struct bucket {
  */
 
 BITMASK(GC_MARK,	 struct bucket, gc_mark, 0, 2);
-#define GC_MARK_RECLAIMABLE	0
-#define GC_MARK_DIRTY		1
-#define GC_MARK_METADATA	2
+#define GC_MARK_RECLAIMABLE	1
+#define GC_MARK_DIRTY		2
+#define GC_MARK_METADATA	3
 #define GC_SECTORS_USED_SIZE	13
 #define MAX_GC_SECTORS_USED	(~(~0ULL << GC_SECTORS_USED_SIZE))
 BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE);
@@ -426,14 +424,9 @@ struct cache {
 	 * their new gen to disk. After prio_write() finishes writing the new
 	 * gens/prios, they'll be moved to the free list (and possibly discarded
 	 * in the process)
-	 *
-	 * unused: GC found nothing pointing into these buckets (possibly
-	 * because all the data they contained was overwritten), so we only
-	 * need to discard them before they can be moved to the free list.
 	 */
 	DECLARE_FIFO(long, free)[RESERVE_NR];
 	DECLARE_FIFO(long, free_inc);
-	DECLARE_FIFO(long, unused);
 
 	size_t			fifo_last_bucket;
 
@@ -443,12 +436,6 @@ struct cache {
 	DECLARE_HEAP(struct bucket *, heap);
 
 	/*
-	 * max(gen - disk_gen) for all buckets. When it gets too big we have to
-	 * call prio_write() to keep gens from wrapping.
-	 */
-	uint8_t			need_save_prio;
-
-	/*
 	 * If nonzero, we know we aren't going to find any buckets to invalidate
 	 * until a gc finishes - otherwise we could pointlessly burn a ton of
 	 * cpu
@@ -562,19 +549,16 @@ struct cache_set {
 	struct list_head	btree_cache_freed;
 
 	/* Number of elements in btree_cache + btree_cache_freeable lists */
-	unsigned		bucket_cache_used;
+	unsigned		btree_cache_used;
 
 	/*
 	 * If we need to allocate memory for a new btree node and that
 	 * allocation fails, we can cannibalize another node in the btree cache
-	 * to satisfy the allocation. However, only one thread can be doing this
-	 * at a time, for obvious reasons - try_harder and try_wait are
-	 * basically a lock for this that we can wait on asynchronously. The
-	 * btree_root() macro releases the lock when it returns.
+	 * to satisfy the allocation - lock to guarantee only one thread does
+	 * this at a time:
 	 */
-	struct task_struct	*try_harder;
-	wait_queue_head_t	try_wait;
-	uint64_t		try_harder_start;
+	wait_queue_head_t	btree_cache_wait;
+	struct task_struct	*btree_cache_alloc_lock;
 
 	/*
 	 * When we free a btree node, we increment the gen of the bucket the
@@ -603,7 +587,7 @@ struct cache_set {
 	uint16_t		min_prio;
 
 	/*
-	 * max(gen - gc_gen) for all buckets. When it gets too big we have to gc
+	 * max(gen - last_gc) for all buckets. When it gets too big we have to gc
 	 * to keep gens from wrapping around.
 	 */
 	uint8_t			need_gc;
@@ -628,6 +612,8 @@ struct cache_set {
 	/* Number of moving GC bios in flight */
 	struct semaphore	moving_in_flight;
 
+	struct workqueue_struct	*moving_gc_wq;
+
 	struct btree		*root;
 
 #ifdef CONFIG_BCACHE_DEBUG
@@ -667,7 +653,6 @@ struct cache_set {
 	struct time_stats	btree_gc_time;
 	struct time_stats	btree_split_time;
 	struct time_stats	btree_read_time;
-	struct time_stats	try_harder_time;
 
 	atomic_long_t		cache_read_races;
 	atomic_long_t		writeback_keys_done;
@@ -850,9 +835,6 @@ static inline bool cached_dev_get(struct cached_dev *dc)
 /*
  * bucket_gc_gen() returns the difference between the bucket's current gen and
  * the oldest gen of any pointer into that bucket in the btree (last_gc).
- *
- * bucket_disk_gen() returns the difference between the current gen and the gen
- * on disk; they're both used to make sure gens don't wrap around.
  */
 
 static inline uint8_t bucket_gc_gen(struct bucket *b)
@@ -860,13 +842,7 @@ static inline uint8_t bucket_gc_gen(struct bucket *b)
 	return b->gen - b->last_gc;
 }
 
-static inline uint8_t bucket_disk_gen(struct bucket *b)
-{
-	return b->gen - b->disk_gen;
-}
-
 #define BUCKET_GC_GEN_MAX	96U
-#define BUCKET_DISK_GEN_MAX	64U
 
 #define kobj_attribute_write(n, fn)					\
 	static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn)
@@ -899,11 +875,14 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
 
 uint8_t bch_inc_gen(struct cache *, struct bucket *);
 void bch_rescale_priorities(struct cache_set *, int);
-bool bch_bucket_add_unused(struct cache *, struct bucket *);
 
-long bch_bucket_alloc(struct cache *, unsigned, bool);
+bool bch_can_invalidate_bucket(struct cache *, struct bucket *);
+void __bch_invalidate_one_bucket(struct cache *, struct bucket *);
+
+void __bch_bucket_free(struct cache *, struct bucket *);
 void bch_bucket_free(struct cache_set *, struct bkey *);
 
+long bch_bucket_alloc(struct cache *, unsigned, bool);
 int __bch_bucket_alloc_set(struct cache_set *, unsigned,
 			   struct bkey *, int, bool);
 int bch_bucket_alloc_set(struct cache_set *, unsigned,
@@ -954,13 +933,10 @@ int bch_open_buckets_alloc(struct cache_set *);
 void bch_open_buckets_free(struct cache_set *);
 
 int bch_cache_allocator_start(struct cache *ca);
-int bch_cache_allocator_init(struct cache *ca);
 
 void bch_debug_exit(void);
 int bch_debug_init(struct kobject *);
 void bch_request_exit(void);
 int bch_request_init(void);
-void bch_btree_exit(void);
-int bch_btree_init(void);
 
 #endif /* _BCACHE_H */
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 3f74b4b0747b..545416415305 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -23,8 +23,8 @@ void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
 	for (k = i->start; k < bset_bkey_last(i); k = next) {
 		next = bkey_next(k);
 
-		printk(KERN_ERR "block %u key %li/%u: ", set,
-		       (uint64_t *) k - i->d, i->keys);
+		printk(KERN_ERR "block %u key %u/%u: ", set,
+		       (unsigned) ((u64 *) k - i->d), i->keys);
 
 		if (b->ops->key_dump)
 			b->ops->key_dump(b, k);
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index 003260f4ddf6..5f6728d5d4dd 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -478,6 +478,12 @@ static inline void bch_keylist_init(struct keylist *l)
 	l->top_p = l->keys_p = l->inline_keys;
 }
 
+static inline void bch_keylist_init_single(struct keylist *l, struct bkey *k)
+{
+	l->keys = k;
+	l->top = bkey_next(k);
+}
+
 static inline void bch_keylist_push(struct keylist *l)
 {
 	l->top = bkey_next(l->top);
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 5f9c2a665ca5..7347b6100961 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -68,15 +68,11 @@
  * alloc_bucket() cannot fail. This should be true but is not completely
  * obvious.
  *
- * Make sure all allocations get charged to the root cgroup
- *
  * Plugging?
  *
  * If data write is less than hard sector size of ssd, round up offset in open
  * bucket to the next whole sector
  *
- * Also lookup by cgroup in get_open_bucket()
- *
  * Superblock needs to be fleshed out for multiple cache devices
  *
  * Add a sysfs tunable for the number of writeback IOs in flight
@@ -97,8 +93,6 @@
 #define PTR_HASH(c, k)							\
 	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
 
-static struct workqueue_struct *btree_io_wq;
-
 #define insert_lock(s, b)	((b)->level <= (s)->lock)
 
 /*
@@ -123,7 +117,7 @@ static struct workqueue_struct *btree_io_wq;
 ({									\
 	int _r, l = (b)->level - 1;					\
 	bool _w = l <= (op)->lock;					\
-	struct btree *_child = bch_btree_node_get((b)->c, key, l, _w);	\
+	struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\
 	if (!IS_ERR(_child)) {						\
 		_child->parent = (b);					\
 		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
@@ -152,17 +146,12 @@ static struct workqueue_struct *btree_io_wq;
 			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
 		}							\
 		rw_unlock(_w, _b);					\
+		bch_cannibalize_unlock(c);				\
 		if (_r == -EINTR)					\
 			schedule();					\
-		bch_cannibalize_unlock(c);				\
-		if (_r == -ENOSPC) {					\
-			wait_event((c)->try_wait,			\
-				   !(c)->try_harder);			\
-			_r = -EINTR;					\
-		}							\
 	} while (_r == -EINTR);						\
 									\
-	finish_wait(&(c)->bucket_wait, &(op)->wait);			\
+	finish_wait(&(c)->btree_cache_wait, &(op)->wait);		\
 	_r;								\
 })
 
@@ -171,6 +160,20 @@ static inline struct bset *write_block(struct btree *b)
 	return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
 }
 
+static void bch_btree_init_next(struct btree *b)
+{
+	/* If not a leaf node, always sort */
+	if (b->level && b->keys.nsets)
+		bch_btree_sort(&b->keys, &b->c->sort);
+	else
+		bch_btree_sort_lazy(&b->keys, &b->c->sort);
+
+	if (b->written < btree_blocks(b))
+		bch_bset_init_next(&b->keys, write_block(b),
+				   bset_magic(&b->c->sb));
+
+}
+
 /* Btree key manipulation */
 
 void bkey_put(struct cache_set *c, struct bkey *k)
@@ -352,8 +355,7 @@ static void __btree_node_write_done(struct closure *cl)
 	btree_complete_write(b, w);
 
 	if (btree_node_dirty(b))
-		queue_delayed_work(btree_io_wq, &b->work,
-				   msecs_to_jiffies(30000));
+		schedule_delayed_work(&b->work, 30 * HZ);
 
 	closure_return_with_destructor(cl, btree_node_write_unlock);
 }
@@ -442,10 +444,12 @@ static void do_btree_node_write(struct btree *b)
 	}
 }
 
-void bch_btree_node_write(struct btree *b, struct closure *parent)
+void __bch_btree_node_write(struct btree *b, struct closure *parent)
 {
 	struct bset *i = btree_bset_last(b);
 
+	lockdep_assert_held(&b->write_lock);
+
 	trace_bcache_btree_write(b);
 
 	BUG_ON(current->bio_list);
@@ -469,23 +473,24 @@ void bch_btree_node_write(struct btree *b, struct closure *parent)
 			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
 
 	b->written += set_blocks(i, block_bytes(b->c));
+}
 
-	/* If not a leaf node, always sort */
-	if (b->level && b->keys.nsets)
-		bch_btree_sort(&b->keys, &b->c->sort);
-	else
-		bch_btree_sort_lazy(&b->keys, &b->c->sort);
+void bch_btree_node_write(struct btree *b, struct closure *parent)
+{
+	unsigned nsets = b->keys.nsets;
+
+	lockdep_assert_held(&b->lock);
+
+	__bch_btree_node_write(b, parent);
 
 	/*
 	 * do verify if there was more than one set initially (i.e. we did a
 	 * sort) and we sorted down to a single set:
 	 */
-	if (i != b->keys.set->data && !b->keys.nsets)
+	if (nsets && !b->keys.nsets)
 		bch_btree_verify(b);
 
-	if (b->written < btree_blocks(b))
-		bch_bset_init_next(&b->keys, write_block(b),
-				   bset_magic(&b->c->sb));
+	bch_btree_init_next(b);
 }
 
 static void bch_btree_node_write_sync(struct btree *b)
@@ -493,7 +498,11 @@ static void bch_btree_node_write_sync(struct btree *b)
 	struct closure cl;
 
 	closure_init_stack(&cl);
+
+	mutex_lock(&b->write_lock);
 	bch_btree_node_write(b, &cl);
+	mutex_unlock(&b->write_lock);
+
 	closure_sync(&cl);
 }
 
@@ -501,11 +510,10 @@ static void btree_node_write_work(struct work_struct *w)
 {
 	struct btree *b = container_of(to_delayed_work(w), struct btree, work);
 
-	rw_lock(true, b, b->level);
-
+	mutex_lock(&b->write_lock);
 	if (btree_node_dirty(b))
-		bch_btree_node_write(b, NULL);
-	rw_unlock(true, b);
+		__bch_btree_node_write(b, NULL);
+	mutex_unlock(&b->write_lock);
 }
 
 static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
@@ -513,11 +521,13 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
 	struct bset *i = btree_bset_last(b);
 	struct btree_write *w = btree_current_write(b);
 
+	lockdep_assert_held(&b->write_lock);
+
 	BUG_ON(!b->written);
 	BUG_ON(!i->keys);
 
 	if (!btree_node_dirty(b))
-		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
+		schedule_delayed_work(&b->work, 30 * HZ);
 
 	set_btree_node_dirty(b);
 
@@ -548,7 +558,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
 #define mca_reserve(c)	(((c->root && c->root->level)		\
 			  ? c->root->level : 1) * 8 + 16)
 #define mca_can_free(c)						\
-	max_t(int, 0, c->bucket_cache_used - mca_reserve(c))
+	max_t(int, 0, c->btree_cache_used - mca_reserve(c))
 
 static void mca_data_free(struct btree *b)
 {
@@ -556,7 +566,7 @@ static void mca_data_free(struct btree *b)
 
 	bch_btree_keys_free(&b->keys);
 
-	b->c->bucket_cache_used--;
+	b->c->btree_cache_used--;
 	list_move(&b->list, &b->c->btree_cache_freed);
 }
 
@@ -581,7 +591,7 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
 					ilog2(b->c->btree_pages),
 					btree_order(k)),
 				  gfp)) {
-		b->c->bucket_cache_used++;
+		b->c->btree_cache_used++;
 		list_move(&b->list, &b->c->btree_cache);
 	} else {
 		list_move(&b->list, &b->c->btree_cache_freed);
@@ -597,6 +607,8 @@ static struct btree *mca_bucket_alloc(struct cache_set *c,
 
 	init_rwsem(&b->lock);
 	lockdep_set_novalidate_class(&b->lock);
+	mutex_init(&b->write_lock);
+	lockdep_set_novalidate_class(&b->write_lock);
 	INIT_LIST_HEAD(&b->list);
 	INIT_DELAYED_WORK(&b->work, btree_node_write_work);
 	b->c = c;
@@ -630,8 +642,12 @@ static int mca_reap(struct btree *b, unsigned min_order, bool flush)
 		up(&b->io_mutex);
 	}
 
+	mutex_lock(&b->write_lock);
 	if (btree_node_dirty(b))
-		bch_btree_node_write_sync(b);
+		__bch_btree_node_write(b, &cl);
+	mutex_unlock(&b->write_lock);
+
+	closure_sync(&cl);
 
 	/* wait for any in flight btree write */
 	down(&b->io_mutex);
@@ -654,7 +670,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
 	if (c->shrinker_disabled)
 		return SHRINK_STOP;
 
-	if (c->try_harder)
+	if (c->btree_cache_alloc_lock)
 		return SHRINK_STOP;
 
 	/* Return -1 if we can't do anything right now */
@@ -686,7 +702,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
 		}
 	}
 
-	for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
+	for (i = 0; (nr--) && i < c->btree_cache_used; i++) {
 		if (list_empty(&c->btree_cache))
 			goto out;
 
@@ -715,7 +731,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink,
 	if (c->shrinker_disabled)
 		return 0;
 
-	if (c->try_harder)
+	if (c->btree_cache_alloc_lock)
 		return 0;
 
 	return mca_can_free(c) * c->btree_pages;
@@ -819,17 +835,30 @@ out:
 	return b;
 }
 
-static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
+static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
+{
+	struct task_struct *old;
+
+	old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
+	if (old && old != current) {
+		if (op)
+			prepare_to_wait(&c->btree_cache_wait, &op->wait,
+					TASK_UNINTERRUPTIBLE);
+		return -EINTR;
+	}
+
+	return 0;
+}
+
+static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
+				     struct bkey *k)
 {
 	struct btree *b;
 
 	trace_bcache_btree_cache_cannibalize(c);
 
-	if (!c->try_harder) {
-		c->try_harder = current;
-		c->try_harder_start = local_clock();
-	} else if (c->try_harder != current)
-		return ERR_PTR(-ENOSPC);
+	if (mca_cannibalize_lock(c, op))
+		return ERR_PTR(-EINTR);
 
 	list_for_each_entry_reverse(b, &c->btree_cache, list)
 		if (!mca_reap(b, btree_order(k), false))
@@ -839,6 +868,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
 		if (!mca_reap(b, btree_order(k), true))
 			return b;
 
+	WARN(1, "btree cache cannibalize failed\n");
 	return ERR_PTR(-ENOMEM);
 }
 
@@ -850,14 +880,14 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
  */
 static void bch_cannibalize_unlock(struct cache_set *c)
 {
-	if (c->try_harder == current) {
-		bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
-		c->try_harder = NULL;
-		wake_up(&c->try_wait);
+	if (c->btree_cache_alloc_lock == current) {
+		c->btree_cache_alloc_lock = NULL;
+		wake_up(&c->btree_cache_wait);
 	}
 }
 
-static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
+static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
+			       struct bkey *k, int level)
 {
 	struct btree *b;
 
@@ -920,7 +950,7 @@ err:
 	if (b)
 		rw_unlock(true, b);
 
-	b = mca_cannibalize(c, k);
+	b = mca_cannibalize(c, op, k);
 	if (!IS_ERR(b))
 		goto out;
 
@@ -936,8 +966,8 @@ err:
  * The btree node will have either a read or a write lock held, depending on
  * level and op->lock.
  */
-struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
-				 int level, bool write)
+struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
+				 struct bkey *k, int level, bool write)
 {
 	int i = 0;
 	struct btree *b;
@@ -951,7 +981,7 @@ retry:
 			return ERR_PTR(-EAGAIN);
 
 		mutex_lock(&c->bucket_lock);
-		b = mca_alloc(c, k, level);
+		b = mca_alloc(c, op, k, level);
 		mutex_unlock(&c->bucket_lock);
 
 		if (!b)
@@ -997,7 +1027,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
 	struct btree *b;
 
 	mutex_lock(&c->bucket_lock);
-	b = mca_alloc(c, k, level);
+	b = mca_alloc(c, NULL, k, level);
 	mutex_unlock(&c->bucket_lock);
 
 	if (!IS_ERR_OR_NULL(b)) {
@@ -1010,46 +1040,41 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
 
 static void btree_node_free(struct btree *b)
 {
-	unsigned i;
-
 	trace_bcache_btree_node_free(b);
 
 	BUG_ON(b == b->c->root);
 
+	mutex_lock(&b->write_lock);
+
 	if (btree_node_dirty(b))
 		btree_complete_write(b, btree_current_write(b));
 	clear_bit(BTREE_NODE_dirty, &b->flags);
 
+	mutex_unlock(&b->write_lock);
+
 	cancel_delayed_work(&b->work);
 
 	mutex_lock(&b->c->bucket_lock);
-
-	for (i = 0; i < KEY_PTRS(&b->key); i++) {
-		BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));
-
-		bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
-			    PTR_BUCKET(b->c, &b->key, i));
-	}
-
 	bch_bucket_free(b->c, &b->key);
 	mca_bucket_free(b);
 	mutex_unlock(&b->c->bucket_lock);
 }
 
-struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
+struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
+				   int level)
 {
 	BKEY_PADDED(key) k;
 	struct btree *b = ERR_PTR(-EAGAIN);
 
 	mutex_lock(&c->bucket_lock);
 retry:
-	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
+	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL))
 		goto err;
 
 	bkey_put(c, &k.key);
 	SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
 
-	b = mca_alloc(c, &k.key, level);
+	b = mca_alloc(c, op, &k.key, level);
 	if (IS_ERR(b))
 		goto err_free;
 
@@ -1075,12 +1100,15 @@ err:
 	return b;
 }
 
-static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait)
+static struct btree *btree_node_alloc_replacement(struct btree *b,
+						  struct btree_op *op)
 {
-	struct btree *n = bch_btree_node_alloc(b->c, b->level, wait);
+	struct btree *n = bch_btree_node_alloc(b->c, op, b->level);
 	if (!IS_ERR_OR_NULL(n)) {
+		mutex_lock(&n->write_lock);
 		bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
 		bkey_copy_key(&n->key, &b->key);
+		mutex_unlock(&n->write_lock);
 	}
 
 	return n;
@@ -1090,43 +1118,47 @@ static void make_btree_freeing_key(struct btree *b, struct bkey *k)
 {
 	unsigned i;
 
+	mutex_lock(&b->c->bucket_lock);
+
+	atomic_inc(&b->c->prio_blocked);
+
 	bkey_copy(k, &b->key);
 	bkey_copy_key(k, &ZERO_KEY);
 
-	for (i = 0; i < KEY_PTRS(k); i++) {
-		uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1;
-
-		SET_PTR_GEN(k, i, g);
-	}
+	for (i = 0; i < KEY_PTRS(k); i++)
+		SET_PTR_GEN(k, i,
+			    bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
+					PTR_BUCKET(b->c, &b->key, i)));
 
-	atomic_inc(&b->c->prio_blocked);
+	mutex_unlock(&b->c->bucket_lock);
 }
 
 static int btree_check_reserve(struct btree *b, struct btree_op *op)
 {
 	struct cache_set *c = b->c;
 	struct cache *ca;
-	unsigned i, reserve = c->root->level * 2 + 1;
-	int ret = 0;
+	unsigned i, reserve = (c->root->level - b->level) * 2 + 1;
 
 	mutex_lock(&c->bucket_lock);
 
 	for_each_cache(ca, c, i)
 		if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
 			if (op)
-				prepare_to_wait(&c->bucket_wait, &op->wait,
+				prepare_to_wait(&c->btree_cache_wait, &op->wait,
 						TASK_UNINTERRUPTIBLE);
-			ret = -EINTR;
-			break;
+			mutex_unlock(&c->bucket_lock);
+			return -EINTR;
 		}
 
 	mutex_unlock(&c->bucket_lock);
-	return ret;
+
+	return mca_cannibalize_lock(b->c, op);
 }
 
 /* Garbage collection */
 
-uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
+static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,
+				    struct bkey *k)
 {
 	uint8_t stale = 0;
 	unsigned i;
@@ -1146,8 +1178,8 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
 
 		g = PTR_BUCKET(c, k, i);
 
-		if (gen_after(g->gc_gen, PTR_GEN(k, i)))
-			g->gc_gen = PTR_GEN(k, i);
+		if (gen_after(g->last_gc, PTR_GEN(k, i)))
+			g->last_gc = PTR_GEN(k, i);
 
 		if (ptr_stale(c, k, i)) {
 			stale = max(stale, ptr_stale(c, k, i));
@@ -1163,6 +1195,8 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
 			SET_GC_MARK(g, GC_MARK_METADATA);
 		else if (KEY_DIRTY(k))
 			SET_GC_MARK(g, GC_MARK_DIRTY);
+		else if (!GC_MARK(g))
+			SET_GC_MARK(g, GC_MARK_RECLAIMABLE);
 
 		/* guard against overflow */
 		SET_GC_SECTORS_USED(g, min_t(unsigned,
@@ -1177,6 +1211,26 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
 
 #define btree_mark_key(b, k)	__bch_btree_mark_key(b->c, b->level, k)
 
+void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
+{
+	unsigned i;
+
+	for (i = 0; i < KEY_PTRS(k); i++)
+		if (ptr_available(c, k, i) &&
+		    !ptr_stale(c, k, i)) {
+			struct bucket *b = PTR_BUCKET(c, k, i);
+
+			b->gen = PTR_GEN(k, i);
+
+			if (level && bkey_cmp(k, &ZERO_KEY))
+				b->prio = BTREE_PRIO;
+			else if (!level && b->prio == BTREE_PRIO)
+				b->prio = INITIAL_PRIO;
+		}
+
+	__bch_btree_mark_key(c, level, k);
+}
+
 static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
 {
 	uint8_t stale = 0;
@@ -1230,14 +1284,19 @@ static int bch_btree_insert_node(struct btree *, struct btree_op *,
 				 struct keylist *, atomic_t *, struct bkey *);
 
 static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
-			     struct keylist *keylist, struct gc_stat *gc,
-			     struct gc_merge_info *r)
+			     struct gc_stat *gc, struct gc_merge_info *r)
 {
 	unsigned i, nodes = 0, keys = 0, blocks;
 	struct btree *new_nodes[GC_MERGE_NODES];
+	struct keylist keylist;
 	struct closure cl;
 	struct bkey *k;
 
+	bch_keylist_init(&keylist);
+
+	if (btree_check_reserve(b, NULL))
+		return 0;
+
 	memset(new_nodes, 0, sizeof(new_nodes));
 	closure_init_stack(&cl);
 
@@ -1252,11 +1311,23 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 		return 0;
 
 	for (i = 0; i < nodes; i++) {
-		new_nodes[i] = btree_node_alloc_replacement(r[i].b, false);
+		new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
 		if (IS_ERR_OR_NULL(new_nodes[i]))
 			goto out_nocoalesce;
 	}
 
+	/*
+	 * We have to check the reserve here, after we've allocated our new
+	 * nodes, to make sure the insert below will succeed - we also check
+	 * before as an optimization to potentially avoid a bunch of expensive
+	 * allocs/sorts
+	 */
+	if (btree_check_reserve(b, NULL))
+		goto out_nocoalesce;
+
+	for (i = 0; i < nodes; i++)
+		mutex_lock(&new_nodes[i]->write_lock);
+
 	for (i = nodes - 1; i > 0; --i) {
 		struct bset *n1 = btree_bset_first(new_nodes[i]);
 		struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
@@ -1315,28 +1386,34 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 
 		n2->keys -= keys;
 
-		if (__bch_keylist_realloc(keylist,
+		if (__bch_keylist_realloc(&keylist,
 					  bkey_u64s(&new_nodes[i]->key)))
 			goto out_nocoalesce;
 
 		bch_btree_node_write(new_nodes[i], &cl);
-		bch_keylist_add(keylist, &new_nodes[i]->key);
+		bch_keylist_add(&keylist, &new_nodes[i]->key);
 	}
 
-	for (i = 0; i < nodes; i++) {
-		if (__bch_keylist_realloc(keylist, bkey_u64s(&r[i].b->key)))
-			goto out_nocoalesce;
+	for (i = 0; i < nodes; i++)
+		mutex_unlock(&new_nodes[i]->write_lock);
 
-		make_btree_freeing_key(r[i].b, keylist->top);
-		bch_keylist_push(keylist);
-	}
+	closure_sync(&cl);
 
 	/* We emptied out this node */
 	BUG_ON(btree_bset_first(new_nodes[0])->keys);
 	btree_node_free(new_nodes[0]);
 	rw_unlock(true, new_nodes[0]);
 
-	closure_sync(&cl);
+	for (i = 0; i < nodes; i++) {
+		if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
+			goto out_nocoalesce;
+
+		make_btree_freeing_key(r[i].b, keylist.top);
+		bch_keylist_push(&keylist);
+	}
+
+	bch_btree_insert_node(b, op, &keylist, NULL, NULL);
+	BUG_ON(!bch_keylist_empty(&keylist));
 
 	for (i = 0; i < nodes; i++) {
 		btree_node_free(r[i].b);
@@ -1345,22 +1422,22 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
 		r[i].b = new_nodes[i];
 	}
 
-	bch_btree_insert_node(b, op, keylist, NULL, NULL);
-	BUG_ON(!bch_keylist_empty(keylist));
-
 	memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
 	r[nodes - 1].b = ERR_PTR(-EINTR);
 
 	trace_bcache_btree_gc_coalesce(nodes);
 	gc->nodes--;
 
+	bch_keylist_free(&keylist);
+
 	/* Invalidated our iterator */
 	return -EINTR;
 
 out_nocoalesce:
 	closure_sync(&cl);
+	bch_keylist_free(&keylist);
 
-	while ((k = bch_keylist_pop(keylist)))
+	while ((k = bch_keylist_pop(&keylist)))
 		if (!bkey_cmp(k, &ZERO_KEY))
 			atomic_dec(&b->c->prio_blocked);
 
@@ -1372,6 +1449,42 @@ out_nocoalesce:
 	return 0;
 }
 
+static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
+				 struct btree *replace)
+{
+	struct keylist keys;
+	struct btree *n;
+
+	if (btree_check_reserve(b, NULL))
+		return 0;
+
+	n = btree_node_alloc_replacement(replace, NULL);
+
+	/* recheck reserve after allocating replacement node */
+	if (btree_check_reserve(b, NULL)) {
+		btree_node_free(n);
+		rw_unlock(true, n);
+		return 0;
+	}
+
+	bch_btree_node_write_sync(n);
+
+	bch_keylist_init(&keys);
+	bch_keylist_add(&keys, &n->key);
+
+	make_btree_freeing_key(replace, keys.top);
+	bch_keylist_push(&keys);
+
+	bch_btree_insert_node(b, op, &keys, NULL, NULL);
+	BUG_ON(!bch_keylist_empty(&keys));
+
+	btree_node_free(replace);
+	rw_unlock(true, n);
+
+	/* Invalidated our iterator */
+	return -EINTR;
+}
+
 static unsigned btree_gc_count_keys(struct btree *b)
 {
 	struct bkey *k;
@@ -1387,26 +1500,23 @@ static unsigned btree_gc_count_keys(struct btree *b)
 static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 			    struct closure *writes, struct gc_stat *gc)
 {
-	unsigned i;
 	int ret = 0;
 	bool should_rewrite;
-	struct btree *n;
 	struct bkey *k;
-	struct keylist keys;
 	struct btree_iter iter;
 	struct gc_merge_info r[GC_MERGE_NODES];
-	struct gc_merge_info *last = r + GC_MERGE_NODES - 1;
+	struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
 
-	bch_keylist_init(&keys);
 	bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
 
-	for (i = 0; i < GC_MERGE_NODES; i++)
-		r[i].b = ERR_PTR(-EINTR);
+	for (i = r; i < r + ARRAY_SIZE(r); i++)
+		i->b = ERR_PTR(-EINTR);
 
 	while (1) {
 		k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
 		if (k) {
-			r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
+			r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
+						  true);
 			if (IS_ERR(r->b)) {
 				ret = PTR_ERR(r->b);
 				break;
@@ -1414,7 +1524,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 
 			r->keys = btree_gc_count_keys(r->b);
 
-			ret = btree_gc_coalesce(b, op, &keys, gc, r);
+			ret = btree_gc_coalesce(b, op, gc, r);
 			if (ret)
 				break;
 		}
@@ -1424,32 +1534,10 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 
 		if (!IS_ERR(last->b)) {
 			should_rewrite = btree_gc_mark_node(last->b, gc);
-			if (should_rewrite &&
-			    !btree_check_reserve(b, NULL)) {
-				n = btree_node_alloc_replacement(last->b,
-								 false);
-
-				if (!IS_ERR_OR_NULL(n)) {
-					bch_btree_node_write_sync(n);
-					bch_keylist_add(&keys, &n->key);
-
-					make_btree_freeing_key(last->b,
-							       keys.top);
-					bch_keylist_push(&keys);
-
-					btree_node_free(last->b);
-
-					bch_btree_insert_node(b, op, &keys,
-							      NULL, NULL);
-					BUG_ON(!bch_keylist_empty(&keys));
-
-					rw_unlock(true, last->b);
-					last->b = n;
-
-					/* Invalidated our iterator */
-					ret = -EINTR;
+			if (should_rewrite) {
+				ret = btree_gc_rewrite_node(b, op, last->b);
+				if (ret)
 					break;
-				}
 			}
 
 			if (last->b->level) {
@@ -1464,8 +1552,10 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 			 * Must flush leaf nodes before gc ends, since replace
 			 * operations aren't journalled
 			 */
+			mutex_lock(&last->b->write_lock);
 			if (btree_node_dirty(last->b))
 				bch_btree_node_write(last->b, writes);
+			mutex_unlock(&last->b->write_lock);
 			rw_unlock(true, last->b);
 		}
 
@@ -1478,15 +1568,15 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 		}
 	}
 
-	for (i = 0; i < GC_MERGE_NODES; i++)
-		if (!IS_ERR_OR_NULL(r[i].b)) {
-			if (btree_node_dirty(r[i].b))
-				bch_btree_node_write(r[i].b, writes);
-			rw_unlock(true, r[i].b);
+	for (i = r; i < r + ARRAY_SIZE(r); i++)
+		if (!IS_ERR_OR_NULL(i->b)) {
+			mutex_lock(&i->b->write_lock);
+			if (btree_node_dirty(i->b))
+				bch_btree_node_write(i->b, writes);
+			mutex_unlock(&i->b->write_lock);
+			rw_unlock(true, i->b);
 		}
 
-	bch_keylist_free(&keys);
-
 	return ret;
 }
 
@@ -1499,10 +1589,11 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
 
 	should_rewrite = btree_gc_mark_node(b, gc);
 	if (should_rewrite) {
-		n = btree_node_alloc_replacement(b, false);
+		n = btree_node_alloc_replacement(b, NULL);
 
 		if (!IS_ERR_OR_NULL(n)) {
 			bch_btree_node_write_sync(n);
+
 			bch_btree_set_root(n);
 			btree_node_free(b);
 			rw_unlock(true, n);
@@ -1511,6 +1602,8 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
 		}
 	}
 
+	__bch_btree_mark_key(b->c, b->level + 1, &b->key);
+
 	if (b->level) {
 		ret = btree_gc_recurse(b, op, writes, gc);
 		if (ret)
@@ -1538,9 +1631,9 @@ static void btree_gc_start(struct cache_set *c)
 
 	for_each_cache(ca, c, i)
 		for_each_bucket(b, ca) {
-			b->gc_gen = b->gen;
+			b->last_gc = b->gen;
 			if (!atomic_read(&b->pin)) {
-				SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
+				SET_GC_MARK(b, 0);
 				SET_GC_SECTORS_USED(b, 0);
 			}
 		}
@@ -1548,7 +1641,7 @@ static void btree_gc_start(struct cache_set *c)
 	mutex_unlock(&c->bucket_lock);
 }
 
-size_t bch_btree_gc_finish(struct cache_set *c)
+static size_t bch_btree_gc_finish(struct cache_set *c)
 {
 	size_t available = 0;
 	struct bucket *b;
@@ -1561,11 +1654,6 @@ size_t bch_btree_gc_finish(struct cache_set *c)
 	c->gc_mark_valid = 1;
 	c->need_gc	= 0;
 
-	if (c->root)
-		for (i = 0; i < KEY_PTRS(&c->root->key); i++)
-			SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i),
-				    GC_MARK_METADATA);
-
 	for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
 		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
 			    GC_MARK_METADATA);
@@ -1605,15 +1693,15 @@ size_t bch_btree_gc_finish(struct cache_set *c)
 			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
 
 		for_each_bucket(b, ca) {
-			b->last_gc	= b->gc_gen;
 			c->need_gc	= max(c->need_gc, bucket_gc_gen(b));
 
-			if (!atomic_read(&b->pin) &&
-			    GC_MARK(b) == GC_MARK_RECLAIMABLE) {
+			if (atomic_read(&b->pin))
+				continue;
+
+			BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
+
+			if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
 				available++;
-				if (!GC_SECTORS_USED(b))
-					bch_bucket_add_unused(ca, b);
-			}
 		}
 	}
 
@@ -1705,36 +1793,16 @@ int bch_gc_thread_start(struct cache_set *c)
 
 /* Initial partial gc */
 
-static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
-				   unsigned long **seen)
+static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
 {
 	int ret = 0;
-	unsigned i;
 	struct bkey *k, *p = NULL;
-	struct bucket *g;
 	struct btree_iter iter;
 
-	for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
-		for (i = 0; i < KEY_PTRS(k); i++) {
-			if (!ptr_available(b->c, k, i))
-				continue;
-
-			g = PTR_BUCKET(b->c, k, i);
+	for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
+		bch_initial_mark_key(b->c, b->level, k);
 
-			if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i),
-						seen[PTR_DEV(k, i)]) ||
-			    !ptr_stale(b->c, k, i)) {
-				g->gen = PTR_GEN(k, i);
-
-				if (b->level)
-					g->prio = BTREE_PRIO;
-				else if (g->prio == BTREE_PRIO)
-					g->prio = INITIAL_PRIO;
-			}
-		}
-
-		btree_mark_key(b, k);
-	}
+	bch_initial_mark_key(b->c, b->level + 1, &b->key);
 
 	if (b->level) {
 		bch_btree_iter_init(&b->keys, &iter, NULL);
@@ -1746,40 +1814,58 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
 				btree_node_prefetch(b->c, k, b->level - 1);
 
 			if (p)
-				ret = btree(check_recurse, p, b, op, seen);
+				ret = btree(check_recurse, p, b, op);
 
 			p = k;
 		} while (p && !ret);
 	}
 
-	return 0;
+	return ret;
 }
 
 int bch_btree_check(struct cache_set *c)
 {
-	int ret = -ENOMEM;
-	unsigned i;
-	unsigned long *seen[MAX_CACHES_PER_SET];
 	struct btree_op op;
 
-	memset(seen, 0, sizeof(seen));
 	bch_btree_op_init(&op, SHRT_MAX);
 
-	for (i = 0; c->cache[i]; i++) {
-		size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
-		seen[i] = kmalloc(n, GFP_KERNEL);
-		if (!seen[i])
-			goto err;
+	return btree_root(check_recurse, c, &op);
+}
+
+void bch_initial_gc_finish(struct cache_set *c)
+{
+	struct cache *ca;
+	struct bucket *b;
+	unsigned i;
+
+	bch_btree_gc_finish(c);
 
-		/* Disables the seen array until prio_read() uses it too */
-		memset(seen[i], 0xFF, n);
+	mutex_lock(&c->bucket_lock);
+
+	/*
+	 * We need to put some unused buckets directly on the prio freelist in
+	 * order to get the allocator thread started - it needs freed buckets in
+	 * order to rewrite the prios and gens, and it needs to rewrite prios
+	 * and gens in order to free buckets.
+	 *
+	 * This is only safe for buckets that have no live data in them, which
+	 * there should always be some of.
+	 */
+	for_each_cache(ca, c, i) {
+		for_each_bucket(b, ca) {
+			if (fifo_full(&ca->free[RESERVE_PRIO]))
+				break;
+
+			if (bch_can_invalidate_bucket(ca, b) &&
+			    !GC_MARK(b)) {
+				__bch_invalidate_one_bucket(ca, b);
+				fifo_push(&ca->free[RESERVE_PRIO],
+					  b - ca->buckets);
+			}
+		}
 	}
 
-	ret = btree_root(check_recurse, c, &op, seen);
-err:
-	for (i = 0; i < MAX_CACHES_PER_SET; i++)
-		kfree(seen[i]);
-	return ret;
+	mutex_unlock(&c->bucket_lock);
 }
 
 /* Btree insertion */
@@ -1871,11 +1957,14 @@ static int btree_split(struct btree *b, struct btree_op *op,
 	closure_init_stack(&cl);
 	bch_keylist_init(&parent_keys);
 
-	if (!b->level &&
-	    btree_check_reserve(b, op))
-		return -EINTR;
+	if (btree_check_reserve(b, op)) {
+		if (!b->level)
+			return -EINTR;
+		else
+			WARN(1, "insufficient reserve for split\n");
+	}
 
-	n1 = btree_node_alloc_replacement(b, true);
+	n1 = btree_node_alloc_replacement(b, op);
 	if (IS_ERR(n1))
 		goto err;
 
@@ -1887,16 +1976,19 @@ static int btree_split(struct btree *b, struct btree_op *op,
 
 		trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
 
-		n2 = bch_btree_node_alloc(b->c, b->level, true);
+		n2 = bch_btree_node_alloc(b->c, op, b->level);
 		if (IS_ERR(n2))
 			goto err_free1;
 
 		if (!b->parent) {
-			n3 = bch_btree_node_alloc(b->c, b->level + 1, true);
+			n3 = bch_btree_node_alloc(b->c, op, b->level + 1);
 			if (IS_ERR(n3))
 				goto err_free2;
 		}
 
+		mutex_lock(&n1->write_lock);
+		mutex_lock(&n2->write_lock);
+
 		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
 
 		/*
@@ -1923,45 +2015,45 @@ static int btree_split(struct btree *b, struct btree_op *op,
 
 		bch_keylist_add(&parent_keys, &n2->key);
 		bch_btree_node_write(n2, &cl);
+		mutex_unlock(&n2->write_lock);
 		rw_unlock(true, n2);
 	} else {
 		trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);
 
+		mutex_lock(&n1->write_lock);
 		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
 	}
 
 	bch_keylist_add(&parent_keys, &n1->key);
 	bch_btree_node_write(n1, &cl);
+	mutex_unlock(&n1->write_lock);
 
 	if (n3) {
 		/* Depth increases, make a new root */
+		mutex_lock(&n3->write_lock);
 		bkey_copy_key(&n3->key, &MAX_KEY);
 		bch_btree_insert_keys(n3, op, &parent_keys, NULL);
 		bch_btree_node_write(n3, &cl);
+		mutex_unlock(&n3->write_lock);
 
 		closure_sync(&cl);
 		bch_btree_set_root(n3);
 		rw_unlock(true, n3);
-
-		btree_node_free(b);
 	} else if (!b->parent) {
 		/* Root filled up but didn't need to be split */
 		closure_sync(&cl);
 		bch_btree_set_root(n1);
-
-		btree_node_free(b);
 	} else {
 		/* Split a non root node */
 		closure_sync(&cl);
 		make_btree_freeing_key(b, parent_keys.top);
 		bch_keylist_push(&parent_keys);
 
-		btree_node_free(b);
-
 		bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
 		BUG_ON(!bch_keylist_empty(&parent_keys));
 	}
 
+	btree_node_free(b);
 	rw_unlock(true, n1);
 
 	bch_time_stats_update(&b->c->btree_split_time, start_time);
@@ -1976,7 +2068,7 @@ err_free1:
 	btree_node_free(n1);
 	rw_unlock(true, n1);
 err:
-	WARN(1, "bcache: btree split failed");
+	WARN(1, "bcache: btree split failed (level %u)", b->level);
 
 	if (n3 == ERR_PTR(-EAGAIN) ||
 	    n2 == ERR_PTR(-EAGAIN) ||
@@ -1991,33 +2083,54 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
 				 atomic_t *journal_ref,
 				 struct bkey *replace_key)
 {
+	struct closure cl;
+
 	BUG_ON(b->level && replace_key);
 
+	closure_init_stack(&cl);
+
+	mutex_lock(&b->write_lock);
+
+	if (write_block(b) != btree_bset_last(b) &&
+	    b->keys.last_set_unwritten)
+		bch_btree_init_next(b); /* just wrote a set */
+
 	if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
-		if (current->bio_list) {
-			op->lock = b->c->root->level + 1;
-			return -EAGAIN;
-		} else if (op->lock <= b->c->root->level) {
-			op->lock = b->c->root->level + 1;
-			return -EINTR;
-		} else {
-			/* Invalidated all iterators */
-			int ret = btree_split(b, op, insert_keys, replace_key);
+		mutex_unlock(&b->write_lock);
+		goto split;
+	}
 
-			return bch_keylist_empty(insert_keys) ?
-				0 : ret ?: -EINTR;
-		}
-	} else {
-		BUG_ON(write_block(b) != btree_bset_last(b));
+	BUG_ON(write_block(b) != btree_bset_last(b));
 
-		if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
-			if (!b->level)
-				bch_btree_leaf_dirty(b, journal_ref);
-			else
-				bch_btree_node_write_sync(b);
-		}
+	if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
+		if (!b->level)
+			bch_btree_leaf_dirty(b, journal_ref);
+		else
+			bch_btree_node_write(b, &cl);
+	}
 
-		return 0;
+	mutex_unlock(&b->write_lock);
+
+	/* wait for btree node write if necessary, after unlock */
+	closure_sync(&cl);
+
+	return 0;
+split:
+	if (current->bio_list) {
+		op->lock = b->c->root->level + 1;
+		return -EAGAIN;
+	} else if (op->lock <= b->c->root->level) {
+		op->lock = b->c->root->level + 1;
+		return -EINTR;
+	} else {
+		/* Invalidated all iterators */
+		int ret = btree_split(b, op, insert_keys, replace_key);
+
+		if (bch_keylist_empty(insert_keys))
+			return 0;
+		else if (!ret)
+			return -EINTR;
+		return ret;
 	}
 }
 
@@ -2403,18 +2516,3 @@ void bch_keybuf_init(struct keybuf *buf)
 	spin_lock_init(&buf->lock);
 	array_allocator_init(&buf->freelist);
 }
-
-void bch_btree_exit(void)
-{
-	if (btree_io_wq)
-		destroy_workqueue(btree_io_wq);
-}
-
-int __init bch_btree_init(void)
-{
-	btree_io_wq = create_singlethread_workqueue("bch_btree_io");
-	if (!btree_io_wq)
-		return -ENOMEM;
-
-	return 0;
-}
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index af065e97e55c..91dfa5e69685 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -127,6 +127,8 @@ struct btree {
 	struct cache_set	*c;
 	struct btree		*parent;
 
+	struct mutex		write_lock;
+
 	unsigned long		flags;
 	uint16_t		written;	/* would be nice to kill */
 	uint8_t			level;
@@ -236,11 +238,13 @@ static inline void rw_unlock(bool w, struct btree *b)
 }
 
 void bch_btree_node_read_done(struct btree *);
+void __bch_btree_node_write(struct btree *, struct closure *);
 void bch_btree_node_write(struct btree *, struct closure *);
 
 void bch_btree_set_root(struct btree *);
-struct btree *bch_btree_node_alloc(struct cache_set *, int, bool);
-struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool);
+struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int);
+struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *,
+				 struct bkey *, int, bool);
 
 int bch_btree_insert_check_key(struct btree *, struct btree_op *,
 			       struct bkey *);
@@ -248,10 +252,10 @@ int bch_btree_insert(struct cache_set *, struct keylist *,
 		     atomic_t *, struct bkey *);
 
 int bch_gc_thread_start(struct cache_set *);
-size_t bch_btree_gc_finish(struct cache_set *);
+void bch_initial_gc_finish(struct cache_set *);
 void bch_moving_gc(struct cache_set *);
 int bch_btree_check(struct cache_set *);
-uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *);
+void bch_initial_mark_key(struct cache_set *, int, struct bkey *);
 
 static inline void wake_up_gc(struct cache_set *c)
 {
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index 416d1a3e028e..3a0de4cf9771 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -194,9 +194,9 @@ err:
 	mutex_unlock(&b->c->bucket_lock);
 	bch_extent_to_text(buf, sizeof(buf), k);
 	btree_bug(b,
-"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
+"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu",
 		  buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
-		  g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
+		  g->prio, g->gen, g->last_gc, GC_MARK(g));
 	return true;
 }
 
@@ -308,6 +308,16 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
 	return NULL;
 }
 
+static void bch_subtract_dirty(struct bkey *k,
+			   struct cache_set *c,
+			   uint64_t offset,
+			   int sectors)
+{
+	if (KEY_DIRTY(k))
+		bcache_dev_sectors_dirty_add(c, KEY_INODE(k),
+					     offset, -sectors);
+}
+
 static bool bch_extent_insert_fixup(struct btree_keys *b,
 				    struct bkey *insert,
 				    struct btree_iter *iter,
@@ -315,13 +325,6 @@ static bool bch_extent_insert_fixup(struct btree_keys *b,
 {
 	struct cache_set *c = container_of(b, struct btree, keys)->c;
 
-	void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
-	{
-		if (KEY_DIRTY(k))
-			bcache_dev_sectors_dirty_add(c, KEY_INODE(k),
-						     offset, -sectors);
-	}
-
 	uint64_t old_offset;
 	unsigned old_size, sectors_found = 0;
 
@@ -398,7 +401,8 @@ static bool bch_extent_insert_fixup(struct btree_keys *b,
 
 			struct bkey *top;
 
-			subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
+			bch_subtract_dirty(k, c, KEY_START(insert),
+				       KEY_SIZE(insert));
 
 			if (bkey_written(b, k)) {
 				/*
@@ -448,7 +452,7 @@ static bool bch_extent_insert_fixup(struct btree_keys *b,
 			}
 		}
 
-		subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
+		bch_subtract_dirty(k, c, old_offset, old_size - KEY_SIZE(k));
 	}
 
 check_failed:
@@ -499,9 +503,9 @@ static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
 
 	if (mutex_trylock(&b->c->bucket_lock)) {
 		if (b->c->gc_mark_valid &&
-		    ((GC_MARK(g) != GC_MARK_DIRTY &&
-		      KEY_DIRTY(k)) ||
-		     GC_MARK(g) == GC_MARK_METADATA))
+		    (!GC_MARK(g) ||
+		     GC_MARK(g) == GC_MARK_METADATA ||
+		     (GC_MARK(g) != GC_MARK_DIRTY && KEY_DIRTY(k))))
 			goto err;
 
 		if (g->prio == BTREE_PRIO)
@@ -515,9 +519,9 @@ err:
 	mutex_unlock(&b->c->bucket_lock);
 	bch_extent_to_text(buf, sizeof(buf), k);
 	btree_bug(b,
-"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
+"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu",
 		  buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
-		  g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
+		  g->prio, g->gen, g->last_gc, GC_MARK(g));
 	return true;
 }
 
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 18039affc306..59e82021b5bb 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -237,8 +237,14 @@ bsearch:
 		for (i = 0; i < ca->sb.njournal_buckets; i++)
 			if (ja->seq[i] > seq) {
 				seq = ja->seq[i];
-				ja->cur_idx = ja->discard_idx =
-					ja->last_idx = i;
+				/*
+				 * When journal_reclaim() goes to allocate for
+				 * the first time, it'll use the bucket after
+				 * ja->cur_idx
+				 */
+				ja->cur_idx = i;
+				ja->last_idx = ja->discard_idx = (i + 1) %
+					ca->sb.njournal_buckets;
 
 			}
 	}
@@ -288,16 +294,11 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
 		     k = bkey_next(k)) {
 			unsigned j;
 
-			for (j = 0; j < KEY_PTRS(k); j++) {
-				struct bucket *g = PTR_BUCKET(c, k, j);
-				atomic_inc(&g->pin);
+			for (j = 0; j < KEY_PTRS(k); j++)
+				if (ptr_available(c, k, j))
+					atomic_inc(&PTR_BUCKET(c, k, j)->pin);
 
-				if (g->prio == BTREE_PRIO &&
-				    !ptr_stale(c, k, j))
-					g->prio = INITIAL_PRIO;
-			}
-
-			__bch_btree_mark_key(c, 0, k);
+			bch_initial_mark_key(c, 0, k);
 		}
 	}
 }
@@ -312,8 +313,6 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
 	uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
 	struct keylist keylist;
 
-	bch_keylist_init(&keylist);
-
 	list_for_each_entry(i, list, list) {
 		BUG_ON(i->pin && atomic_read(i->pin) != 1);
 
@@ -326,8 +325,7 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
 		     k = bkey_next(k)) {
 			trace_bcache_journal_replay_key(k);
 
-			bkey_copy(keylist.top, k);
-			bch_keylist_push(&keylist);
+			bch_keylist_init_single(&keylist, k);
 
 			ret = bch_btree_insert(s, &keylist, i->pin, NULL);
 			if (ret)
@@ -383,16 +381,15 @@ retry:
 
 	b = best;
 	if (b) {
-		rw_lock(true, b, b->level);
-
+		mutex_lock(&b->write_lock);
 		if (!btree_current_write(b)->journal) {
-			rw_unlock(true, b);
+			mutex_unlock(&b->write_lock);
 			/* We raced */
 			goto retry;
 		}
 
-		bch_btree_node_write(b, NULL);
-		rw_unlock(true, b);
+		__bch_btree_node_write(b, NULL);
+		mutex_unlock(&b->write_lock);
 	}
 }
 
@@ -536,6 +533,7 @@ void bch_journal_next(struct journal *j)
 	atomic_set(&fifo_back(&j->pin), 1);
 
 	j->cur->data->seq	= ++j->seq;
+	j->cur->dirty		= false;
 	j->cur->need_write	= false;
 	j->cur->data->keys	= 0;
 
@@ -731,7 +729,10 @@ static void journal_write_work(struct work_struct *work)
 					   struct cache_set,
 					   journal.work);
 	spin_lock(&c->journal.lock);
-	journal_try_write(c);
+	if (c->journal.cur->dirty)
+		journal_try_write(c);
+	else
+		spin_unlock(&c->journal.lock);
 }
 
 /*
@@ -761,7 +762,8 @@ atomic_t *bch_journal(struct cache_set *c,
 	if (parent) {
 		closure_wait(&w->wait, parent);
 		journal_try_write(c);
-	} else if (!w->need_write) {
+	} else if (!w->dirty) {
+		w->dirty = true;
 		schedule_delayed_work(&c->journal.work,
 				      msecs_to_jiffies(c->journal_delay_ms));
 		spin_unlock(&c->journal.lock);
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index 9180c4465075..e3c39457afbb 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -95,6 +95,7 @@ struct journal_write {
 
 	struct cache_set	*c;
 	struct closure_waitlist	wait;
+	bool			dirty;
 	bool			need_write;
 };
 
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 9eb60d102de8..cd7490311e51 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -24,12 +24,10 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k)
 					   moving_gc_keys);
 	unsigned i;
 
-	for (i = 0; i < KEY_PTRS(k); i++) {
-		struct bucket *g = PTR_BUCKET(c, k, i);
-
-		if (GC_MOVE(g))
+	for (i = 0; i < KEY_PTRS(k); i++)
+		if (ptr_available(c, k, i) &&
+		    GC_MOVE(PTR_BUCKET(c, k, i)))
 			return true;
-	}
 
 	return false;
 }
@@ -115,7 +113,7 @@ static void write_moving(struct closure *cl)
 		closure_call(&op->cl, bch_data_insert, NULL, cl);
 	}
 
-	continue_at(cl, write_moving_finish, system_wq);
+	continue_at(cl, write_moving_finish, op->wq);
 }
 
 static void read_moving_submit(struct closure *cl)
@@ -125,7 +123,7 @@ static void read_moving_submit(struct closure *cl)
 
 	bch_submit_bbio(bio, io->op.c, &io->w->key, 0);
 
-	continue_at(cl, write_moving, system_wq);
+	continue_at(cl, write_moving, io->op.wq);
 }
 
 static void read_moving(struct cache_set *c)
@@ -160,6 +158,7 @@ static void read_moving(struct cache_set *c)
 		io->w		= w;
 		io->op.inode	= KEY_INODE(&w->key);
 		io->op.c	= c;
+		io->op.wq	= c->moving_gc_wq;
 
 		moving_init(io);
 		bio = &io->bio.bio;
@@ -216,7 +215,10 @@ void bch_moving_gc(struct cache_set *c)
 		ca->heap.used = 0;
 
 		for_each_bucket(b, ca) {
-			if (!GC_SECTORS_USED(b))
+			if (GC_MARK(b) == GC_MARK_METADATA ||
+			    !GC_SECTORS_USED(b) ||
+			    GC_SECTORS_USED(b) == ca->sb.bucket_size ||
+			    atomic_read(&b->pin))
 				continue;
 
 			if (!heap_full(&ca->heap)) {
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 5d5d031cf381..15fff4f68a7c 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -12,11 +12,9 @@
 #include "request.h"
 #include "writeback.h"
 
-#include <linux/cgroup.h>
 #include <linux/module.h>
 #include <linux/hash.h>
 #include <linux/random.h>
-#include "blk-cgroup.h"
 
 #include <trace/events/bcache.h>
 
@@ -27,171 +25,13 @@ struct kmem_cache *bch_search_cache;
 
 static void bch_data_insert_start(struct closure *);
 
-/* Cgroup interface */
-
-#ifdef CONFIG_CGROUP_BCACHE
-static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 };
-
-static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup)
-{
-	struct cgroup_subsys_state *css;
-	return cgroup &&
-		(css = cgroup_subsys_state(cgroup, bcache_subsys_id))
-		? container_of(css, struct bch_cgroup, css)
-		: &bcache_default_cgroup;
-}
-
-struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio)
-{
-	struct cgroup_subsys_state *css = bio->bi_css
-		? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id)
-		: task_subsys_state(current, bcache_subsys_id);
-
-	return css
-		? container_of(css, struct bch_cgroup, css)
-		: &bcache_default_cgroup;
-}
-
-static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft,
-			struct file *file,
-			char __user *buf, size_t nbytes, loff_t *ppos)
-{
-	char tmp[1024];
-	int len = bch_snprint_string_list(tmp, PAGE_SIZE, bch_cache_modes,
-					  cgroup_to_bcache(cgrp)->cache_mode + 1);
-
-	if (len < 0)
-		return len;
-
-	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
-}
-
-static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft,
-			    const char *buf)
-{
-	int v = bch_read_string_list(buf, bch_cache_modes);
-	if (v < 0)
-		return v;
-
-	cgroup_to_bcache(cgrp)->cache_mode = v - 1;
-	return 0;
-}
-
-static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft)
-{
-	return cgroup_to_bcache(cgrp)->verify;
-}
-
-static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
-{
-	cgroup_to_bcache(cgrp)->verify = val;
-	return 0;
-}
-
-static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft)
-{
-	struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
-	return atomic_read(&bcachecg->stats.cache_hits);
-}
-
-static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft)
-{
-	struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
-	return atomic_read(&bcachecg->stats.cache_misses);
-}
-
-static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp,
-					 struct cftype *cft)
-{
-	struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
-	return atomic_read(&bcachecg->stats.cache_bypass_hits);
-}
-
-static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp,
-					   struct cftype *cft)
-{
-	struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
-	return atomic_read(&bcachecg->stats.cache_bypass_misses);
-}
-
-static struct cftype bch_files[] = {
-	{
-		.name		= "cache_mode",
-		.read		= cache_mode_read,
-		.write_string	= cache_mode_write,
-	},
-	{
-		.name		= "verify",
-		.read_u64	= bch_verify_read,
-		.write_u64	= bch_verify_write,
-	},
-	{
-		.name		= "cache_hits",
-		.read_u64	= bch_cache_hits_read,
-	},
-	{
-		.name		= "cache_misses",
-		.read_u64	= bch_cache_misses_read,
-	},
-	{
-		.name		= "cache_bypass_hits",
-		.read_u64	= bch_cache_bypass_hits_read,
-	},
-	{
-		.name		= "cache_bypass_misses",
-		.read_u64	= bch_cache_bypass_misses_read,
-	},
-	{ }	/* terminate */
-};
-
-static void init_bch_cgroup(struct bch_cgroup *cg)
-{
-	cg->cache_mode = -1;
-}
-
-static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
-{
-	struct bch_cgroup *cg;
-
-	cg = kzalloc(sizeof(*cg), GFP_KERNEL);
-	if (!cg)
-		return ERR_PTR(-ENOMEM);
-	init_bch_cgroup(cg);
-	return &cg->css;
-}
-
-static void bcachecg_destroy(struct cgroup *cgroup)
-{
-	struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
-	kfree(cg);
-}
-
-struct cgroup_subsys bcache_subsys = {
-	.create		= bcachecg_create,
-	.destroy	= bcachecg_destroy,
-	.subsys_id	= bcache_subsys_id,
-	.name		= "bcache",
-	.module		= THIS_MODULE,
-};
-EXPORT_SYMBOL_GPL(bcache_subsys);
-#endif
-
 static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
 {
-#ifdef CONFIG_CGROUP_BCACHE
-	int r = bch_bio_to_cgroup(bio)->cache_mode;
-	if (r >= 0)
-		return r;
-#endif
 	return BDEV_CACHE_MODE(&dc->sb);
 }
 
 static bool verify(struct cached_dev *dc, struct bio *bio)
 {
-#ifdef CONFIG_CGROUP_BCACHE
-	if (bch_bio_to_cgroup(bio)->verify)
-		return true;
-#endif
 	return dc->verify;
 }
 
@@ -248,7 +88,7 @@ static void bch_data_insert_keys(struct closure *cl)
 		atomic_dec_bug(journal_ref);
 
 	if (!op->insert_data_done)
-		continue_at(cl, bch_data_insert_start, bcache_wq);
+		continue_at(cl, bch_data_insert_start, op->wq);
 
 	bch_keylist_free(&op->insert_keys);
 	closure_return(cl);
@@ -297,7 +137,7 @@ static void bch_data_invalidate(struct closure *cl)
 	op->insert_data_done = true;
 	bio_put(bio);
 out:
-	continue_at(cl, bch_data_insert_keys, bcache_wq);
+	continue_at(cl, bch_data_insert_keys, op->wq);
 }
 
 static void bch_data_insert_error(struct closure *cl)
@@ -340,7 +180,7 @@ static void bch_data_insert_endio(struct bio *bio, int error)
 		if (op->writeback)
 			op->error = error;
 		else if (!op->replace)
-			set_closure_fn(cl, bch_data_insert_error, bcache_wq);
+			set_closure_fn(cl, bch_data_insert_error, op->wq);
 		else
 			set_closure_fn(cl, NULL, NULL);
 	}
@@ -376,7 +216,7 @@ static void bch_data_insert_start(struct closure *cl)
 		if (bch_keylist_realloc(&op->insert_keys,
 					3 + (op->csum ? 1 : 0),
 					op->c))
-			continue_at(cl, bch_data_insert_keys, bcache_wq);
+			continue_at(cl, bch_data_insert_keys, op->wq);
 
 		k = op->insert_keys.top;
 		bkey_init(k);
@@ -413,7 +253,7 @@ static void bch_data_insert_start(struct closure *cl)
 	} while (n != bio);
 
 	op->insert_data_done = true;
-	continue_at(cl, bch_data_insert_keys, bcache_wq);
+	continue_at(cl, bch_data_insert_keys, op->wq);
 err:
 	/* bch_alloc_sectors() blocks if s->writeback = true */
 	BUG_ON(op->writeback);
@@ -442,7 +282,7 @@ err:
 		bio_put(bio);
 
 		if (!bch_keylist_empty(&op->insert_keys))
-			continue_at(cl, bch_data_insert_keys, bcache_wq);
+			continue_at(cl, bch_data_insert_keys, op->wq);
 		else
 			closure_return(cl);
 	}
@@ -824,6 +664,7 @@ static inline struct search *search_alloc(struct bio *bio,
 	s->iop.error		= 0;
 	s->iop.flags		= 0;
 	s->iop.flush_journal	= (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
+	s->iop.wq		= bcache_wq;
 
 	return s;
 }
@@ -1203,22 +1044,13 @@ void bch_cached_dev_request_init(struct cached_dev *dc)
 static int flash_dev_cache_miss(struct btree *b, struct search *s,
 				struct bio *bio, unsigned sectors)
 {
-	struct bio_vec bv;
-	struct bvec_iter iter;
-
-	/* Zero fill bio */
+	unsigned bytes = min(sectors, bio_sectors(bio)) << 9;
 
-	bio_for_each_segment(bv, bio, iter) {
-		unsigned j = min(bv.bv_len >> 9, sectors);
-
-		void *p = kmap(bv.bv_page);
-		memset(p + bv.bv_offset, 0, j << 9);
-		kunmap(bv.bv_page);
-
-		sectors	-= j;
-	}
+	swap(bio->bi_iter.bi_size, bytes);
+	zero_fill_bio(bio);
+	swap(bio->bi_iter.bi_size, bytes);
 
-	bio_advance(bio, min(sectors << 9, bio->bi_iter.bi_size));
+	bio_advance(bio, bytes);
 
 	if (!bio->bi_iter.bi_size)
 		return MAP_DONE;
@@ -1313,9 +1145,6 @@ void bch_flash_dev_request_init(struct bcache_device *d)
 
 void bch_request_exit(void)
 {
-#ifdef CONFIG_CGROUP_BCACHE
-	cgroup_unload_subsys(&bcache_subsys);
-#endif
 	if (bch_search_cache)
 		kmem_cache_destroy(bch_search_cache);
 }
@@ -1326,11 +1155,5 @@ int __init bch_request_init(void)
 	if (!bch_search_cache)
 		return -ENOMEM;
 
-#ifdef CONFIG_CGROUP_BCACHE
-	cgroup_load_subsys(&bcache_subsys);
-	init_bch_cgroup(&bcache_default_cgroup);
-
-	cgroup_add_cftypes(&bcache_subsys, bch_files);
-#endif
 	return 0;
 }
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 39f21dbedc38..1ff36875c2b3 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -1,12 +1,11 @@
 #ifndef _BCACHE_REQUEST_H_
 #define _BCACHE_REQUEST_H_
 
-#include <linux/cgroup.h>
-
 struct data_insert_op {
 	struct closure		cl;
 	struct cache_set	*c;
 	struct bio		*bio;
+	struct workqueue_struct *wq;
 
 	unsigned		inode;
 	uint16_t		write_point;
@@ -41,20 +40,4 @@ void bch_flash_dev_request_init(struct bcache_device *d);
 
 extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache;
 
-struct bch_cgroup {
-#ifdef CONFIG_CGROUP_BCACHE
-	struct cgroup_subsys_state	css;
-#endif
-	/*
-	 * We subtract one from the index into bch_cache_modes[], so that
-	 * default == -1; this makes it so the rest match up with d->cache_mode,
-	 * and we use d->cache_mode if cgrp->cache_mode < 0
-	 */
-	short				cache_mode;
-	bool				verify;
-	struct cache_stat_collector	stats;
-};
-
-struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio);
-
 #endif /* _BCACHE_REQUEST_H_ */
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
index 84d0782f702e..0ca072c20d0d 100644
--- a/drivers/md/bcache/stats.c
+++ b/drivers/md/bcache/stats.c
@@ -201,9 +201,6 @@ void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d,
 	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
 	mark_cache_stats(&dc->accounting.collector, hit, bypass);
 	mark_cache_stats(&c->accounting.collector, hit, bypass);
-#ifdef CONFIG_CGROUP_BCACHE
-	mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass);
-#endif
 }
 
 void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 24a3a1546caa..926ded8ccbf5 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -541,9 +541,6 @@ static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
 	closure_sync(cl);
 }
 
-#define buckets_free(c)	"free %zu, free_inc %zu, unused %zu",		\
-	fifo_used(&c->free), fifo_used(&c->free_inc), fifo_used(&c->unused)
-
 void bch_prio_write(struct cache *ca)
 {
 	int i;
@@ -554,10 +551,6 @@ void bch_prio_write(struct cache *ca)
 
 	lockdep_assert_held(&ca->set->bucket_lock);
 
-	for (b = ca->buckets;
-	     b < ca->buckets + ca->sb.nbuckets; b++)
-		b->disk_gen = b->gen;
-
 	ca->disk_buckets->seq++;
 
 	atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
@@ -601,14 +594,17 @@ void bch_prio_write(struct cache *ca)
 
 	mutex_lock(&ca->set->bucket_lock);
 
-	ca->need_save_prio = 0;
-
 	/*
 	 * Don't want the old priorities to get garbage collected until after we
 	 * finish writing the new ones, and they're journalled
 	 */
-	for (i = 0; i < prio_buckets(ca); i++)
+	for (i = 0; i < prio_buckets(ca); i++) {
+		if (ca->prio_last_buckets[i])
+			__bch_bucket_free(ca,
+				&ca->buckets[ca->prio_last_buckets[i]]);
+
 		ca->prio_last_buckets[i] = ca->prio_buckets[i];
+	}
 }
 
 static void prio_read(struct cache *ca, uint64_t bucket)
@@ -639,7 +635,7 @@ static void prio_read(struct cache *ca, uint64_t bucket)
 		}
 
 		b->prio = le16_to_cpu(d->prio);
-		b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen;
+		b->gen = b->last_gc = d->gen;
 	}
 }
 
@@ -843,6 +839,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 	q->limits.max_segment_size	= UINT_MAX;
 	q->limits.max_segments		= BIO_MAX_PAGES;
 	q->limits.max_discard_sectors	= UINT_MAX;
+	q->limits.discard_granularity	= 512;
 	q->limits.io_min		= block_size;
 	q->limits.logical_block_size	= block_size;
 	q->limits.physical_block_size	= block_size;
@@ -1355,6 +1352,8 @@ static void cache_set_free(struct closure *cl)
 	bch_bset_sort_state_free(&c->sort);
 	free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
 
+	if (c->moving_gc_wq)
+		destroy_workqueue(c->moving_gc_wq);
 	if (c->bio_split)
 		bioset_free(c->bio_split);
 	if (c->fill_iter)
@@ -1395,14 +1394,21 @@ static void cache_set_flush(struct closure *cl)
 		list_add(&c->root->list, &c->btree_cache);
 
 	/* Should skip this if we're unregistering because of an error */
-	list_for_each_entry(b, &c->btree_cache, list)
+	list_for_each_entry(b, &c->btree_cache, list) {
+		mutex_lock(&b->write_lock);
 		if (btree_node_dirty(b))
-			bch_btree_node_write(b, NULL);
+			__bch_btree_node_write(b, NULL);
+		mutex_unlock(&b->write_lock);
+	}
 
 	for_each_cache(ca, c, i)
 		if (ca->alloc_thread)
 			kthread_stop(ca->alloc_thread);
 
+	cancel_delayed_work_sync(&c->journal.work);
+	/* flush last journal entry if needed */
+	c->journal.work.work.func(&c->journal.work.work);
+
 	closure_return(cl);
 }
 
@@ -1485,14 +1491,13 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 
 	sema_init(&c->sb_write_mutex, 1);
 	mutex_init(&c->bucket_lock);
-	init_waitqueue_head(&c->try_wait);
+	init_waitqueue_head(&c->btree_cache_wait);
 	init_waitqueue_head(&c->bucket_wait);
 	sema_init(&c->uuid_write_mutex, 1);
 
 	spin_lock_init(&c->btree_gc_time.lock);
 	spin_lock_init(&c->btree_split_time.lock);
 	spin_lock_init(&c->btree_read_time.lock);
-	spin_lock_init(&c->try_harder_time.lock);
 
 	bch_moving_init_cache_set(c);
 
@@ -1517,6 +1522,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
 	    !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
 	    !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
 	    !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
+	    !(c->moving_gc_wq = create_workqueue("bcache_gc")) ||
 	    bch_journal_alloc(c) ||
 	    bch_btree_cache_alloc(c) ||
 	    bch_open_buckets_alloc(c) ||
@@ -1580,7 +1586,7 @@ static void run_cache_set(struct cache_set *c)
 			goto err;
 
 		err = "error reading btree root";
-		c->root = bch_btree_node_get(c, k, j->btree_level, true);
+		c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true);
 		if (IS_ERR_OR_NULL(c->root))
 			goto err;
 
@@ -1596,7 +1602,7 @@ static void run_cache_set(struct cache_set *c)
 			goto err;
 
 		bch_journal_mark(c, &journal);
-		bch_btree_gc_finish(c);
+		bch_initial_gc_finish(c);
 		pr_debug("btree_check() done");
 
 		/*
@@ -1638,7 +1644,7 @@ static void run_cache_set(struct cache_set *c)
 				ca->sb.d[j] = ca->sb.first_bucket + j;
 		}
 
-		bch_btree_gc_finish(c);
+		bch_initial_gc_finish(c);
 
 		err = "error starting allocator thread";
 		for_each_cache(ca, c, i)
@@ -1655,12 +1661,14 @@ static void run_cache_set(struct cache_set *c)
 			goto err;
 
 		err = "cannot allocate new btree root";
-		c->root = bch_btree_node_alloc(c, 0, true);
+		c->root = bch_btree_node_alloc(c, NULL, 0);
 		if (IS_ERR_OR_NULL(c->root))
 			goto err;
 
+		mutex_lock(&c->root->write_lock);
 		bkey_copy_key(&c->root->key, &MAX_KEY);
 		bch_btree_node_write(c->root, &cl);
+		mutex_unlock(&c->root->write_lock);
 
 		bch_btree_set_root(c->root);
 		rw_unlock(true, c->root);
@@ -1782,7 +1790,6 @@ void bch_cache_release(struct kobject *kobj)
 	vfree(ca->buckets);
 
 	free_heap(&ca->heap);
-	free_fifo(&ca->unused);
 	free_fifo(&ca->free_inc);
 
 	for (i = 0; i < RESERVE_NR; i++)
@@ -1819,7 +1826,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
 	    !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
 	    !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
 	    !init_fifo(&ca->free_inc,	free << 2, GFP_KERNEL) ||
-	    !init_fifo(&ca->unused,	free << 2, GFP_KERNEL) ||
 	    !init_heap(&ca->heap,	free << 3, GFP_KERNEL) ||
 	    !(ca->buckets	= vzalloc(sizeof(struct bucket) *
 					  ca->sb.nbuckets)) ||
@@ -1834,13 +1840,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca)
 	for_each_bucket(b, ca)
 		atomic_set(&b->pin, 0);
 
-	if (bch_cache_allocator_init(ca))
-		goto err;
-
 	return 0;
-err:
-	kobject_put(&ca->kobj);
-	return -ENOMEM;
 }
 
 static void register_cache(struct cache_sb *sb, struct page *sb_page,
@@ -1869,7 +1869,10 @@ static void register_cache(struct cache_sb *sb, struct page *sb_page,
 	if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache"))
 		goto err;
 
+	mutex_lock(&bch_register_lock);
 	err = register_cache_set(ca);
+	mutex_unlock(&bch_register_lock);
+
 	if (err)
 		goto err;
 
@@ -1931,8 +1934,6 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 	if (!try_module_get(THIS_MODULE))
 		return -EBUSY;
 
-	mutex_lock(&bch_register_lock);
-
 	if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
 	    !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
 		goto err;
@@ -1965,7 +1966,9 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 		if (!dc)
 			goto err_close;
 
+		mutex_lock(&bch_register_lock);
 		register_bdev(sb, sb_page, bdev, dc);
+		mutex_unlock(&bch_register_lock);
 	} else {
 		struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
 		if (!ca)
@@ -1978,7 +1981,6 @@ out:
 		put_page(sb_page);
 	kfree(sb);
 	kfree(path);
-	mutex_unlock(&bch_register_lock);
 	module_put(THIS_MODULE);
 	return ret;
 
@@ -2057,7 +2059,6 @@ static void bcache_exit(void)
 {
 	bch_debug_exit();
 	bch_request_exit();
-	bch_btree_exit();
 	if (bcache_kobj)
 		kobject_put(bcache_kobj);
 	if (bcache_wq)
@@ -2087,7 +2088,6 @@ static int __init bcache_init(void)
 	if (!(bcache_wq = create_workqueue("bcache")) ||
 	    !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
 	    sysfs_create_files(bcache_kobj, files) ||
-	    bch_btree_init() ||
 	    bch_request_init() ||
 	    bch_debug_init(bcache_kobj))
 		goto err;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index d8458d477a12..b3ff57d61dde 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -54,7 +54,6 @@ sysfs_time_stats_attribute(btree_gc,	sec, ms);
 sysfs_time_stats_attribute(btree_split, sec, us);
 sysfs_time_stats_attribute(btree_sort,	ms,  us);
 sysfs_time_stats_attribute(btree_read,	ms,  us);
-sysfs_time_stats_attribute(try_harder,	ms,  us);
 
 read_attribute(btree_nodes);
 read_attribute(btree_used_percent);
@@ -406,7 +405,7 @@ struct bset_stats_op {
 	struct bset_stats stats;
 };
 
-static int btree_bset_stats(struct btree_op *b_op, struct btree *b)
+static int bch_btree_bset_stats(struct btree_op *b_op, struct btree *b)
 {
 	struct bset_stats_op *op = container_of(b_op, struct bset_stats_op, op);
 
@@ -424,7 +423,7 @@ static int bch_bset_print_stats(struct cache_set *c, char *buf)
 	memset(&op, 0, sizeof(op));
 	bch_btree_op_init(&op.op, -1);
 
-	ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, btree_bset_stats);
+	ret = bch_btree_map_nodes(&op.op, c, &ZERO_KEY, bch_btree_bset_stats);
 	if (ret < 0)
 		return ret;
 
@@ -442,81 +441,81 @@ static int bch_bset_print_stats(struct cache_set *c, char *buf)
 			op.stats.floats, op.stats.failed);
 }
 
-SHOW(__bch_cache_set)
+static unsigned bch_root_usage(struct cache_set *c)
 {
-	unsigned root_usage(struct cache_set *c)
-	{
-		unsigned bytes = 0;
-		struct bkey *k;
-		struct btree *b;
-		struct btree_iter iter;
+	unsigned bytes = 0;
+	struct bkey *k;
+	struct btree *b;
+	struct btree_iter iter;
 
-		goto lock_root;
+	goto lock_root;
 
-		do {
-			rw_unlock(false, b);
+	do {
+		rw_unlock(false, b);
 lock_root:
-			b = c->root;
-			rw_lock(false, b, b->level);
-		} while (b != c->root);
+		b = c->root;
+		rw_lock(false, b, b->level);
+	} while (b != c->root);
 
-		for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
-			bytes += bkey_bytes(k);
+	for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
+		bytes += bkey_bytes(k);
 
-		rw_unlock(false, b);
+	rw_unlock(false, b);
 
-		return (bytes * 100) / btree_bytes(c);
-	}
+	return (bytes * 100) / btree_bytes(c);
+}
 
-	size_t cache_size(struct cache_set *c)
-	{
-		size_t ret = 0;
-		struct btree *b;
+static size_t bch_cache_size(struct cache_set *c)
+{
+	size_t ret = 0;
+	struct btree *b;
 
-		mutex_lock(&c->bucket_lock);
-		list_for_each_entry(b, &c->btree_cache, list)
-			ret += 1 << (b->keys.page_order + PAGE_SHIFT);
+	mutex_lock(&c->bucket_lock);
+	list_for_each_entry(b, &c->btree_cache, list)
+		ret += 1 << (b->keys.page_order + PAGE_SHIFT);
 
-		mutex_unlock(&c->bucket_lock);
-		return ret;
-	}
-
-	unsigned cache_max_chain(struct cache_set *c)
-	{
-		unsigned ret = 0;
-		struct hlist_head *h;
+	mutex_unlock(&c->bucket_lock);
+	return ret;
+}
 
-		mutex_lock(&c->bucket_lock);
+static unsigned bch_cache_max_chain(struct cache_set *c)
+{
+	unsigned ret = 0;
+	struct hlist_head *h;
 
-		for (h = c->bucket_hash;
-		     h < c->bucket_hash + (1 << BUCKET_HASH_BITS);
-		     h++) {
-			unsigned i = 0;
-			struct hlist_node *p;
+	mutex_lock(&c->bucket_lock);
 
-			hlist_for_each(p, h)
-				i++;
+	for (h = c->bucket_hash;
+	     h < c->bucket_hash + (1 << BUCKET_HASH_BITS);
+	     h++) {
+		unsigned i = 0;
+		struct hlist_node *p;
 
-			ret = max(ret, i);
-		}
+		hlist_for_each(p, h)
+			i++;
 
-		mutex_unlock(&c->bucket_lock);
-		return ret;
+		ret = max(ret, i);
 	}
 
-	unsigned btree_used(struct cache_set *c)
-	{
-		return div64_u64(c->gc_stats.key_bytes * 100,
-				 (c->gc_stats.nodes ?: 1) * btree_bytes(c));
-	}
+	mutex_unlock(&c->bucket_lock);
+	return ret;
+}
 
-	unsigned average_key_size(struct cache_set *c)
-	{
-		return c->gc_stats.nkeys
-			? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
-			: 0;
-	}
+static unsigned bch_btree_used(struct cache_set *c)
+{
+	return div64_u64(c->gc_stats.key_bytes * 100,
+			 (c->gc_stats.nodes ?: 1) * btree_bytes(c));
+}
 
+static unsigned bch_average_key_size(struct cache_set *c)
+{
+	return c->gc_stats.nkeys
+		? div64_u64(c->gc_stats.data, c->gc_stats.nkeys)
+		: 0;
+}
+
+SHOW(__bch_cache_set)
+{
 	struct cache_set *c = container_of(kobj, struct cache_set, kobj);
 
 	sysfs_print(synchronous,		CACHE_SYNC(&c->sb));
@@ -524,21 +523,20 @@ lock_root:
 	sysfs_hprint(bucket_size,		bucket_bytes(c));
 	sysfs_hprint(block_size,		block_bytes(c));
 	sysfs_print(tree_depth,			c->root->level);
-	sysfs_print(root_usage_percent,		root_usage(c));
+	sysfs_print(root_usage_percent,		bch_root_usage(c));
 
-	sysfs_hprint(btree_cache_size,		cache_size(c));
-	sysfs_print(btree_cache_max_chain,	cache_max_chain(c));
+	sysfs_hprint(btree_cache_size,		bch_cache_size(c));
+	sysfs_print(btree_cache_max_chain,	bch_cache_max_chain(c));
 	sysfs_print(cache_available_percent,	100 - c->gc_stats.in_use);
 
 	sysfs_print_time_stats(&c->btree_gc_time,	btree_gc, sec, ms);
 	sysfs_print_time_stats(&c->btree_split_time,	btree_split, sec, us);
 	sysfs_print_time_stats(&c->sort.time,		btree_sort, ms, us);
 	sysfs_print_time_stats(&c->btree_read_time,	btree_read, ms, us);
-	sysfs_print_time_stats(&c->try_harder_time,	try_harder, ms, us);
 
-	sysfs_print(btree_used_percent,	btree_used(c));
+	sysfs_print(btree_used_percent,	bch_btree_used(c));
 	sysfs_print(btree_nodes,	c->gc_stats.nodes);
-	sysfs_hprint(average_key_size,	average_key_size(c));
+	sysfs_hprint(average_key_size,	bch_average_key_size(c));
 
 	sysfs_print(cache_read_races,
 		    atomic_long_read(&c->cache_read_races));
@@ -709,7 +707,6 @@ static struct attribute *bch_cache_set_internal_files[] = {
 	sysfs_time_stats_attribute_list(btree_split, sec, us)
 	sysfs_time_stats_attribute_list(btree_sort, ms, us)
 	sysfs_time_stats_attribute_list(btree_read, ms, us)
-	sysfs_time_stats_attribute_list(try_harder, ms, us)
 
 	&sysfs_btree_nodes,
 	&sysfs_btree_used_percent,
@@ -761,7 +758,9 @@ SHOW(__bch_cache)
 		int cmp(const void *l, const void *r)
 		{	return *((uint16_t *) r) - *((uint16_t *) l); }
 
-		size_t n = ca->sb.nbuckets, i, unused, btree;
+		struct bucket *b;
+		size_t n = ca->sb.nbuckets, i;
+		size_t unused = 0, available = 0, dirty = 0, meta = 0;
 		uint64_t sum = 0;
 		/* Compute 31 quantiles */
 		uint16_t q[31], *p, *cached;
@@ -772,6 +771,17 @@ SHOW(__bch_cache)
 			return -ENOMEM;
 
 		mutex_lock(&ca->set->bucket_lock);
+		for_each_bucket(b, ca) {
+			if (!GC_SECTORS_USED(b))
+				unused++;
+			if (GC_MARK(b) == GC_MARK_RECLAIMABLE)
+				available++;
+			if (GC_MARK(b) == GC_MARK_DIRTY)
+				dirty++;
+			if (GC_MARK(b) == GC_MARK_METADATA)
+				meta++;
+		}
+
 		for (i = ca->sb.first_bucket; i < n; i++)
 			p[i] = ca->buckets[i].prio;
 		mutex_unlock(&ca->set->bucket_lock);
@@ -786,10 +796,7 @@ SHOW(__bch_cache)
 
 		while (cached < p + n &&
 		       *cached == BTREE_PRIO)
-			cached++;
-
-		btree = cached - p;
-		n -= btree;
+			cached++, n--;
 
 		for (i = 0; i < n; i++)
 			sum += INITIAL_PRIO - cached[i];
@@ -805,12 +812,16 @@ SHOW(__bch_cache)
 
 		ret = scnprintf(buf, PAGE_SIZE,
 				"Unused:		%zu%%\n"
+				"Clean:		%zu%%\n"
+				"Dirty:		%zu%%\n"
 				"Metadata:	%zu%%\n"
 				"Average:	%llu\n"
 				"Sectors per Q:	%zu\n"
 				"Quantiles:	[",
 				unused * 100 / (size_t) ca->sb.nbuckets,
-				btree * 100 / (size_t) ca->sb.nbuckets, sum,
+				available * 100 / (size_t) ca->sb.nbuckets,
+				dirty * 100 / (size_t) ca->sb.nbuckets,
+				meta * 100 / (size_t) ca->sb.nbuckets, sum,
 				n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1));
 
 		for (i = 0; i < ARRAY_SIZE(q); i++)
diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c
index adbc3df17a80..b7820b0d2621 100644
--- a/drivers/md/bcache/trace.c
+++ b/drivers/md/bcache/trace.c
@@ -45,7 +45,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root);
 
-EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_invalidate);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_invalidate);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail);
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 1e018e986610..0e385e40909e 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -872,7 +872,7 @@ static void mq_destroy(struct dm_cache_policy *p)
 {
 	struct mq_policy *mq = to_mq_policy(p);
 
-	kfree(mq->table);
+	vfree(mq->table);
 	epool_exit(&mq->cache_pool);
 	epool_exit(&mq->pre_cache_pool);
 	kfree(mq);
@@ -1245,7 +1245,7 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
 
 	mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
 	mq->hash_bits = ffs(mq->nr_buckets) - 1;
-	mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL);
+	mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets);
 	if (!mq->table)
 		goto bad_alloc_table;
 
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index ffd472e015ca..074b9c8e4cf0 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -289,6 +289,7 @@ struct per_bio_data {
 	bool tick:1;
 	unsigned req_nr:2;
 	struct dm_deferred_entry *all_io_entry;
+	struct dm_hook_info hook_info;
 
 	/*
 	 * writethrough fields.  These MUST remain at the end of this
@@ -297,7 +298,6 @@ struct per_bio_data {
 	 */
 	struct cache *cache;
 	dm_cblock_t cblock;
-	struct dm_hook_info hook_info;
 	struct dm_bio_details bio_details;
 };
 
@@ -671,15 +671,16 @@ static void remap_to_cache(struct cache *cache, struct bio *bio,
 			   dm_cblock_t cblock)
 {
 	sector_t bi_sector = bio->bi_iter.bi_sector;
+	sector_t block = from_cblock(cblock);
 
 	bio->bi_bdev = cache->cache_dev->bdev;
 	if (!block_size_is_power_of_two(cache))
 		bio->bi_iter.bi_sector =
-			(from_cblock(cblock) * cache->sectors_per_block) +
+			(block * cache->sectors_per_block) +
 			sector_div(bi_sector, cache->sectors_per_block);
 	else
 		bio->bi_iter.bi_sector =
-			(from_cblock(cblock) << cache->sectors_per_block_shift) |
+			(block << cache->sectors_per_block_shift) |
 			(bi_sector & (cache->sectors_per_block - 1));
 }
 
@@ -978,12 +979,13 @@ static void issue_copy_real(struct dm_cache_migration *mg)
 	int r;
 	struct dm_io_region o_region, c_region;
 	struct cache *cache = mg->cache;
+	sector_t cblock = from_cblock(mg->cblock);
 
 	o_region.bdev = cache->origin_dev->bdev;
 	o_region.count = cache->sectors_per_block;
 
 	c_region.bdev = cache->cache_dev->bdev;
-	c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
+	c_region.sector = cblock * cache->sectors_per_block;
 	c_region.count = cache->sectors_per_block;
 
 	if (mg->writeback || mg->demote) {
@@ -1010,13 +1012,15 @@ static void overwrite_endio(struct bio *bio, int err)
 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 	unsigned long flags;
 
+	dm_unhook_bio(&pb->hook_info, bio);
+
 	if (err)
 		mg->err = true;
 
+	mg->requeue_holder = false;
+
 	spin_lock_irqsave(&cache->lock, flags);
 	list_add_tail(&mg->list, &cache->completed_migrations);
-	dm_unhook_bio(&pb->hook_info, bio);
-	mg->requeue_holder = false;
 	spin_unlock_irqrestore(&cache->lock, flags);
 
 	wake_worker(cache);
@@ -2461,20 +2465,18 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 	bool discarded_block;
 	struct dm_bio_prison_cell *cell;
 	struct policy_result lookup_result;
-	struct per_bio_data *pb;
+	struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
 
-	if (from_oblock(block) > from_oblock(cache->origin_blocks)) {
+	if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
 		/*
 		 * This can only occur if the io goes to a partial block at
 		 * the end of the origin device.  We don't cache these.
 		 * Just remap to the origin and carry on.
 		 */
-		remap_to_origin_clear_discard(cache, bio, block);
+		remap_to_origin(cache, bio);
 		return DM_MAPIO_REMAPPED;
 	}
 
-	pb = init_per_bio_data(bio, pb_data_size);
-
 	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
 		defer_bio(cache, bio);
 		return DM_MAPIO_SUBMITTED;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index b2b8a10e8427..3842ac738f98 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -201,29 +201,28 @@ static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offse
 /*
  * Functions for getting the pages from a bvec.
  */
-static void bio_get_page(struct dpages *dp,
-		  struct page **p, unsigned long *len, unsigned *offset)
+static void bio_get_page(struct dpages *dp, struct page **p,
+			 unsigned long *len, unsigned *offset)
 {
-	struct bio *bio = dp->context_ptr;
-	struct bio_vec bvec = bio_iovec(bio);
-	*p = bvec.bv_page;
-	*len = bvec.bv_len;
-	*offset = bvec.bv_offset;
+	struct bio_vec *bvec = dp->context_ptr;
+	*p = bvec->bv_page;
+	*len = bvec->bv_len - dp->context_u;
+	*offset = bvec->bv_offset + dp->context_u;
 }
 
 static void bio_next_page(struct dpages *dp)
 {
-	struct bio *bio = dp->context_ptr;
-	struct bio_vec bvec = bio_iovec(bio);
-
-	bio_advance(bio, bvec.bv_len);
+	struct bio_vec *bvec = dp->context_ptr;
+	dp->context_ptr = bvec + 1;
+	dp->context_u = 0;
 }
 
 static void bio_dp_init(struct dpages *dp, struct bio *bio)
 {
 	dp->get_page = bio_get_page;
 	dp->next_page = bio_next_page;
-	dp->context_ptr = bio;
+	dp->context_ptr = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+	dp->context_u = bio->bi_iter.bi_bvec_done;
 }
 
 /*
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 08d9a207259a..b428c0ae63d5 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -66,7 +66,7 @@ static int dm_ulog_sendto_server(struct dm_ulog_request *tfr)
 	msg->seq = tfr->seq;
 	msg->len = sizeof(struct dm_ulog_request) + tfr->data_size;
 
-	r = cn_netlink_send(msg, 0, gfp_any());
+	r = cn_netlink_send(msg, 0, 0, gfp_any());
 
 	return r;
 }
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 6eb9dc9ef8f3..422a9fdeb53e 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1626,8 +1626,11 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 	/*
 	 * Only pass ioctls through if the device sizes match exactly.
 	 */
-	if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
-		r = scsi_verify_blk_ioctl(NULL, cmd);
+	if (!bdev || ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) {
+		int err = scsi_verify_blk_ioctl(NULL, cmd);
+		if (err)
+			r = err;
+	}
 
 	if (r == -ENOTCONN && !fatal_signal_pending(current))
 		queue_work(kmultipathd, &m->process_queued_ios);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index f284e0bfb25f..7dfdb5c746d6 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1244,6 +1244,9 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
 
 			dm_bio_restore(bd, bio);
 			bio_record->details.bi_bdev = NULL;
+
+			atomic_inc(&bio->bi_remaining);
+
 			queue_bio(ms, bio, rw);
 			return DM_ENDIO_INCOMPLETE;
 		}
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index afc3d017de4c..d6e88178d22c 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -546,6 +546,9 @@ static int read_exceptions(struct pstore *ps,
 		r = insert_exceptions(ps, area, callback, callback_context,
 				      &full);
 
+		if (!full)
+			memcpy(ps->area, area, ps->store->chunk_size << SECTOR_SHIFT);
+
 		dm_bufio_release(bp);
 
 		dm_bufio_forget(client, chunk);
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 7da347665552..fb9efc829182 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -76,7 +76,7 @@
 
 #define THIN_SUPERBLOCK_MAGIC 27022010
 #define THIN_SUPERBLOCK_LOCATION 0
-#define THIN_VERSION 1
+#define THIN_VERSION 2
 #define THIN_METADATA_CACHE_SIZE 64
 #define SECTOR_TO_BLOCK_SHIFT 3
 
@@ -483,7 +483,7 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd)
 
 	disk_super->data_mapping_root = cpu_to_le64(pmd->root);
 	disk_super->device_details_root = cpu_to_le64(pmd->details_root);
-	disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
+	disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE);
 	disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
 	disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
 
@@ -651,7 +651,7 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f
 {
 	int r;
 
-	pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE,
+	pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
 					  THIN_METADATA_CACHE_SIZE,
 					  THIN_MAX_CONCURRENT_LOCKS);
 	if (IS_ERR(pmd->bm)) {
@@ -1489,6 +1489,23 @@ bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
 	return r;
 }
 
+bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd)
+{
+	bool r = false;
+	struct dm_thin_device *td, *tmp;
+
+	down_read(&pmd->root_lock);
+	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
+		if (td->changed) {
+			r = td->changed;
+			break;
+		}
+	}
+	up_read(&pmd->root_lock);
+
+	return r;
+}
+
 bool dm_thin_aborted_changes(struct dm_thin_device *td)
 {
 	bool r;
@@ -1738,3 +1755,38 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 
 	return r;
 }
+
+int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
+{
+	int r;
+	struct dm_block *sblock;
+	struct thin_disk_superblock *disk_super;
+
+	down_write(&pmd->root_lock);
+	pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
+
+	r = superblock_lock(pmd, &sblock);
+	if (r) {
+		DMERR("couldn't read superblock");
+		goto out;
+	}
+
+	disk_super = dm_block_data(sblock);
+	disk_super->flags = cpu_to_le32(pmd->flags);
+
+	dm_bm_unlock(sblock);
+out:
+	up_write(&pmd->root_lock);
+	return r;
+}
+
+bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
+{
+	bool needs_check;
+
+	down_read(&pmd->root_lock);
+	needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
+	up_read(&pmd->root_lock);
+
+	return needs_check;
+}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 9a368567632f..e3c857db195a 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -9,16 +9,14 @@
 
 #include "persistent-data/dm-block-manager.h"
 #include "persistent-data/dm-space-map.h"
+#include "persistent-data/dm-space-map-metadata.h"
 
-#define THIN_METADATA_BLOCK_SIZE 4096
+#define THIN_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE
 
 /*
  * The metadata device is currently limited in size.
- *
- * We have one block of index, which can hold 255 index entries.  Each
- * index entry contains allocation info about 16k metadata blocks.
  */
-#define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
+#define THIN_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS
 
 /*
  * A metadata device larger than 16GB triggers a warning.
@@ -27,6 +25,11 @@
 
 /*----------------------------------------------------------------*/
 
+/*
+ * Thin metadata superblock flags.
+ */
+#define THIN_METADATA_NEEDS_CHECK_FLAG (1 << 0)
+
 struct dm_pool_metadata;
 struct dm_thin_device;
 
@@ -161,6 +164,8 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block);
  */
 bool dm_thin_changed_this_transaction(struct dm_thin_device *td);
 
+bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd);
+
 bool dm_thin_aborted_changes(struct dm_thin_device *td);
 
 int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
@@ -202,6 +207,12 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 					dm_sm_threshold_fn fn,
 					void *context);
 
+/*
+ * Updates the superblock immediately.
+ */
+int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd);
+bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd);
+
 /*----------------------------------------------------------------*/
 
 #endif
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index faaf944597ab..be70d38745f7 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -130,10 +130,11 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
 struct dm_thin_new_mapping;
 
 /*
- * The pool runs in 3 modes.  Ordered in degraded order for comparisons.
+ * The pool runs in 4 modes.  Ordered in degraded order for comparisons.
  */
 enum pool_mode {
 	PM_WRITE,		/* metadata may be changed */
+	PM_OUT_OF_DATA_SPACE,	/* metadata may be changed, though data may not be allocated */
 	PM_READ_ONLY,		/* metadata may not be changed */
 	PM_FAIL,		/* all I/O fails */
 };
@@ -198,7 +199,6 @@ struct pool {
 };
 
 static enum pool_mode get_pool_mode(struct pool *pool);
-static void out_of_data_space(struct pool *pool);
 static void metadata_operation_failed(struct pool *pool, const char *op, int r);
 
 /*
@@ -226,6 +226,7 @@ struct thin_c {
 
 	struct pool *pool;
 	struct dm_thin_device *td;
+	bool requeue_mode:1;
 };
 
 /*----------------------------------------------------------------*/
@@ -369,14 +370,18 @@ struct dm_thin_endio_hook {
 	struct dm_thin_new_mapping *overwrite_mapping;
 };
 
-static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
+static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 {
 	struct bio *bio;
 	struct bio_list bios;
+	unsigned long flags;
 
 	bio_list_init(&bios);
+
+	spin_lock_irqsave(&tc->pool->lock, flags);
 	bio_list_merge(&bios, master);
 	bio_list_init(master);
+	spin_unlock_irqrestore(&tc->pool->lock, flags);
 
 	while ((bio = bio_list_pop(&bios))) {
 		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
@@ -391,12 +396,26 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 static void requeue_io(struct thin_c *tc)
 {
 	struct pool *pool = tc->pool;
+
+	requeue_bio_list(tc, &pool->deferred_bios);
+	requeue_bio_list(tc, &pool->retry_on_resume_list);
+}
+
+static void error_retry_list(struct pool *pool)
+{
+	struct bio *bio;
 	unsigned long flags;
+	struct bio_list bios;
+
+	bio_list_init(&bios);
 
 	spin_lock_irqsave(&pool->lock, flags);
-	__requeue_bio_list(tc, &pool->deferred_bios);
-	__requeue_bio_list(tc, &pool->retry_on_resume_list);
+	bio_list_merge(&bios, &pool->retry_on_resume_list);
+	bio_list_init(&pool->retry_on_resume_list);
 	spin_unlock_irqrestore(&pool->lock, flags);
+
+	while ((bio = bio_list_pop(&bios)))
+		bio_io_error(bio);
 }
 
 /*
@@ -925,13 +944,15 @@ static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
 	}
 }
 
+static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
+
 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 {
 	int r;
 	dm_block_t free_blocks;
 	struct pool *pool = tc->pool;
 
-	if (get_pool_mode(pool) != PM_WRITE)
+	if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
 		return -EINVAL;
 
 	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
@@ -958,7 +979,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 		}
 
 		if (!free_blocks) {
-			out_of_data_space(pool);
+			set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
 			return -ENOSPC;
 		}
 	}
@@ -988,15 +1009,32 @@ static void retry_on_resume(struct bio *bio)
 	spin_unlock_irqrestore(&pool->lock, flags);
 }
 
-static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
+static bool should_error_unserviceable_bio(struct pool *pool)
 {
-	/*
-	 * When pool is read-only, no cell locking is needed because
-	 * nothing is changing.
-	 */
-	WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY);
+	enum pool_mode m = get_pool_mode(pool);
 
-	if (pool->pf.error_if_no_space)
+	switch (m) {
+	case PM_WRITE:
+		/* Shouldn't get here */
+		DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
+		return true;
+
+	case PM_OUT_OF_DATA_SPACE:
+		return pool->pf.error_if_no_space;
+
+	case PM_READ_ONLY:
+	case PM_FAIL:
+		return true;
+	default:
+		/* Shouldn't get here */
+		DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
+		return true;
+	}
+}
+
+static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
+{
+	if (should_error_unserviceable_bio(pool))
 		bio_io_error(bio);
 	else
 		retry_on_resume(bio);
@@ -1007,11 +1045,20 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
 	struct bio *bio;
 	struct bio_list bios;
 
+	if (should_error_unserviceable_bio(pool)) {
+		cell_error(pool, cell);
+		return;
+	}
+
 	bio_list_init(&bios);
 	cell_release(pool, cell, &bios);
 
-	while ((bio = bio_list_pop(&bios)))
-		handle_unserviceable_bio(pool, bio);
+	if (should_error_unserviceable_bio(pool))
+		while ((bio = bio_list_pop(&bios)))
+			bio_io_error(bio);
+	else
+		while ((bio = bio_list_pop(&bios)))
+			retry_on_resume(bio);
 }
 
 static void process_discard(struct thin_c *tc, struct bio *bio)
@@ -1296,6 +1343,11 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
 	}
 }
 
+static void process_bio_success(struct thin_c *tc, struct bio *bio)
+{
+	bio_endio(bio, 0);
+}
+
 static void process_bio_fail(struct thin_c *tc, struct bio *bio)
 {
 	bio_io_error(bio);
@@ -1328,6 +1380,11 @@ static void process_deferred_bios(struct pool *pool)
 		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 		struct thin_c *tc = h->tc;
 
+		if (tc->requeue_mode) {
+			bio_endio(bio, DM_ENDIO_REQUEUE);
+			continue;
+		}
+
 		/*
 		 * If we've got no free new_mapping structs, and processing
 		 * this bio might require one, we pause until there are some
@@ -1357,7 +1414,8 @@ static void process_deferred_bios(struct pool *pool)
 	bio_list_init(&pool->deferred_flush_bios);
 	spin_unlock_irqrestore(&pool->lock, flags);
 
-	if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
+	if (bio_list_empty(&bios) &&
+	    !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
 		return;
 
 	if (commit(pool)) {
@@ -1393,51 +1451,134 @@ static void do_waker(struct work_struct *ws)
 
 /*----------------------------------------------------------------*/
 
+struct noflush_work {
+	struct work_struct worker;
+	struct thin_c *tc;
+
+	atomic_t complete;
+	wait_queue_head_t wait;
+};
+
+static void complete_noflush_work(struct noflush_work *w)
+{
+	atomic_set(&w->complete, 1);
+	wake_up(&w->wait);
+}
+
+static void do_noflush_start(struct work_struct *ws)
+{
+	struct noflush_work *w = container_of(ws, struct noflush_work, worker);
+	w->tc->requeue_mode = true;
+	requeue_io(w->tc);
+	complete_noflush_work(w);
+}
+
+static void do_noflush_stop(struct work_struct *ws)
+{
+	struct noflush_work *w = container_of(ws, struct noflush_work, worker);
+	w->tc->requeue_mode = false;
+	complete_noflush_work(w);
+}
+
+static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
+{
+	struct noflush_work w;
+
+	INIT_WORK(&w.worker, fn);
+	w.tc = tc;
+	atomic_set(&w.complete, 0);
+	init_waitqueue_head(&w.wait);
+
+	queue_work(tc->pool->wq, &w.worker);
+
+	wait_event(w.wait, atomic_read(&w.complete));
+}
+
+/*----------------------------------------------------------------*/
+
 static enum pool_mode get_pool_mode(struct pool *pool)
 {
 	return pool->pf.mode;
 }
 
+static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
+{
+	dm_table_event(pool->ti->table);
+	DMINFO("%s: switching pool to %s mode",
+	       dm_device_name(pool->pool_md), new_mode);
+}
+
 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 {
-	int r;
-	enum pool_mode old_mode = pool->pf.mode;
+	struct pool_c *pt = pool->ti->private;
+	bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
+	enum pool_mode old_mode = get_pool_mode(pool);
+
+	/*
+	 * Never allow the pool to transition to PM_WRITE mode if user
+	 * intervention is required to verify metadata and data consistency.
+	 */
+	if (new_mode == PM_WRITE && needs_check) {
+		DMERR("%s: unable to switch pool to write mode until repaired.",
+		      dm_device_name(pool->pool_md));
+		if (old_mode != new_mode)
+			new_mode = old_mode;
+		else
+			new_mode = PM_READ_ONLY;
+	}
+	/*
+	 * If we were in PM_FAIL mode, rollback of metadata failed.  We're
+	 * not going to recover without a thin_repair.	So we never let the
+	 * pool move out of the old mode.
+	 */
+	if (old_mode == PM_FAIL)
+		new_mode = old_mode;
 
 	switch (new_mode) {
 	case PM_FAIL:
 		if (old_mode != new_mode)
-			DMERR("%s: switching pool to failure mode",
-			      dm_device_name(pool->pool_md));
+			notify_of_pool_mode_change(pool, "failure");
 		dm_pool_metadata_read_only(pool->pmd);
 		pool->process_bio = process_bio_fail;
 		pool->process_discard = process_bio_fail;
 		pool->process_prepared_mapping = process_prepared_mapping_fail;
 		pool->process_prepared_discard = process_prepared_discard_fail;
+
+		error_retry_list(pool);
 		break;
 
 	case PM_READ_ONLY:
 		if (old_mode != new_mode)
-			DMERR("%s: switching pool to read-only mode",
-			      dm_device_name(pool->pool_md));
-		r = dm_pool_abort_metadata(pool->pmd);
-		if (r) {
-			DMERR("%s: aborting transaction failed",
-			      dm_device_name(pool->pool_md));
-			new_mode = PM_FAIL;
-			set_pool_mode(pool, new_mode);
-		} else {
-			dm_pool_metadata_read_only(pool->pmd);
-			pool->process_bio = process_bio_read_only;
-			pool->process_discard = process_discard;
-			pool->process_prepared_mapping = process_prepared_mapping_fail;
-			pool->process_prepared_discard = process_prepared_discard_passdown;
-		}
+			notify_of_pool_mode_change(pool, "read-only");
+		dm_pool_metadata_read_only(pool->pmd);
+		pool->process_bio = process_bio_read_only;
+		pool->process_discard = process_bio_success;
+		pool->process_prepared_mapping = process_prepared_mapping_fail;
+		pool->process_prepared_discard = process_prepared_discard_passdown;
+
+		error_retry_list(pool);
+		break;
+
+	case PM_OUT_OF_DATA_SPACE:
+		/*
+		 * Ideally we'd never hit this state; the low water mark
+		 * would trigger userland to extend the pool before we
+		 * completely run out of data space.  However, many small
+		 * IOs to unprovisioned space can consume data space at an
+		 * alarming rate.  Adjust your low water mark if you're
+		 * frequently seeing this mode.
+		 */
+		if (old_mode != new_mode)
+			notify_of_pool_mode_change(pool, "out-of-data-space");
+		pool->process_bio = process_bio_read_only;
+		pool->process_discard = process_discard;
+		pool->process_prepared_mapping = process_prepared_mapping;
+		pool->process_prepared_discard = process_prepared_discard_passdown;
 		break;
 
 	case PM_WRITE:
 		if (old_mode != new_mode)
-			DMINFO("%s: switching pool to write mode",
-			       dm_device_name(pool->pool_md));
+			notify_of_pool_mode_change(pool, "write");
 		dm_pool_metadata_read_write(pool->pmd);
 		pool->process_bio = process_bio;
 		pool->process_discard = process_discard;
@@ -1447,32 +1588,35 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 	}
 
 	pool->pf.mode = new_mode;
+	/*
+	 * The pool mode may have changed, sync it so bind_control_target()
+	 * doesn't cause an unexpected mode transition on resume.
+	 */
+	pt->adjusted_pf.mode = new_mode;
 }
 
-/*
- * Rather than calling set_pool_mode directly, use these which describe the
- * reason for mode degradation.
- */
-static void out_of_data_space(struct pool *pool)
+static void abort_transaction(struct pool *pool)
 {
-	DMERR_LIMIT("%s: no free data space available.",
-		    dm_device_name(pool->pool_md));
-	set_pool_mode(pool, PM_READ_ONLY);
+	const char *dev_name = dm_device_name(pool->pool_md);
+
+	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
+	if (dm_pool_abort_metadata(pool->pmd)) {
+		DMERR("%s: failed to abort metadata transaction", dev_name);
+		set_pool_mode(pool, PM_FAIL);
+	}
+
+	if (dm_pool_metadata_set_needs_check(pool->pmd)) {
+		DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
+		set_pool_mode(pool, PM_FAIL);
+	}
 }
 
 static void metadata_operation_failed(struct pool *pool, const char *op, int r)
 {
-	dm_block_t free_blocks;
-
 	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
 		    dm_device_name(pool->pool_md), op, r);
 
-	if (r == -ENOSPC &&
-	    !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
-	    !free_blocks)
-		DMERR_LIMIT("%s: no free metadata space available.",
-			    dm_device_name(pool->pool_md));
-
+	abort_transaction(pool);
 	set_pool_mode(pool, PM_READ_ONLY);
 }
 
@@ -1523,6 +1667,11 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
 
 	thin_hook_bio(tc, bio);
 
+	if (tc->requeue_mode) {
+		bio_endio(bio, DM_ENDIO_REQUEUE);
+		return DM_MAPIO_SUBMITTED;
+	}
+
 	if (get_pool_mode(tc->pool) == PM_FAIL) {
 		bio_io_error(bio);
 		return DM_MAPIO_SUBMITTED;
@@ -1686,7 +1835,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
 	/*
 	 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
 	 */
-	enum pool_mode old_mode = pool->pf.mode;
+	enum pool_mode old_mode = get_pool_mode(pool);
 	enum pool_mode new_mode = pt->adjusted_pf.mode;
 
 	/*
@@ -1700,16 +1849,6 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
 	pool->pf = pt->adjusted_pf;
 	pool->low_water_blocks = pt->low_water_blocks;
 
-	/*
-	 * If we were in PM_FAIL mode, rollback of metadata failed.  We're
-	 * not going to recover without a thin_repair.  So we never let the
-	 * pool move out of the old mode.  On the other hand a PM_READ_ONLY
-	 * may have been due to a lack of metadata or data space, and may
-	 * now work (ie. if the underlying devices have been resized).
-	 */
-	if (old_mode == PM_FAIL)
-		new_mode = old_mode;
-
 	set_pool_mode(pool, new_mode);
 
 	return 0;
@@ -1999,16 +2138,27 @@ static void metadata_low_callback(void *context)
 	dm_table_event(pool->ti->table);
 }
 
-static sector_t get_metadata_dev_size(struct block_device *bdev)
+static sector_t get_dev_size(struct block_device *bdev)
+{
+	return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+}
+
+static void warn_if_metadata_device_too_big(struct block_device *bdev)
 {
-	sector_t metadata_dev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+	sector_t metadata_dev_size = get_dev_size(bdev);
 	char buffer[BDEVNAME_SIZE];
 
-	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) {
+	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
 		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
 		       bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS);
-		metadata_dev_size = THIN_METADATA_MAX_SECTORS_WARNING;
-	}
+}
+
+static sector_t get_metadata_dev_size(struct block_device *bdev)
+{
+	sector_t metadata_dev_size = get_dev_size(bdev);
+
+	if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
+		metadata_dev_size = THIN_METADATA_MAX_SECTORS;
 
 	return metadata_dev_size;
 }
@@ -2017,7 +2167,7 @@ static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
 {
 	sector_t metadata_dev_size = get_metadata_dev_size(bdev);
 
-	sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
+	sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
 
 	return metadata_dev_size;
 }
@@ -2095,12 +2245,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		ti->error = "Error opening metadata block device";
 		goto out_unlock;
 	}
-
-	/*
-	 * Run for the side-effect of possibly issuing a warning if the
-	 * device is too big.
-	 */
-	(void) get_metadata_dev_size(metadata_dev->bdev);
+	warn_if_metadata_device_too_big(metadata_dev->bdev);
 
 	r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
 	if (r) {
@@ -2246,6 +2391,12 @@ static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
 		return -EINVAL;
 
 	} else if (data_size > sb_data_size) {
+		if (dm_pool_metadata_needs_check(pool->pmd)) {
+			DMERR("%s: unable to grow the data device until repaired.",
+			      dm_device_name(pool->pool_md));
+			return 0;
+		}
+
 		if (sb_data_size)
 			DMINFO("%s: growing the data device from %llu to %llu blocks",
 			       dm_device_name(pool->pool_md),
@@ -2287,6 +2438,13 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
 		return -EINVAL;
 
 	} else if (metadata_dev_size > sb_metadata_dev_size) {
+		if (dm_pool_metadata_needs_check(pool->pmd)) {
+			DMERR("%s: unable to grow the metadata device until repaired.",
+			      dm_device_name(pool->pool_md));
+			return 0;
+		}
+
+		warn_if_metadata_device_too_big(pool->md_dev);
 		DMINFO("%s: growing the metadata device from %llu to %llu blocks",
 		       dm_device_name(pool->pool_md),
 		       sb_metadata_dev_size, metadata_dev_size);
@@ -2673,7 +2831,9 @@ static void pool_status(struct dm_target *ti, status_type_t type,
 		else
 			DMEMIT("- ");
 
-		if (pool->pf.mode == PM_READ_ONLY)
+		if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
+			DMEMIT("out_of_data_space ");
+		else if (pool->pf.mode == PM_READ_ONLY)
 			DMEMIT("ro ");
 		else
 			DMEMIT("rw ");
@@ -2787,7 +2947,7 @@ static struct target_type pool_target = {
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
 		    DM_TARGET_IMMUTABLE,
-	.version = {1, 10, 0},
+	.version = {1, 11, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
@@ -2894,6 +3054,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	if (get_pool_mode(tc->pool) == PM_FAIL) {
 		ti->error = "Couldn't open thin device, Pool is in fail mode";
+		r = -EINVAL;
 		goto bad_thin_open;
 	}
 
@@ -2905,7 +3066,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
 	if (r)
-		goto bad_thin_open;
+		goto bad_target_max_io_len;
 
 	ti->num_flush_bios = 1;
 	ti->flush_supported = true;
@@ -2926,6 +3087,8 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	return 0;
 
+bad_target_max_io_len:
+	dm_pool_close_thin_device(tc->td);
 bad_thin_open:
 	__pool_dec(tc->pool);
 bad_pool_lookup:
@@ -2986,10 +3149,23 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
 	return 0;
 }
 
-static void thin_postsuspend(struct dm_target *ti)
+static void thin_presuspend(struct dm_target *ti)
 {
+	struct thin_c *tc = ti->private;
+
 	if (dm_noflush_suspending(ti))
-		requeue_io((struct thin_c *)ti->private);
+		noflush_work(tc, do_noflush_start);
+}
+
+static void thin_postsuspend(struct dm_target *ti)
+{
+	struct thin_c *tc = ti->private;
+
+	/*
+	 * The dm_noflush_suspending flag has been cleared by now, so
+	 * unfortunately we must always run this.
+	 */
+	noflush_work(tc, do_noflush_stop);
 }
 
 /*
@@ -3074,12 +3250,13 @@ static int thin_iterate_devices(struct dm_target *ti,
 
 static struct target_type thin_target = {
 	.name = "thin",
-	.version = {1, 10, 0},
+	.version = {1, 11, 0},
 	.module	= THIS_MODULE,
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,
 	.map = thin_map,
 	.end_io = thin_endio,
+	.presuspend = thin_presuspend,
 	.postsuspend = thin_postsuspend,
 	.status = thin_status,
 	.iterate_devices = thin_iterate_devices,
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index 19b268795415..0c2dec7aec20 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -6,3 +6,13 @@ config DM_PERSISTENT_DATA
        ---help---
 	 Library providing immutable on-disk data structure support for
 	 device-mapper targets such as the thin provisioning target.
+
+config DM_DEBUG_BLOCK_STACK_TRACING
+       boolean "Keep stack trace of persistent data block lock holders"
+       depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
+       select STACKTRACE
+       ---help---
+	 Enable this for messages that may help debug problems with the
+	 block manager locking used by thin provisioning and caching.
+
+	 If unsure, say N.
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 536782e3bcb7..786b689bdfc7 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -91,6 +91,69 @@ struct block_op {
 	dm_block_t block;
 };
 
+struct bop_ring_buffer {
+	unsigned begin;
+	unsigned end;
+	struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1];
+};
+
+static void brb_init(struct bop_ring_buffer *brb)
+{
+	brb->begin = 0;
+	brb->end = 0;
+}
+
+static bool brb_empty(struct bop_ring_buffer *brb)
+{
+	return brb->begin == brb->end;
+}
+
+static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
+{
+	unsigned r = old + 1;
+	return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r;
+}
+
+static int brb_push(struct bop_ring_buffer *brb,
+		    enum block_op_type type, dm_block_t b)
+{
+	struct block_op *bop;
+	unsigned next = brb_next(brb, brb->end);
+
+	/*
+	 * We don't allow the last bop to be filled, this way we can
+	 * differentiate between full and empty.
+	 */
+	if (next == brb->begin)
+		return -ENOMEM;
+
+	bop = brb->bops + brb->end;
+	bop->type = type;
+	bop->block = b;
+
+	brb->end = next;
+
+	return 0;
+}
+
+static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
+{
+	struct block_op *bop;
+
+	if (brb_empty(brb))
+		return -ENODATA;
+
+	bop = brb->bops + brb->begin;
+	result->type = bop->type;
+	result->block = bop->block;
+
+	brb->begin = brb_next(brb, brb->begin);
+
+	return 0;
+}
+
+/*----------------------------------------------------------------*/
+
 struct sm_metadata {
 	struct dm_space_map sm;
 
@@ -101,25 +164,20 @@ struct sm_metadata {
 
 	unsigned recursion_count;
 	unsigned allocated_this_transaction;
-	unsigned nr_uncommitted;
-	struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS];
+	struct bop_ring_buffer uncommitted;
 
 	struct threshold threshold;
 };
 
 static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
 {
-	struct block_op *op;
+	int r = brb_push(&smm->uncommitted, type, b);
 
-	if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) {
+	if (r) {
 		DMERR("too many recursive allocations");
 		return -ENOMEM;
 	}
 
-	op = smm->uncommitted + smm->nr_uncommitted++;
-	op->type = type;
-	op->block = b;
-
 	return 0;
 }
 
@@ -158,11 +216,17 @@ static int out(struct sm_metadata *smm)
 		return -ENOMEM;
 	}
 
-	if (smm->recursion_count == 1 && smm->nr_uncommitted) {
-		while (smm->nr_uncommitted && !r) {
-			smm->nr_uncommitted--;
-			r = commit_bop(smm, smm->uncommitted +
-				       smm->nr_uncommitted);
+	if (smm->recursion_count == 1) {
+		while (!brb_empty(&smm->uncommitted)) {
+			struct block_op bop;
+
+			r = brb_pop(&smm->uncommitted, &bop);
+			if (r) {
+				DMERR("bug in bop ring buffer");
+				break;
+			}
+
+			r = commit_bop(smm, &bop);
 			if (r)
 				break;
 		}
@@ -217,7 +281,8 @@ static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
 static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
 				 uint32_t *result)
 {
-	int r, i;
+	int r;
+	unsigned i;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 	unsigned adjustment = 0;
 
@@ -225,8 +290,10 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
 	 * We may have some uncommitted adjustments to add.  This list
 	 * should always be really short.
 	 */
-	for (i = 0; i < smm->nr_uncommitted; i++) {
-		struct block_op *op = smm->uncommitted + i;
+	for (i = smm->uncommitted.begin;
+	     i != smm->uncommitted.end;
+	     i = brb_next(&smm->uncommitted, i)) {
+		struct block_op *op = smm->uncommitted.bops + i;
 
 		if (op->block != b)
 			continue;
@@ -254,7 +321,8 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
 static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
 					      dm_block_t b, int *result)
 {
-	int r, i, adjustment = 0;
+	int r, adjustment = 0;
+	unsigned i;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 	uint32_t rc;
 
@@ -262,8 +330,11 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
 	 * We may have some uncommitted adjustments to add.  This list
 	 * should always be really short.
 	 */
-	for (i = 0; i < smm->nr_uncommitted; i++) {
-		struct block_op *op = smm->uncommitted + i;
+	for (i = smm->uncommitted.begin;
+	     i != smm->uncommitted.end;
+	     i = brb_next(&smm->uncommitted, i)) {
+
+		struct block_op *op = smm->uncommitted.bops + i;
 
 		if (op->block != b)
 			continue;
@@ -671,7 +742,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
 	smm->begin = superblock + 1;
 	smm->recursion_count = 0;
 	smm->allocated_this_transaction = 0;
-	smm->nr_uncommitted = 0;
+	brb_init(&smm->uncommitted);
 	threshold_init(&smm->threshold);
 
 	memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
@@ -680,6 +751,8 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
 	if (r)
 		return r;
 
+	if (nr_blocks > DM_SM_METADATA_MAX_BLOCKS)
+		nr_blocks = DM_SM_METADATA_MAX_BLOCKS;
 	r = sm_ll_extend(&smm->ll, nr_blocks);
 	if (r)
 		return r;
@@ -713,7 +786,7 @@ int dm_sm_metadata_open(struct dm_space_map *sm,
 	smm->begin = 0;
 	smm->recursion_count = 0;
 	smm->allocated_this_transaction = 0;
-	smm->nr_uncommitted = 0;
+	brb_init(&smm->uncommitted);
 	threshold_init(&smm->threshold);
 
 	memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.h b/drivers/md/persistent-data/dm-space-map-metadata.h
index 39bba0801cf2..64df923974d8 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.h
+++ b/drivers/md/persistent-data/dm-space-map-metadata.h
@@ -9,6 +9,17 @@
 
 #include "dm-transaction-manager.h"
 
+#define DM_SM_METADATA_BLOCK_SIZE (4096 >> SECTOR_SHIFT)
+
+/*
+ * The metadata device is currently limited in size.
+ *
+ * We have one block of index, which can hold 255 index entries.  Each
+ * index entry contains allocation info about ~16k metadata blocks.
+ */
+#define DM_SM_METADATA_MAX_BLOCKS (255 * ((1 << 14) - 64))
+#define DM_SM_METADATA_MAX_SECTORS (DM_SM_METADATA_MAX_BLOCKS * DM_SM_METADATA_BLOCK_SIZE)
+
 /*
  * Unfortunately we have to use two-phase construction due to the cycle
  * between the tm and sm.