 Documentation/device-mapper/cache.txt               |  11
 Documentation/device-mapper/thin-provisioning.txt   |  34
 drivers/md/Kconfig                                  |  10
 drivers/md/dm-cache-policy-mq.c                     |   4
 drivers/md/dm-snap-persistent.c                     |   3
 drivers/md/dm-thin-metadata.c                       |  37
 drivers/md/dm-thin-metadata.h                       |  11
 drivers/md/dm-thin.c                                | 304
 drivers/md/persistent-data/Kconfig                  |  10
 drivers/md/persistent-data/dm-space-map-metadata.c  | 113
 10 files changed, 425 insertions(+), 112 deletions(-)
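The dm-thin.c hunks below replace the old requeue-on-postsuspend behaviour with a synchronous hand-off: thin_presuspend() queues a stack-allocated noflush_work on the pool's workqueue and then sleeps until the worker thread has set requeue_mode and requeued the in-flight bios. The sketch below is a minimal userspace model of that pattern, with a pthread mutex/condvar pair standing in for the kernel's workqueue, atomic_t and wait-queue APIs; the names mirror the patch, but this is an illustration, not the kernel implementation.

/*
 * Userspace model of the noflush_work() hand-off added to dm-thin.c
 * below.  Build with: cc sketch.c -lpthread
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct noflush_work {
	void (*fn)(struct noflush_work *w);
	bool complete;			/* atomic_t in the kernel version */
	pthread_mutex_t lock;
	pthread_cond_t wait;		/* wait_queue_head_t stand-in */
};

static void complete_noflush_work(struct noflush_work *w)
{
	pthread_mutex_lock(&w->lock);
	w->complete = true;		/* atomic_set(&w->complete, 1) */
	pthread_cond_signal(&w->wait);	/* wake_up(&w->wait) */
	pthread_mutex_unlock(&w->lock);
}

static void do_noflush_start(struct noflush_work *w)
{
	/* Kernel version: w->tc->requeue_mode = true; requeue_io(w->tc); */
	puts("worker: requeue_mode = true; requeue_io()");
	complete_noflush_work(w);
}

/* The "workqueue": runs the function from a different thread. */
static void *worker_thread(void *arg)
{
	struct noflush_work *w = arg;
	w->fn(w);
	return NULL;
}

/* Kernel analogue: INIT_WORK + queue_work + wait_event. */
static void noflush_work(void (*fn)(struct noflush_work *))
{
	pthread_t worker;
	struct noflush_work w = {
		.fn = fn,
		.complete = false,
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.wait = PTHREAD_COND_INITIALIZER,
	};

	pthread_create(&worker, NULL, worker_thread, &w);

	/* wait_event(w.wait, atomic_read(&w.complete)) */
	pthread_mutex_lock(&w.lock);
	while (!w.complete)
		pthread_cond_wait(&w.wait, &w.lock);
	pthread_mutex_unlock(&w.lock);

	pthread_join(worker, NULL);
	/* Safe to let 'w' go out of scope: the worker has finished. */
}

int main(void)
{
	noflush_work(do_noflush_start);
	puts("presuspend: all in-flight bios requeued");
	return 0;
}

The stack allocation works because the caller blocks until complete is set, so the work item cannot outlive the frame it lives in and no allocation/free pairing is needed.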
diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt
index e6b72d355151..68c0f517c60e 100644
--- a/Documentation/device-mapper/cache.txt
+++ b/Documentation/device-mapper/cache.txt
@@ -124,12 +124,11 @@ the default being 204800 sectors (or 100MB).
 
 Updating on-disk metadata
 -------------------------
-On-disk metadata is committed every time a REQ_SYNC or REQ_FUA bio is
-written.  If no such requests are made then commits will occur every
-second.  This means the cache behaves like a physical disk that has a
-write cache (the same is true of the thin-provisioning target).  If
-power is lost you may lose some recent writes.  The metadata should
-always be consistent in spite of any crash.
+On-disk metadata is committed every time a FLUSH or FUA bio is written.
+If no such requests are made then commits will occur every second.  This
+means the cache behaves like a physical disk that has a volatile write
+cache.  If power is lost you may lose some recent writes.  The metadata
+should always be consistent in spite of any crash.
 
 The 'dirty' state for a cache block changes far too frequently for us
 to keep updating it on the fly.  So we treat it as a hint.  In normal
diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt
index 8a7a3d46e0da..05a27e9442bd 100644
--- a/Documentation/device-mapper/thin-provisioning.txt
+++ b/Documentation/device-mapper/thin-provisioning.txt
@@ -116,6 +116,35 @@ Resuming a device with a new table itself triggers an event so the
 userspace daemon can use this to detect a situation where a new table
 already exceeds the threshold.
 
+A low water mark for the metadata device is maintained in the kernel and
+will trigger a dm event if free space on the metadata device drops below
+it.
+
+Updating on-disk metadata
+-------------------------
+
+On-disk metadata is committed every time a FLUSH or FUA bio is written.
+If no such requests are made then commits will occur every second.  This
+means the thin-provisioning target behaves like a physical disk that has
+a volatile write cache.  If power is lost you may lose some recent
+writes.  The metadata should always be consistent in spite of any crash.
+
+If data space is exhausted the pool will either error or queue IO
+according to the configuration (see: error_if_no_space).  If metadata
+space is exhausted or a metadata operation fails: the pool will error IO
+until the pool is taken offline and repair is performed to 1) fix any
+potential inconsistencies and 2) clear the flag that imposes repair.
+Once the pool's metadata device is repaired it may be resized, which
+will allow the pool to return to normal operation.  Note that if a pool
+is flagged as needing repair, the pool's data and metadata devices
+cannot be resized until repair is performed.  It should also be noted
+that when the pool's metadata space is exhausted the current metadata
+transaction is aborted.  Given that the pool will cache IO whose
+completion may have already been acknowledged to upper IO layers
+(e.g. filesystem) it is strongly suggested that consistency checks
+(e.g. fsck) be performed on those layers when repair of the pool is
+required.
+
 Thin provisioning
 -----------------
@@ -258,10 +287,9 @@ ii) Status
 	should register for the event and then check the target's status.
 
     held metadata root:
-	The location, in sectors, of the metadata root that has been
+	The location, in blocks, of the metadata root that has been
 	'held' for userspace read access.  '-' indicates there is no
-	held root.  This feature is not yet implemented so '-' is
-	always returned.
+	held root.
 
     discard_passdown|no_discard_passdown
 	Whether or not discards are actually being passed down to the
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 9a06fe883766..95ad936e6048 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -254,16 +254,6 @@ config DM_THIN_PROVISIONING
        ---help---
          Provides thin provisioning and snapshots that share a data store.
 
-config DM_DEBUG_BLOCK_STACK_TRACING
-	boolean "Keep stack trace of persistent data block lock holders"
-	depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
-	select STACKTRACE
-	---help---
-	  Enable this for messages that may help debug problems with the
-	  block manager locking used by thin provisioning and caching.
-
-	  If unsure, say N.
-
 config DM_CACHE
        tristate "Cache target (EXPERIMENTAL)"
        depends on BLK_DEV_DM
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 1e018e986610..0e385e40909e 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -872,7 +872,7 @@ static void mq_destroy(struct dm_cache_policy *p)
 {
 	struct mq_policy *mq = to_mq_policy(p);
 
-	kfree(mq->table);
+	vfree(mq->table);
 	epool_exit(&mq->cache_pool);
 	epool_exit(&mq->pre_cache_pool);
 	kfree(mq);
@@ -1245,7 +1245,7 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
 	mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
 	mq->hash_bits = ffs(mq->nr_buckets) - 1;
-	mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL);
+	mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets);
 	if (!mq->table)
 		goto bad_alloc_table;
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index afc3d017de4c..d6e88178d22c 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -546,6 +546,9 @@ static int read_exceptions(struct pstore *ps,
 		r = insert_exceptions(ps, area, callback, callback_context,
 				      &full);
+		if (!full)
+			memcpy(ps->area, area, ps->store->chunk_size << SECTOR_SHIFT);
+
 		dm_bufio_release(bp);
 
 		dm_bufio_forget(client, chunk);
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index baa87ff12816..fb9efc829182 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -76,7 +76,7 @@
 
 #define THIN_SUPERBLOCK_MAGIC 27022010
 #define THIN_SUPERBLOCK_LOCATION 0
-#define THIN_VERSION 1
+#define THIN_VERSION 2
 #define THIN_METADATA_CACHE_SIZE 64
 #define SECTOR_TO_BLOCK_SHIFT 3
 
@@ -1755,3 +1755,38 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 
 	return r;
 }
+
+int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
+{
+	int r;
+	struct dm_block *sblock;
+	struct thin_disk_superblock *disk_super;
+
+	down_write(&pmd->root_lock);
+	pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
+
+	r = superblock_lock(pmd, &sblock);
+	if (r) {
+		DMERR("couldn't read superblock");
+		goto out;
+	}
+
+	disk_super = dm_block_data(sblock);
+	disk_super->flags = cpu_to_le32(pmd->flags);
+
+	dm_bm_unlock(sblock);
+out:
+	up_write(&pmd->root_lock);
+	return r;
+}
+
+bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
+{
+	bool needs_check;
+
+	down_read(&pmd->root_lock);
+	needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
+	up_read(&pmd->root_lock);
+
+	return needs_check;
+}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 82ea384d36ff..e3c857db195a 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -25,6 +25,11 @@
 
 /*----------------------------------------------------------------*/
 
+/*
+ * Thin metadata superblock flags.
+ */
+#define THIN_METADATA_NEEDS_CHECK_FLAG (1 << 0)
+
 struct dm_pool_metadata;
 struct dm_thin_device;
 
@@ -202,6 +207,12 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 					dm_sm_threshold_fn fn,
 					void *context);
 
+/*
+ * Updates the superblock immediately.
+ */
+int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd);
+bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd);
+
 /*----------------------------------------------------------------*/
 
 #endif
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 7e84baccf0ad..be70d38745f7 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -130,10 +130,11 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
 struct dm_thin_new_mapping;
 
 /*
- * The pool runs in 3 modes.  Ordered in degraded order for comparisons.
+ * The pool runs in 4 modes.  Ordered in degraded order for comparisons.
  */
 enum pool_mode {
 	PM_WRITE,		/* metadata may be changed */
+	PM_OUT_OF_DATA_SPACE,	/* metadata may be changed, though data may not be allocated */
 	PM_READ_ONLY,		/* metadata may not be changed */
 	PM_FAIL,		/* all I/O fails */
 };
@@ -198,7 +199,6 @@ struct pool {
 };
 
 static enum pool_mode get_pool_mode(struct pool *pool);
-static void out_of_data_space(struct pool *pool);
 static void metadata_operation_failed(struct pool *pool, const char *op, int r);
 
 /*
@@ -226,6 +226,7 @@ struct thin_c {
 	struct pool *pool;
 	struct dm_thin_device *td;
+	bool requeue_mode:1;
 };
 
 /*----------------------------------------------------------------*/
@@ -369,14 +370,18 @@ struct dm_thin_endio_hook {
 	struct dm_thin_new_mapping *overwrite_mapping;
 };
 
-static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
+static void requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 {
 	struct bio *bio;
 	struct bio_list bios;
+	unsigned long flags;
 
 	bio_list_init(&bios);
+
+	spin_lock_irqsave(&tc->pool->lock, flags);
 	bio_list_merge(&bios, master);
 	bio_list_init(master);
+	spin_unlock_irqrestore(&tc->pool->lock, flags);
 
 	while ((bio = bio_list_pop(&bios))) {
 		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
@@ -391,12 +396,26 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 static void requeue_io(struct thin_c *tc)
 {
 	struct pool *pool = tc->pool;
+
+	requeue_bio_list(tc, &pool->deferred_bios);
+	requeue_bio_list(tc, &pool->retry_on_resume_list);
+}
+
+static void error_retry_list(struct pool *pool)
+{
+	struct bio *bio;
 	unsigned long flags;
+	struct bio_list bios;
+
+	bio_list_init(&bios);
 
 	spin_lock_irqsave(&pool->lock, flags);
-	__requeue_bio_list(tc, &pool->deferred_bios);
-	__requeue_bio_list(tc, &pool->retry_on_resume_list);
+	bio_list_merge(&bios, &pool->retry_on_resume_list);
+	bio_list_init(&pool->retry_on_resume_list);
 	spin_unlock_irqrestore(&pool->lock, flags);
+
+	while ((bio = bio_list_pop(&bios)))
+		bio_io_error(bio);
 }
 
 /*
@@ -925,13 +944,15 @@ static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
 	}
 }
 
+static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
+
 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 {
 	int r;
 	dm_block_t free_blocks;
 	struct pool *pool = tc->pool;
 
-	if (get_pool_mode(pool) != PM_WRITE)
+	if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
 		return -EINVAL;
 
 	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
@@ -958,7 +979,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 		}
 
 		if (!free_blocks) {
-			out_of_data_space(pool);
+			set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
 			return -ENOSPC;
 		}
 	}
@@ -988,15 +1009,32 @@ static void retry_on_resume(struct bio *bio)
 	spin_unlock_irqrestore(&pool->lock, flags);
 }
 
-static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
+static bool should_error_unserviceable_bio(struct pool *pool)
 {
-	/*
-	 * When pool is read-only, no cell locking is needed because
-	 * nothing is changing.
-	 */
-	WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY);
+	enum pool_mode m = get_pool_mode(pool);
+
+	switch (m) {
+	case PM_WRITE:
+		/* Shouldn't get here */
+		DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
+		return true;
+
+	case PM_OUT_OF_DATA_SPACE:
+		return pool->pf.error_if_no_space;
 
-	if (pool->pf.error_if_no_space)
+	case PM_READ_ONLY:
+	case PM_FAIL:
+		return true;
+	default:
+		/* Shouldn't get here */
+		DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
+		return true;
+	}
+}
+
+static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
+{
+	if (should_error_unserviceable_bio(pool))
 		bio_io_error(bio);
 	else
 		retry_on_resume(bio);
@@ -1007,11 +1045,20 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
 	struct bio *bio;
 	struct bio_list bios;
 
+	if (should_error_unserviceable_bio(pool)) {
+		cell_error(pool, cell);
+		return;
+	}
+
 	bio_list_init(&bios);
 	cell_release(pool, cell, &bios);
 
-	while ((bio = bio_list_pop(&bios)))
-		handle_unserviceable_bio(pool, bio);
+	if (should_error_unserviceable_bio(pool))
+		while ((bio = bio_list_pop(&bios)))
+			bio_io_error(bio);
+	else
+		while ((bio = bio_list_pop(&bios)))
+			retry_on_resume(bio);
 }
 
 static void process_discard(struct thin_c *tc, struct bio *bio)
@@ -1296,6 +1343,11 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
 	}
 }
 
+static void process_bio_success(struct thin_c *tc, struct bio *bio)
+{
+	bio_endio(bio, 0);
+}
+
 static void process_bio_fail(struct thin_c *tc, struct bio *bio)
 {
 	bio_io_error(bio);
@@ -1328,6 +1380,11 @@ static void process_deferred_bios(struct pool *pool)
 		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 		struct thin_c *tc = h->tc;
 
+		if (tc->requeue_mode) {
+			bio_endio(bio, DM_ENDIO_REQUEUE);
+			continue;
+		}
+
 		/*
 		 * If we've got no free new_mapping structs, and processing
 		 * this bio might require one, we pause until there are some
@@ -1394,51 +1451,134 @@ static void do_waker(struct work_struct *ws)
 
 /*----------------------------------------------------------------*/
 
+struct noflush_work {
+	struct work_struct worker;
+	struct thin_c *tc;
+
+	atomic_t complete;
+	wait_queue_head_t wait;
+};
+
+static void complete_noflush_work(struct noflush_work *w)
+{
+	atomic_set(&w->complete, 1);
+	wake_up(&w->wait);
+}
+
+static void do_noflush_start(struct work_struct *ws)
+{
+	struct noflush_work *w = container_of(ws, struct noflush_work, worker);
+	w->tc->requeue_mode = true;
+	requeue_io(w->tc);
+	complete_noflush_work(w);
+}
+
+static void do_noflush_stop(struct work_struct *ws)
+{
+	struct noflush_work *w = container_of(ws, struct noflush_work, worker);
+	w->tc->requeue_mode = false;
+	complete_noflush_work(w);
+}
+
+static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
+{
+	struct noflush_work w;
+
+	INIT_WORK(&w.worker, fn);
+	w.tc = tc;
+	atomic_set(&w.complete, 0);
+	init_waitqueue_head(&w.wait);
+
+	queue_work(tc->pool->wq, &w.worker);
+
+	wait_event(w.wait, atomic_read(&w.complete));
+}
+
+/*----------------------------------------------------------------*/
+
 static enum pool_mode get_pool_mode(struct pool *pool)
 {
 	return pool->pf.mode;
 }
 
+static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
+{
+	dm_table_event(pool->ti->table);
+	DMINFO("%s: switching pool to %s mode",
+	       dm_device_name(pool->pool_md), new_mode);
+}
+
 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 {
-	int r;
-	enum pool_mode old_mode = pool->pf.mode;
+	struct pool_c *pt = pool->ti->private;
+	bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
+	enum pool_mode old_mode = get_pool_mode(pool);
+
+	/*
+	 * Never allow the pool to transition to PM_WRITE mode if user
+	 * intervention is required to verify metadata and data consistency.
+	 */
+	if (new_mode == PM_WRITE && needs_check) {
+		DMERR("%s: unable to switch pool to write mode until repaired.",
+		      dm_device_name(pool->pool_md));
+		if (old_mode != new_mode)
+			new_mode = old_mode;
+		else
+			new_mode = PM_READ_ONLY;
+	}
+	/*
+	 * If we were in PM_FAIL mode, rollback of metadata failed.  We're
+	 * not going to recover without a thin_repair.	So we never let the
+	 * pool move out of the old mode.
+	 */
+	if (old_mode == PM_FAIL)
+		new_mode = old_mode;
 
 	switch (new_mode) {
 	case PM_FAIL:
 		if (old_mode != new_mode)
-			DMERR("%s: switching pool to failure mode",
-			      dm_device_name(pool->pool_md));
+			notify_of_pool_mode_change(pool, "failure");
 		dm_pool_metadata_read_only(pool->pmd);
 		pool->process_bio = process_bio_fail;
 		pool->process_discard = process_bio_fail;
 		pool->process_prepared_mapping = process_prepared_mapping_fail;
 		pool->process_prepared_discard = process_prepared_discard_fail;
+
+		error_retry_list(pool);
 		break;
 
 	case PM_READ_ONLY:
 		if (old_mode != new_mode)
-			DMERR("%s: switching pool to read-only mode",
-			      dm_device_name(pool->pool_md));
-		r = dm_pool_abort_metadata(pool->pmd);
-		if (r) {
-			DMERR("%s: aborting transaction failed",
-			      dm_device_name(pool->pool_md));
-			new_mode = PM_FAIL;
-			set_pool_mode(pool, new_mode);
-		} else {
-			dm_pool_metadata_read_only(pool->pmd);
-			pool->process_bio = process_bio_read_only;
-			pool->process_discard = process_discard;
-			pool->process_prepared_mapping = process_prepared_mapping_fail;
-			pool->process_prepared_discard = process_prepared_discard_passdown;
-		}
+			notify_of_pool_mode_change(pool, "read-only");
+		dm_pool_metadata_read_only(pool->pmd);
+		pool->process_bio = process_bio_read_only;
+		pool->process_discard = process_bio_success;
+		pool->process_prepared_mapping = process_prepared_mapping_fail;
+		pool->process_prepared_discard = process_prepared_discard_passdown;
+
+		error_retry_list(pool);
+		break;
+
+	case PM_OUT_OF_DATA_SPACE:
+		/*
+		 * Ideally we'd never hit this state; the low water mark
+		 * would trigger userland to extend the pool before we
+		 * completely run out of data space.  However, many small
+		 * IOs to unprovisioned space can consume data space at an
+		 * alarming rate.  Adjust your low water mark if you're
+		 * frequently seeing this mode.
+		 */
+		if (old_mode != new_mode)
+			notify_of_pool_mode_change(pool, "out-of-data-space");
+		pool->process_bio = process_bio_read_only;
+		pool->process_discard = process_discard;
+		pool->process_prepared_mapping = process_prepared_mapping;
+		pool->process_prepared_discard = process_prepared_discard_passdown;
 		break;
 
 	case PM_WRITE:
 		if (old_mode != new_mode)
-			DMINFO("%s: switching pool to write mode",
-			       dm_device_name(pool->pool_md));
+			notify_of_pool_mode_change(pool, "write");
 		dm_pool_metadata_read_write(pool->pmd);
 		pool->process_bio = process_bio;
 		pool->process_discard = process_discard;
@@ -1448,32 +1588,35 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 	}
 
 	pool->pf.mode = new_mode;
+	/*
+	 * The pool mode may have changed, sync it so bind_control_target()
+	 * doesn't cause an unexpected mode transition on resume.
+	 */
+	pt->adjusted_pf.mode = new_mode;
 }
 
-/*
- * Rather than calling set_pool_mode directly, use these which describe the
- * reason for mode degradation.
- */
-static void out_of_data_space(struct pool *pool)
+static void abort_transaction(struct pool *pool)
 {
-	DMERR_LIMIT("%s: no free data space available.",
-		    dm_device_name(pool->pool_md));
-	set_pool_mode(pool, PM_READ_ONLY);
+	const char *dev_name = dm_device_name(pool->pool_md);
+
+	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
+	if (dm_pool_abort_metadata(pool->pmd)) {
+		DMERR("%s: failed to abort metadata transaction", dev_name);
+		set_pool_mode(pool, PM_FAIL);
+	}
+
+	if (dm_pool_metadata_set_needs_check(pool->pmd)) {
+		DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
+		set_pool_mode(pool, PM_FAIL);
+	}
 }
 
 static void metadata_operation_failed(struct pool *pool, const char *op, int r)
 {
-	dm_block_t free_blocks;
-
 	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
 		    dm_device_name(pool->pool_md), op, r);
 
-	if (r == -ENOSPC &&
-	    !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
-	    !free_blocks)
-		DMERR_LIMIT("%s: no free metadata space available.",
-			    dm_device_name(pool->pool_md));
-
+	abort_transaction(pool);
 	set_pool_mode(pool, PM_READ_ONLY);
 }
 
@@ -1524,6 +1667,11 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
 
 	thin_hook_bio(tc, bio);
 
+	if (tc->requeue_mode) {
+		bio_endio(bio, DM_ENDIO_REQUEUE);
+		return DM_MAPIO_SUBMITTED;
+	}
+
 	if (get_pool_mode(tc->pool) == PM_FAIL) {
 		bio_io_error(bio);
 		return DM_MAPIO_SUBMITTED;
@@ -1687,7 +1835,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
 	/*
 	 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
 	 */
-	enum pool_mode old_mode = pool->pf.mode;
+	enum pool_mode old_mode = get_pool_mode(pool);
 	enum pool_mode new_mode = pt->adjusted_pf.mode;
 
 	/*
@@ -1701,16 +1849,6 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
 	pool->pf = pt->adjusted_pf;
 	pool->low_water_blocks = pt->low_water_blocks;
 
-	/*
-	 * If we were in PM_FAIL mode, rollback of metadata failed.  We're
-	 * not going to recover without a thin_repair.  So we never let the
-	 * pool move out of the old mode.  On the other hand a PM_READ_ONLY
-	 * may have been due to a lack of metadata or data space, and may
-	 * now work (ie. if the underlying devices have been resized).
-	 */
-	if (old_mode == PM_FAIL)
-		new_mode = old_mode;
-
 	set_pool_mode(pool, new_mode);
 
 	return 0;
@@ -2253,6 +2391,12 @@ static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
 		return -EINVAL;
 
 	} else if (data_size > sb_data_size) {
+		if (dm_pool_metadata_needs_check(pool->pmd)) {
+			DMERR("%s: unable to grow the data device until repaired.",
+			      dm_device_name(pool->pool_md));
+			return 0;
+		}
+
 		if (sb_data_size)
 			DMINFO("%s: growing the data device from %llu to %llu blocks",
 			       dm_device_name(pool->pool_md),
@@ -2294,6 +2438,12 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
 		return -EINVAL;
 
 	} else if (metadata_dev_size > sb_metadata_dev_size) {
+		if (dm_pool_metadata_needs_check(pool->pmd)) {
+			DMERR("%s: unable to grow the metadata device until repaired.",
+			      dm_device_name(pool->pool_md));
+			return 0;
+		}
+
 		warn_if_metadata_device_too_big(pool->md_dev);
 		DMINFO("%s: growing the metadata device from %llu to %llu blocks",
 		       dm_device_name(pool->pool_md),
@@ -2681,7 +2831,9 @@ static void pool_status(struct dm_target *ti, status_type_t type,
 		else
 			DMEMIT("- ");
 
-		if (pool->pf.mode == PM_READ_ONLY)
+		if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
+			DMEMIT("out_of_data_space ");
+		else if (pool->pf.mode == PM_READ_ONLY)
 			DMEMIT("ro ");
 		else
 			DMEMIT("rw ");
@@ -2795,7 +2947,7 @@ static struct target_type pool_target = {
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
 		    DM_TARGET_IMMUTABLE,
-	.version = {1, 10, 0},
+	.version = {1, 11, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
@@ -2997,10 +3149,23 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
 	return 0;
 }
 
-static void thin_postsuspend(struct dm_target *ti)
+static void thin_presuspend(struct dm_target *ti)
 {
+	struct thin_c *tc = ti->private;
+
 	if (dm_noflush_suspending(ti))
-		requeue_io((struct thin_c *)ti->private);
+		noflush_work(tc, do_noflush_start);
+}
+
+static void thin_postsuspend(struct dm_target *ti)
+{
+	struct thin_c *tc = ti->private;
+
+	/*
+	 * The dm_noflush_suspending flag has been cleared by now, so
+	 * unfortunately we must always run this.
+	 */
+	noflush_work(tc, do_noflush_stop);
 }
 
 /*
@@ -3085,12 +3250,13 @@ static int thin_iterate_devices(struct dm_target *ti,
 
 static struct target_type thin_target = {
 	.name = "thin",
-	.version = {1, 10, 0},
+	.version = {1, 11, 0},
 	.module	= THIS_MODULE,
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,
 	.map = thin_map,
 	.end_io = thin_endio,
+	.presuspend = thin_presuspend,
 	.postsuspend = thin_postsuspend,
 	.status = thin_status,
 	.iterate_devices = thin_iterate_devices,
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index 19b268795415..0c2dec7aec20 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -6,3 +6,13 @@ config DM_PERSISTENT_DATA
        ---help---
 	 Library providing immutable on-disk data structure support for
 	 device-mapper targets such as the thin provisioning target.
+
+config DM_DEBUG_BLOCK_STACK_TRACING
+       boolean "Keep stack trace of persistent data block lock holders"
+       depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA
+       select STACKTRACE
+       ---help---
+	 Enable this for messages that may help debug problems with the
+	 block manager locking used by thin provisioning and caching.
+
+	 If unsure, say N.
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index e9bdd462f4f5..786b689bdfc7 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -91,6 +91,69 @@ struct block_op {
 	dm_block_t block;
 };
 
+struct bop_ring_buffer {
+	unsigned begin;
+	unsigned end;
+	struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1];
+};
+
+static void brb_init(struct bop_ring_buffer *brb)
+{
+	brb->begin = 0;
+	brb->end = 0;
+}
+
+static bool brb_empty(struct bop_ring_buffer *brb)
+{
+	return brb->begin == brb->end;
+}
+
+static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
+{
+	unsigned r = old + 1;
+	return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r;
+}
+
+static int brb_push(struct bop_ring_buffer *brb,
+		    enum block_op_type type, dm_block_t b)
+{
+	struct block_op *bop;
+	unsigned next = brb_next(brb, brb->end);
+
+	/*
+	 * We don't allow the last bop to be filled, this way we can
+	 * differentiate between full and empty.
+	 */
+	if (next == brb->begin)
+		return -ENOMEM;
+
+	bop = brb->bops + brb->end;
+	bop->type = type;
+	bop->block = b;
+
+	brb->end = next;
+
+	return 0;
+}
+
+static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
+{
+	struct block_op *bop;
+
+	if (brb_empty(brb))
+		return -ENODATA;
+
+	bop = brb->bops + brb->begin;
+	result->type = bop->type;
+	result->block = bop->block;
+
+	brb->begin = brb_next(brb, brb->begin);
+
+	return 0;
+}
+
+/*----------------------------------------------------------------*/
+
 struct sm_metadata {
 	struct dm_space_map sm;
 
@@ -101,25 +164,20 @@ struct sm_metadata {
 	unsigned recursion_count;
 	unsigned allocated_this_transaction;
-	unsigned nr_uncommitted;
-	struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS];
+	struct bop_ring_buffer uncommitted;
 
 	struct threshold threshold;
 };
 
 static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
 {
-	struct block_op *op;
+	int r = brb_push(&smm->uncommitted, type, b);
 
-	if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) {
+	if (r) {
 		DMERR("too many recursive allocations");
 		return -ENOMEM;
 	}
 
-	op = smm->uncommitted + smm->nr_uncommitted++;
-	op->type = type;
-	op->block = b;
-
 	return 0;
 }
 
@@ -158,11 +216,17 @@ static int out(struct sm_metadata *smm)
 		return -ENOMEM;
 	}
 
-	if (smm->recursion_count == 1 && smm->nr_uncommitted) {
-		while (smm->nr_uncommitted && !r) {
-			smm->nr_uncommitted--;
-			r = commit_bop(smm, smm->uncommitted +
-				       smm->nr_uncommitted);
+	if (smm->recursion_count == 1) {
+		while (!brb_empty(&smm->uncommitted)) {
+			struct block_op bop;
+
+			r = brb_pop(&smm->uncommitted, &bop);
+			if (r) {
+				DMERR("bug in bop ring buffer");
+				break;
+			}
+
+			r = commit_bop(smm, &bop);
 			if (r)
 				break;
 		}
@@ -217,7 +281,8 @@ static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
 static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
 				 uint32_t *result)
 {
-	int r, i;
+	int r;
+	unsigned i;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 	unsigned adjustment = 0;
 
@@ -225,8 +290,10 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
 	 * We may have some uncommitted adjustments to add.  This list
 	 * should always be really short.
 	 */
-	for (i = 0; i < smm->nr_uncommitted; i++) {
-		struct block_op *op = smm->uncommitted + i;
+	for (i = smm->uncommitted.begin;
+	     i != smm->uncommitted.end;
+	     i = brb_next(&smm->uncommitted, i)) {
+		struct block_op *op = smm->uncommitted.bops + i;
 
 		if (op->block != b)
 			continue;
@@ -254,7 +321,8 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
 static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
 					      dm_block_t b, int *result)
 {
-	int r, i, adjustment = 0;
+	int r, adjustment = 0;
+	unsigned i;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 	uint32_t rc;
 
@@ -262,8 +330,11 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
 	 * We may have some uncommitted adjustments to add.  This list
 	 * should always be really short.
 	 */
-	for (i = 0; i < smm->nr_uncommitted; i++) {
-		struct block_op *op = smm->uncommitted + i;
+	for (i = smm->uncommitted.begin;
+	     i != smm->uncommitted.end;
+	     i = brb_next(&smm->uncommitted, i)) {
+
+		struct block_op *op = smm->uncommitted.bops + i;
 
 		if (op->block != b)
 			continue;
@@ -671,7 +742,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
 	smm->begin = superblock + 1;
 	smm->recursion_count = 0;
 	smm->allocated_this_transaction = 0;
-	smm->nr_uncommitted = 0;
+	brb_init(&smm->uncommitted);
 	threshold_init(&smm->threshold);
 
 	memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
@@ -715,7 +786,7 @@ int dm_sm_metadata_open(struct dm_space_map *sm,
 	smm->begin = 0;
 	smm->recursion_count = 0;
 	smm->allocated_this_transaction = 0;
-	smm->nr_uncommitted = 0;
+	brb_init(&smm->uncommitted);
 	threshold_init(&smm->threshold);
 
 	memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
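The bop_ring_buffer introduced in dm-space-map-metadata.c above replaces the fixed uncommitted[] array and its counter with a ring that deliberately leaves one slot unused: begin == end always means empty, and a push whose advance of end would land on begin reports full. Below is a self-contained sketch of those invariants; NR_SLOTS and the int payload are illustrative stand-ins for MAX_RECURSIVE_ALLOCATIONS + 1 and struct block_op, not the kernel types.

/*
 * Userspace sketch of the one-slot-open ring buffer pattern used by
 * struct bop_ring_buffer above.
 */
#include <assert.h>
#include <errno.h>
#include <stdbool.h>

#define NR_SLOTS 5	/* stand-in for MAX_RECURSIVE_ALLOCATIONS + 1 */

struct ring {
	unsigned begin;		/* index of the oldest entry */
	unsigned end;		/* index one past the newest entry */
	int slots[NR_SLOTS];
};

static unsigned ring_next(unsigned i)
{
	return (i + 1 == NR_SLOTS) ? 0 : i + 1;
}

static bool ring_empty(const struct ring *r)
{
	return r->begin == r->end;
}

/*
 * One slot is always left unused: if filling it would make 'end'
 * catch up with 'begin', the buffer reports full.  This is how
 * begin == end can unambiguously mean "empty".
 */
static int ring_push(struct ring *r, int v)
{
	unsigned next = ring_next(r->end);

	if (next == r->begin)
		return -ENOMEM;	/* full */

	r->slots[r->end] = v;
	r->end = next;
	return 0;
}

static int ring_pop(struct ring *r, int *v)
{
	if (ring_empty(r))
		return -ENODATA;

	*v = r->slots[r->begin];
	r->begin = ring_next(r->begin);
	return 0;
}

int main(void)
{
	struct ring r = { 0, 0, { 0 } };
	int v;

	/* Only NR_SLOTS - 1 entries fit; the next push must fail. */
	for (v = 0; v < NR_SLOTS - 1; v++)
		assert(ring_push(&r, v) == 0);
	assert(ring_push(&r, 99) == -ENOMEM);

	/* Entries come back out in FIFO order. */
	assert(ring_pop(&r, &v) == 0 && v == 0);
	assert(ring_pop(&r, &v) == 0 && v == 1);
	return 0;
}

Keeping one slot open lets the structure distinguish full from empty without a separate element count, which is what allows the patched sm_metadata_get_count() loops to walk the live region using nothing more than begin, end and brb_next().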