Diffstat (limited to 'drivers')
 -rw-r--r--  drivers/md/Kconfig                                |   9
 -rw-r--r--  drivers/md/Makefile                               |   1
 -rw-r--r--  drivers/md/dm-cache-metadata.c                    |   9
 -rw-r--r--  drivers/md/dm-crypt.c                             |  26
 -rw-r--r--  drivers/md/dm-delay.c                             |   3
 -rw-r--r--  drivers/md/dm-dust.c                              | 515
 -rw-r--r--  drivers/md/dm-exception-store.h                   |   3
 -rw-r--r--  drivers/md/dm-init.c                              |   8
 -rw-r--r--  drivers/md/dm-integrity.c                         | 717
 -rw-r--r--  drivers/md/dm-ioctl.c                             |   6
 -rw-r--r--  drivers/md/dm-mpath.c                             |  19
 -rw-r--r--  drivers/md/dm-rq.c                                |   8
 -rw-r--r--  drivers/md/dm-snap.c                              | 359
 -rw-r--r--  drivers/md/dm-target.c                            |   3
 -rw-r--r--  drivers/md/dm-thin-metadata.c                     | 139
 -rw-r--r--  drivers/md/dm-writecache.c                        |  29
 -rw-r--r--  drivers/md/dm-zoned-metadata.c                    |   5
 -rw-r--r--  drivers/md/dm-zoned-target.c                      |   3
 -rw-r--r--  drivers/md/dm.c                                   |  12
 -rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c  |   2
20 files changed, 1583 insertions(+), 293 deletions(-)
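The largest addition below is the new dm-dust target, which fails reads on administratively selected blocks. As a rough orientation before the diff itself, here is a hypothetical dmsetup session against it, with the table format taken from the dust_ctr() comment (`<device_path> <offset> <blksz>`) and the message verbs from dust_message() further down; the backing device name, device size, and block numbers are invented for illustration:

    # Create a dust device over /dev/vdb1 (size in 512-byte sectors is
    # hypothetical); offset 0, block size 512.
    dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 512'

    # Manage the badblock list via messages; block numbers are in units
    # of <blksz>, not sectors (see dust_add_block()).
    dmsetup message dust1 0 addbadblock 60
    dmsetup message dust1 0 addbadblock 67
    dmsetup message dust1 0 countbadblocks

    # Reads pass through until failure simulation is switched on.
    dmsetup message dust1 0 enable

    # With 'enable' active, a read of block 60 should fail with an I/O
    # error, while a write to it succeeds and drops it from the list
    # (see __dust_map_write() in the diff).
    dmsetup message dust1 0 queryblock 60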
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 2557f198e175..db269a348b20 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -436,6 +436,15 @@ config DM_DELAY If unsure, say N. +config DM_DUST + tristate "Bad sector simulation target" + depends on BLK_DEV_DM + ---help--- + A target that simulates bad sector behavior. + Useful for testing. + + If unsure, say N. + config DM_INIT bool "DM \"dm-mod.create=\" parameter support" depends on BLK_DEV_DM=y diff --git a/drivers/md/Makefile b/drivers/md/Makefile index a52b703e588e..be7a6eb92abc 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_DM_BUFIO) += dm-bufio.o obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o obj-$(CONFIG_DM_DELAY) += dm-delay.o +obj-$(CONFIG_DM_DUST) += dm-dust.o obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 6fc93834da44..151aa95775be 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1167,11 +1167,18 @@ static int __load_discards(struct dm_cache_metadata *cmd, if (r) return r; - for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) { + for (b = 0; ; b++) { r = fn(context, cmd->discard_block_size, to_dblock(b), dm_bitset_cursor_get_value(&c)); if (r) break; + + if (b >= (from_dblock(cmd->discard_nr_blocks) - 1)) + break; + + r = dm_bitset_cursor_next(&c); + if (r) + break; } dm_bitset_cursor_end(&c); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 7f6462f74ac8..1b16d34bb785 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -946,6 +946,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) { #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk); + struct mapped_device *md = dm_table_get_md(ti->table); /* From now we require underlying device with our integrity profile */ if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) { @@ -965,7 +966,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) if (crypt_integrity_aead(cc)) { cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size; - DMINFO("Integrity AEAD, tag size %u, IV size %u.", + DMDEBUG("%s: Integrity AEAD, tag size %u, IV size %u.", dm_device_name(md), cc->integrity_tag_size, cc->integrity_iv_size); if (crypto_aead_setauthsize(any_tfm_aead(cc), cc->integrity_tag_size)) { @@ -973,7 +974,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) return -EINVAL; } } else if (cc->integrity_iv_size) - DMINFO("Additional per-sector space %u bytes for IV.", + DMDEBUG("%s: Additional per-sector space %u bytes for IV.", dm_device_name(md), cc->integrity_iv_size); if ((cc->integrity_tag_size + cc->integrity_iv_size) != bi->tag_size) { @@ -1031,11 +1032,11 @@ static u8 *org_iv_of_dmreq(struct crypt_config *cc, return iv_of_dmreq(cc, dmreq) + cc->iv_size; } -static uint64_t *org_sector_of_dmreq(struct crypt_config *cc, +static __le64 *org_sector_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq) { u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size + cc->iv_size; - return (uint64_t*) ptr; + return (__le64 *) ptr; } static unsigned int *org_tag_of_dmreq(struct crypt_config *cc, @@ -1071,7 +1072,7 @@ static int crypt_convert_block_aead(struct crypt_config *cc, struct bio_vec bv_out 
= bio_iter_iovec(ctx->bio_out, ctx->iter_out); struct dm_crypt_request *dmreq; u8 *iv, *org_iv, *tag_iv, *tag; - uint64_t *sector; + __le64 *sector; int r = 0; BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size); @@ -1143,9 +1144,11 @@ static int crypt_convert_block_aead(struct crypt_config *cc, r = crypto_aead_decrypt(req); } - if (r == -EBADMSG) - DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu", + if (r == -EBADMSG) { + char b[BDEVNAME_SIZE]; + DMERR_LIMIT("%s: INTEGRITY AEAD ERROR, sector %llu", bio_devname(ctx->bio_in, b), (unsigned long long)le64_to_cpu(*sector)); + } if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) r = cc->iv_gen_ops->post(cc, org_iv, dmreq); @@ -1166,7 +1169,7 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc, struct scatterlist *sg_in, *sg_out; struct dm_crypt_request *dmreq; u8 *iv, *org_iv, *tag_iv; - uint64_t *sector; + __le64 *sector; int r = 0; /* Reject unexpected unaligned bio. */ @@ -1788,7 +1791,8 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq); if (error == -EBADMSG) { - DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu", + char b[BDEVNAME_SIZE]; + DMERR_LIMIT("%s: INTEGRITY AEAD ERROR, sector %llu", bio_devname(ctx->bio_in, b), (unsigned long long)le64_to_cpu(*org_sector_of_dmreq(cc, dmreq))); io->error = BLK_STS_PROTECTION; } else if (error < 0) @@ -1887,7 +1891,7 @@ static int crypt_alloc_tfms_skcipher(struct crypt_config *cc, char *ciphermode) * algorithm implementation is used. Help people debug performance * problems by logging the ->cra_driver_name. */ - DMINFO("%s using implementation \"%s\"", ciphermode, + DMDEBUG_LIMIT("%s using implementation \"%s\"", ciphermode, crypto_skcipher_alg(any_tfm(cc))->base.cra_driver_name); return 0; } @@ -1907,7 +1911,7 @@ static int crypt_alloc_tfms_aead(struct crypt_config *cc, char *ciphermode) return err; } - DMINFO("%s using implementation \"%s\"", ciphermode, + DMDEBUG_LIMIT("%s using implementation \"%s\"", ciphermode, crypto_aead_alg(any_tfm_aead(cc))->base.cra_driver_name); return 0; } diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index fddffe251bf6..f496213f8b67 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -121,7 +121,8 @@ static void delay_dtr(struct dm_target *ti) { struct delay_c *dc = ti->private; - destroy_workqueue(dc->kdelayd_wq); + if (dc->kdelayd_wq) + destroy_workqueue(dc->kdelayd_wq); if (dc->read.dev) dm_put_device(ti, dc->read.dev); diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c new file mode 100644 index 000000000000..845f376a72d9 --- /dev/null +++ b/drivers/md/dm-dust.c @@ -0,0 +1,515 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Red Hat, Inc. + * + * This is a test "dust" device, which fails reads on specified + * sectors, emulating the behavior of a hard disk drive sending + * a "Read Medium Error" sense. 
+ * + */ + +#include <linux/device-mapper.h> +#include <linux/module.h> +#include <linux/rbtree.h> + +#define DM_MSG_PREFIX "dust" + +struct badblock { + struct rb_node node; + sector_t bb; +}; + +struct dust_device { + struct dm_dev *dev; + struct rb_root badblocklist; + unsigned long long badblock_count; + spinlock_t dust_lock; + unsigned int blksz; + unsigned int sect_per_block; + sector_t start; + bool fail_read_on_bb:1; + bool quiet_mode:1; +}; + +static struct badblock *dust_rb_search(struct rb_root *root, sector_t blk) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct badblock *bblk = rb_entry(node, struct badblock, node); + + if (bblk->bb > blk) + node = node->rb_left; + else if (bblk->bb < blk) + node = node->rb_right; + else + return bblk; + } + + return NULL; +} + +static bool dust_rb_insert(struct rb_root *root, struct badblock *new) +{ + struct badblock *bblk; + struct rb_node **link = &root->rb_node, *parent = NULL; + sector_t value = new->bb; + + while (*link) { + parent = *link; + bblk = rb_entry(parent, struct badblock, node); + + if (bblk->bb > value) + link = &(*link)->rb_left; + else if (bblk->bb < value) + link = &(*link)->rb_right; + else + return false; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, root); + + return true; +} + +static int dust_remove_block(struct dust_device *dd, unsigned long long block) +{ + struct badblock *bblock; + unsigned long flags; + + spin_lock_irqsave(&dd->dust_lock, flags); + bblock = dust_rb_search(&dd->badblocklist, block * dd->sect_per_block); + + if (bblock == NULL) { + if (!dd->quiet_mode) { + DMERR("%s: block %llu not found in badblocklist", + __func__, block); + } + spin_unlock_irqrestore(&dd->dust_lock, flags); + return -EINVAL; + } + + rb_erase(&bblock->node, &dd->badblocklist); + dd->badblock_count--; + if (!dd->quiet_mode) + DMINFO("%s: badblock removed at block %llu", __func__, block); + kfree(bblock); + spin_unlock_irqrestore(&dd->dust_lock, flags); + + return 0; +} + +static int dust_add_block(struct dust_device *dd, unsigned long long block) +{ + struct badblock *bblock; + unsigned long flags; + + bblock = kmalloc(sizeof(*bblock), GFP_KERNEL); + if (bblock == NULL) { + if (!dd->quiet_mode) + DMERR("%s: badblock allocation failed", __func__); + return -ENOMEM; + } + + spin_lock_irqsave(&dd->dust_lock, flags); + bblock->bb = block * dd->sect_per_block; + if (!dust_rb_insert(&dd->badblocklist, bblock)) { + if (!dd->quiet_mode) { + DMERR("%s: block %llu already in badblocklist", + __func__, block); + } + spin_unlock_irqrestore(&dd->dust_lock, flags); + kfree(bblock); + return -EINVAL; + } + + dd->badblock_count++; + if (!dd->quiet_mode) + DMINFO("%s: badblock added at block %llu", __func__, block); + spin_unlock_irqrestore(&dd->dust_lock, flags); + + return 0; +} + +static int dust_query_block(struct dust_device *dd, unsigned long long block) +{ + struct badblock *bblock; + unsigned long flags; + + spin_lock_irqsave(&dd->dust_lock, flags); + bblock = dust_rb_search(&dd->badblocklist, block * dd->sect_per_block); + if (bblock != NULL) + DMINFO("%s: block %llu found in badblocklist", __func__, block); + else + DMINFO("%s: block %llu not found in badblocklist", __func__, block); + spin_unlock_irqrestore(&dd->dust_lock, flags); + + return 0; +} + +static int __dust_map_read(struct dust_device *dd, sector_t thisblock) +{ + struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock); + + if (bblk) + return DM_MAPIO_KILL; + + return DM_MAPIO_REMAPPED; +} + +static int 
dust_map_read(struct dust_device *dd, sector_t thisblock, + bool fail_read_on_bb) +{ + unsigned long flags; + int ret = DM_MAPIO_REMAPPED; + + if (fail_read_on_bb) { + spin_lock_irqsave(&dd->dust_lock, flags); + ret = __dust_map_read(dd, thisblock); + spin_unlock_irqrestore(&dd->dust_lock, flags); + } + + return ret; +} + +static void __dust_map_write(struct dust_device *dd, sector_t thisblock) +{ + struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock); + + if (bblk) { + rb_erase(&bblk->node, &dd->badblocklist); + dd->badblock_count--; + kfree(bblk); + if (!dd->quiet_mode) { + sector_div(thisblock, dd->sect_per_block); + DMINFO("block %llu removed from badblocklist by write", + (unsigned long long)thisblock); + } + } +} + +static int dust_map_write(struct dust_device *dd, sector_t thisblock, + bool fail_read_on_bb) +{ + unsigned long flags; + + if (fail_read_on_bb) { + spin_lock_irqsave(&dd->dust_lock, flags); + __dust_map_write(dd, thisblock); + spin_unlock_irqrestore(&dd->dust_lock, flags); + } + + return DM_MAPIO_REMAPPED; +} + +static int dust_map(struct dm_target *ti, struct bio *bio) +{ + struct dust_device *dd = ti->private; + int ret; + + bio_set_dev(bio, dd->dev->bdev); + bio->bi_iter.bi_sector = dd->start + dm_target_offset(ti, bio->bi_iter.bi_sector); + + if (bio_data_dir(bio) == READ) + ret = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); + else + ret = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); + + return ret; +} + +static bool __dust_clear_badblocks(struct rb_root *tree, + unsigned long long count) +{ + struct rb_node *node = NULL, *nnode = NULL; + + nnode = rb_first(tree); + if (nnode == NULL) { + BUG_ON(count != 0); + return false; + } + + while (nnode) { + node = nnode; + nnode = rb_next(node); + rb_erase(node, tree); + count--; + kfree(node); + } + BUG_ON(count != 0); + BUG_ON(tree->rb_node != NULL); + + return true; +} + +static int dust_clear_badblocks(struct dust_device *dd) +{ + unsigned long flags; + struct rb_root badblocklist; + unsigned long long badblock_count; + + spin_lock_irqsave(&dd->dust_lock, flags); + badblocklist = dd->badblocklist; + badblock_count = dd->badblock_count; + dd->badblocklist = RB_ROOT; + dd->badblock_count = 0; + spin_unlock_irqrestore(&dd->dust_lock, flags); + + if (!__dust_clear_badblocks(&badblocklist, badblock_count)) + DMINFO("%s: no badblocks found", __func__); + else + DMINFO("%s: badblocks cleared", __func__); + + return 0; +} + +/* + * Target parameters: + * + * <device_path> <offset> <blksz> + * + * device_path: path to the block device + * offset: offset to data area from start of device_path + * blksz: block size (minimum 512, maximum 1073741824, must be a power of 2) + */ +static int dust_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct dust_device *dd; + unsigned long long tmp; + char dummy; + unsigned int blksz; + unsigned int sect_per_block; + sector_t DUST_MAX_BLKSZ_SECTORS = 2097152; + sector_t max_block_sectors = min(ti->len, DUST_MAX_BLKSZ_SECTORS); + + if (argc != 3) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + if (kstrtouint(argv[2], 10, &blksz) || !blksz) { + ti->error = "Invalid block size parameter"; + return -EINVAL; + } + + if (blksz < 512) { + ti->error = "Block size must be at least 512"; + return -EINVAL; + } + + if (!is_power_of_2(blksz)) { + ti->error = "Block size must be a power of 2"; + return -EINVAL; + } + + if (to_sector(blksz) > max_block_sectors) { + ti->error = "Block size is too large"; + return 
-EINVAL; + } + + sect_per_block = (blksz >> SECTOR_SHIFT); + + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 || tmp != (sector_t)tmp) { + ti->error = "Invalid device offset sector"; + return -EINVAL; + } + + dd = kzalloc(sizeof(struct dust_device), GFP_KERNEL); + if (dd == NULL) { + ti->error = "Cannot allocate context"; + return -ENOMEM; + } + + if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dd->dev)) { + ti->error = "Device lookup failed"; + kfree(dd); + return -EINVAL; + } + + dd->sect_per_block = sect_per_block; + dd->blksz = blksz; + dd->start = tmp; + + /* + * Whether to fail a read on a "bad" block. + * Defaults to false; enabled later by message. + */ + dd->fail_read_on_bb = false; + + /* + * Initialize bad block list rbtree. + */ + dd->badblocklist = RB_ROOT; + dd->badblock_count = 0; + spin_lock_init(&dd->dust_lock); + + dd->quiet_mode = false; + + BUG_ON(dm_set_target_max_io_len(ti, dd->sect_per_block) != 0); + + ti->num_discard_bios = 1; + ti->num_flush_bios = 1; + ti->private = dd; + + return 0; +} + +static void dust_dtr(struct dm_target *ti) +{ + struct dust_device *dd = ti->private; + + __dust_clear_badblocks(&dd->badblocklist, dd->badblock_count); + dm_put_device(ti, dd->dev); + kfree(dd); +} + +static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, + char *result_buf, unsigned int maxlen) +{ + struct dust_device *dd = ti->private; + sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT; + bool invalid_msg = false; + int result = -EINVAL; + unsigned long long tmp, block; + unsigned long flags; + char dummy; + + if (argc == 1) { + if (!strcasecmp(argv[0], "addbadblock") || + !strcasecmp(argv[0], "removebadblock") || + !strcasecmp(argv[0], "queryblock")) { + DMERR("%s requires an additional argument", argv[0]); + } else if (!strcasecmp(argv[0], "disable")) { + DMINFO("disabling read failures on bad sectors"); + dd->fail_read_on_bb = false; + result = 0; + } else if (!strcasecmp(argv[0], "enable")) { + DMINFO("enabling read failures on bad sectors"); + dd->fail_read_on_bb = true; + result = 0; + } else if (!strcasecmp(argv[0], "countbadblocks")) { + spin_lock_irqsave(&dd->dust_lock, flags); + DMINFO("countbadblocks: %llu badblock(s) found", + dd->badblock_count); + spin_unlock_irqrestore(&dd->dust_lock, flags); + result = 0; + } else if (!strcasecmp(argv[0], "clearbadblocks")) { + result = dust_clear_badblocks(dd); + } else if (!strcasecmp(argv[0], "quiet")) { + if (!dd->quiet_mode) + dd->quiet_mode = true; + else + dd->quiet_mode = false; + result = 0; + } else { + invalid_msg = true; + } + } else if (argc == 2) { + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) + return result; + + block = tmp; + sector_div(size, dd->sect_per_block); + if (block > size) { + DMERR("selected block value out of range"); + return result; + } + + if (!strcasecmp(argv[0], "addbadblock")) + result = dust_add_block(dd, block); + else if (!strcasecmp(argv[0], "removebadblock")) + result = dust_remove_block(dd, block); + else if (!strcasecmp(argv[0], "queryblock")) + result = dust_query_block(dd, block); + else + invalid_msg = true; + + } else + DMERR("invalid number of arguments '%d'", argc); + + if (invalid_msg) + DMERR("unrecognized message '%s' received", argv[0]); + + return result; +} + +static void dust_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + struct dust_device *dd = ti->private; + unsigned int sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%s 
%s %s", dd->dev->name, + dd->fail_read_on_bb ? "fail_read_on_bad_block" : "bypass", + dd->quiet_mode ? "quiet" : "verbose"); + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %llu %u", dd->dev->name, + (unsigned long long)dd->start, dd->blksz); + break; + } +} + +static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +{ + struct dust_device *dd = ti->private; + struct dm_dev *dev = dd->dev; + + *bdev = dev->bdev; + + /* + * Only pass ioctls through if the device sizes match exactly. + */ + if (dd->start || + ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT) + return 1; + + return 0; +} + +static int dust_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, + void *data) +{ + struct dust_device *dd = ti->private; + + return fn(ti, dd->dev, dd->start, ti->len, data); +} + +static struct target_type dust_target = { + .name = "dust", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = dust_ctr, + .dtr = dust_dtr, + .iterate_devices = dust_iterate_devices, + .map = dust_map, + .message = dust_message, + .status = dust_status, + .prepare_ioctl = dust_prepare_ioctl, +}; + +static int __init dm_dust_init(void) +{ + int result = dm_register_target(&dust_target); + + if (result < 0) + DMERR("dm_register_target failed %d", result); + + return result; +} + +static void __exit dm_dust_exit(void) +{ + dm_unregister_target(&dust_target); +} + +module_init(dm_dust_init); +module_exit(dm_dust_exit); + +MODULE_DESCRIPTION(DM_NAME " dust test target"); +MODULE_AUTHOR("Bryan Gurney <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 721efc493942..3f4139ac1f60 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -11,6 +11,7 @@ #define _LINUX_DM_EXCEPTION_STORE #include <linux/blkdev.h> +#include <linux/list_bl.h> #include <linux/device-mapper.h> /* @@ -27,7 +28,7 @@ typedef sector_t chunk_t; * chunk within the device. 
*/ struct dm_exception { - struct list_head hash_list; + struct hlist_bl_node hash_list; chunk_t old_chunk; chunk_t new_chunk; diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c index 4b76f84424c3..352e803f566e 100644 --- a/drivers/md/dm-init.c +++ b/drivers/md/dm-init.c @@ -160,7 +160,7 @@ static int __init dm_parse_table(struct dm_device *dev, char *str) while (table_entry) { DMDEBUG("parsing table \"%s\"", str); - if (++dev->dmi.target_count >= DM_MAX_TARGETS) { + if (++dev->dmi.target_count > DM_MAX_TARGETS) { DMERR("too many targets %u > %d", dev->dmi.target_count, DM_MAX_TARGETS); return -EINVAL; @@ -242,9 +242,9 @@ static int __init dm_parse_devices(struct list_head *devices, char *str) return -ENOMEM; list_add_tail(&dev->list, devices); - if (++ndev >= DM_MAX_DEVICES) { - DMERR("too many targets %u > %d", - dev->dmi.target_count, DM_MAX_TARGETS); + if (++ndev > DM_MAX_DEVICES) { + DMERR("too many devices %lu > %d", + ndev, DM_MAX_DEVICES); return -EINVAL; } diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index c27c32cf4a30..44e76cda087a 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -15,6 +15,7 @@ #include <linux/rbtree.h> #include <linux/delay.h> #include <linux/random.h> +#include <linux/reboot.h> #include <crypto/hash.h> #include <crypto/skcipher.h> #include <linux/async_tx.h> @@ -24,6 +25,7 @@ #define DEFAULT_INTERLEAVE_SECTORS 32768 #define DEFAULT_JOURNAL_SIZE_FACTOR 7 +#define DEFAULT_SECTORS_PER_BITMAP_BIT 32768 #define DEFAULT_BUFFER_SECTORS 128 #define DEFAULT_JOURNAL_WATERMARK 50 #define DEFAULT_SYNC_MSEC 10000 @@ -33,6 +35,8 @@ #define METADATA_WORKQUEUE_MAX_ACTIVE 16 #define RECALC_SECTORS 8192 #define RECALC_WRITE_SUPER 16 +#define BITMAP_BLOCK_SIZE 4096 /* don't change it */ +#define BITMAP_FLUSH_INTERVAL (10 * HZ) /* * Warning - DEBUG_PRINT prints security-sensitive data to the log, @@ -48,6 +52,7 @@ #define SB_MAGIC "integrt" #define SB_VERSION_1 1 #define SB_VERSION_2 2 +#define SB_VERSION_3 3 #define SB_SECTORS 8 #define MAX_SECTORS_PER_BLOCK 8 @@ -60,12 +65,14 @@ struct superblock { __u64 provided_data_sectors; /* userspace uses this value */ __u32 flags; __u8 log2_sectors_per_block; - __u8 pad[3]; + __u8 log2_blocks_per_bitmap_bit; + __u8 pad[2]; __u64 recalc_sector; }; #define SB_FLAG_HAVE_JOURNAL_MAC 0x1 #define SB_FLAG_RECALCULATING 0x2 +#define SB_FLAG_DIRTY_BITMAP 0x4 #define JOURNAL_ENTRY_ROUNDUP 8 @@ -151,9 +158,18 @@ struct dm_integrity_c { struct workqueue_struct *metadata_wq; struct superblock *sb; unsigned journal_pages; + unsigned n_bitmap_blocks; + struct page_list *journal; struct page_list *journal_io; struct page_list *journal_xor; + struct page_list *recalc_bitmap; + struct page_list *may_write_bitmap; + struct bitmap_block_status *bbs; + unsigned bitmap_flush_interval; + int synchronous_mode; + struct bio_list synchronous_bios; + struct delayed_work bitmap_flush_work; struct crypto_skcipher *journal_crypt; struct scatterlist **journal_scatterlist; @@ -180,6 +196,7 @@ struct dm_integrity_c { __s8 log2_metadata_run; __u8 log2_buffer_sectors; __u8 sectors_per_block; + __u8 log2_blocks_per_bitmap_bit; unsigned char mode; int suspending; @@ -232,17 +249,20 @@ struct dm_integrity_c { bool journal_uptodate; bool just_formatted; + bool recalculate_flag; struct alg_spec internal_hash_alg; struct alg_spec journal_crypt_alg; struct alg_spec journal_mac_alg; atomic64_t number_of_mismatches; + + struct notifier_block reboot_notifier; }; struct dm_integrity_range { sector_t logical_sector; - unsigned 
n_sectors; + sector_t n_sectors; bool waiting; union { struct rb_node node; @@ -288,6 +308,16 @@ struct journal_io { struct journal_completion *comp; }; +struct bitmap_block_status { + struct work_struct work; + struct dm_integrity_c *ic; + unsigned idx; + unsigned long *bitmap; + struct bio_list bio_queue; + spinlock_t bio_queue_lock; + +}; + static struct kmem_cache *journal_io_cache; #define JOURNAL_IO_MEMPOOL 32 @@ -423,7 +453,9 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr) static void sb_set_version(struct dm_integrity_c *ic) { - if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) + if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) + ic->sb->version = SB_VERSION_3; + else if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) ic->sb->version = SB_VERSION_2; else ic->sb->version = SB_VERSION_1; @@ -447,6 +479,137 @@ static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags) return dm_io(&io_req, 1, &io_loc, NULL); } +#define BITMAP_OP_TEST_ALL_SET 0 +#define BITMAP_OP_TEST_ALL_CLEAR 1 +#define BITMAP_OP_SET 2 +#define BITMAP_OP_CLEAR 3 + +static bool block_bitmap_op(struct dm_integrity_c *ic, struct page_list *bitmap, + sector_t sector, sector_t n_sectors, int mode) +{ + unsigned long bit, end_bit, this_end_bit, page, end_page; + unsigned long *data; + + if (unlikely(((sector | n_sectors) & ((1 << ic->sb->log2_sectors_per_block) - 1)) != 0)) { + DMCRIT("invalid bitmap access (%llx,%llx,%d,%d,%d)", + (unsigned long long)sector, + (unsigned long long)n_sectors, + ic->sb->log2_sectors_per_block, + ic->log2_blocks_per_bitmap_bit, + mode); + BUG(); + } + + if (unlikely(!n_sectors)) + return true; + + bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + end_bit = (sector + n_sectors - 1) >> + (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + + page = bit / (PAGE_SIZE * 8); + bit %= PAGE_SIZE * 8; + + end_page = end_bit / (PAGE_SIZE * 8); + end_bit %= PAGE_SIZE * 8; + +repeat: + if (page < end_page) { + this_end_bit = PAGE_SIZE * 8 - 1; + } else { + this_end_bit = end_bit; + } + + data = lowmem_page_address(bitmap[page].page); + + if (mode == BITMAP_OP_TEST_ALL_SET) { + while (bit <= this_end_bit) { + if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) { + do { + if (data[bit / BITS_PER_LONG] != -1) + return false; + bit += BITS_PER_LONG; + } while (this_end_bit >= bit + BITS_PER_LONG - 1); + continue; + } + if (!test_bit(bit, data)) + return false; + bit++; + } + } else if (mode == BITMAP_OP_TEST_ALL_CLEAR) { + while (bit <= this_end_bit) { + if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) { + do { + if (data[bit / BITS_PER_LONG] != 0) + return false; + bit += BITS_PER_LONG; + } while (this_end_bit >= bit + BITS_PER_LONG - 1); + continue; + } + if (test_bit(bit, data)) + return false; + bit++; + } + } else if (mode == BITMAP_OP_SET) { + while (bit <= this_end_bit) { + if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) { + do { + data[bit / BITS_PER_LONG] = -1; + bit += BITS_PER_LONG; + } while (this_end_bit >= bit + BITS_PER_LONG - 1); + continue; + } + __set_bit(bit, data); + bit++; + } + } else if (mode == BITMAP_OP_CLEAR) { + if (!bit && this_end_bit == PAGE_SIZE * 8 - 1) + clear_page(data); + else while (bit <= this_end_bit) { + if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) { + do { + data[bit / BITS_PER_LONG] = 0; + bit += BITS_PER_LONG; + } 
while (this_end_bit >= bit + BITS_PER_LONG - 1); + continue; + } + __clear_bit(bit, data); + bit++; + } + } else { + BUG(); + } + + if (unlikely(page < end_page)) { + bit = 0; + page++; + goto repeat; + } + + return true; +} + +static void block_bitmap_copy(struct dm_integrity_c *ic, struct page_list *dst, struct page_list *src) +{ + unsigned n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE); + unsigned i; + + for (i = 0; i < n_bitmap_pages; i++) { + unsigned long *dst_data = lowmem_page_address(dst[i].page); + unsigned long *src_data = lowmem_page_address(src[i].page); + copy_page(dst_data, src_data); + } +} + +static struct bitmap_block_status *sector_to_bitmap_block(struct dm_integrity_c *ic, sector_t sector) +{ + unsigned bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + unsigned bitmap_block = bit / (BITMAP_BLOCK_SIZE * 8); + + BUG_ON(bitmap_block >= ic->n_bitmap_blocks); + return &ic->bbs[bitmap_block]; +} + static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset, bool e, const char *function) { @@ -455,8 +618,8 @@ static void access_journal_check(struct dm_integrity_c *ic, unsigned section, un if (unlikely(section >= ic->journal_sections) || unlikely(offset >= limit)) { - printk(KERN_CRIT "%s: invalid access at (%u,%u), limit (%u,%u)\n", - function, section, offset, ic->journal_sections, limit); + DMCRIT("%s: invalid access at (%u,%u), limit (%u,%u)", + function, section, offset, ic->journal_sections, limit); BUG(); } #endif @@ -756,12 +919,12 @@ static void complete_journal_io(unsigned long error, void *context) complete_journal_op(comp); } -static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section, - unsigned n_sections, struct journal_completion *comp) +static void rw_journal_sectors(struct dm_integrity_c *ic, int op, int op_flags, + unsigned sector, unsigned n_sectors, struct journal_completion *comp) { struct dm_io_request io_req; struct dm_io_region io_loc; - unsigned sector, n_sectors, pl_index, pl_offset; + unsigned pl_index, pl_offset; int r; if (unlikely(dm_integrity_failed(ic))) { @@ -770,9 +933,6 @@ static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned return; } - sector = section * ic->journal_section_sectors; - n_sectors = n_sections * ic->journal_section_sectors; - pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); @@ -805,6 +965,17 @@ static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned } } +static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section, + unsigned n_sections, struct journal_completion *comp) +{ + unsigned sector, n_sectors; + + sector = section * ic->journal_section_sectors; + n_sectors = n_sections * ic->journal_section_sectors; + + rw_journal_sectors(ic, op, op_flags, sector, n_sectors, comp); +} + static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections) { struct journal_completion io_comp; @@ -988,6 +1159,12 @@ static void wait_and_add_new_range(struct dm_integrity_c *ic, struct dm_integrit } while (unlikely(new_range->waiting)); } +static void add_new_range_and_wait(struct dm_integrity_c *ic, struct dm_integrity_range *new_range) +{ + if (unlikely(!add_new_range(ic, new_range, true))) + wait_and_add_new_range(ic, new_range); +} + static void init_journal_node(struct journal_node *node) { RB_CLEAR_NODE(&node->node); @@ -1204,6 +1381,14 
@@ static void do_endio(struct dm_integrity_c *ic, struct bio *bio) int r = dm_integrity_failed(ic); if (unlikely(r) && !bio->bi_status) bio->bi_status = errno_to_blk_status(r); + if (unlikely(ic->synchronous_mode) && bio_op(bio) == REQ_OP_WRITE) { + unsigned long flags; + spin_lock_irqsave(&ic->endio_wait.lock, flags); + bio_list_add(&ic->synchronous_bios, bio); + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0); + spin_unlock_irqrestore(&ic->endio_wait.lock, flags); + return; + } bio_endio(bio); } @@ -1477,7 +1662,8 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) else wanted_tag_size *= ic->tag_size; if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) { - DMERR("Invalid integrity data size %u, expected %u", bip->bip_iter.bi_size, wanted_tag_size); + DMERR("Invalid integrity data size %u, expected %u", + bip->bip_iter.bi_size, wanted_tag_size); return DM_MAPIO_KILL; } } @@ -1681,7 +1867,7 @@ retry: unsigned ws, we, range_sectors; dio->range.n_sectors = min(dio->range.n_sectors, - ic->free_sectors << ic->sb->log2_sectors_per_block); + (sector_t)ic->free_sectors << ic->sb->log2_sectors_per_block); if (unlikely(!dio->range.n_sectors)) { if (from_map) goto offload_to_thread; @@ -1764,6 +1950,20 @@ offload_to_thread: goto journal_read_write; } + if (ic->mode == 'B' && dio->write) { + if (!block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) { + struct bitmap_block_status *bbs; + + bbs = sector_to_bitmap_block(ic, dio->range.logical_sector); + spin_lock(&bbs->bio_queue_lock); + bio_list_add(&bbs->bio_queue, bio); + spin_unlock(&bbs->bio_queue_lock); + queue_work(ic->writer_wq, &bbs->work); + return; + } + } + dio->in_flight = (atomic_t)ATOMIC_INIT(2); if (need_sync_io) { @@ -1790,10 +1990,15 @@ offload_to_thread: if (need_sync_io) { wait_for_completion_io(&read_comp); - if (unlikely(ic->recalc_wq != NULL) && - ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) && + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) && dio->range.logical_sector + dio->range.n_sectors > le64_to_cpu(ic->sb->recalc_sector)) goto skip_check; + if (ic->mode == 'B') { + if (!block_bitmap_op(ic, ic->recalc_bitmap, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_TEST_ALL_CLEAR)) + goto skip_check; + } + if (likely(!bio->bi_status)) integrity_metadata(&dio->work); else @@ -1831,8 +2036,16 @@ static void pad_uncommitted(struct dm_integrity_c *ic) wraparound_section(ic, &ic->free_section); ic->n_uncommitted_sections++; } - WARN_ON(ic->journal_sections * ic->journal_section_entries != - (ic->n_uncommitted_sections + ic->n_committed_sections) * ic->journal_section_entries + ic->free_sectors); + if (WARN_ON(ic->journal_sections * ic->journal_section_entries != + (ic->n_uncommitted_sections + ic->n_committed_sections) * + ic->journal_section_entries + ic->free_sectors)) { + DMCRIT("journal_sections %u, journal_section_entries %u, " + "n_uncommitted_sections %u, n_committed_sections %u, " + "journal_section_entries %u, free_sectors %u", + ic->journal_sections, ic->journal_section_entries, + ic->n_uncommitted_sections, ic->n_committed_sections, + ic->journal_section_entries, ic->free_sectors); + } } static void integrity_commit(struct work_struct *w) @@ -1981,8 +2194,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block; spin_lock_irq(&ic->endio_wait.lock); - if (unlikely(!add_new_range(ic, &io->range, true))) - 
wait_and_add_new_range(ic, &io->range); + add_new_range_and_wait(ic, &io->range); if (likely(!from_replay)) { struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries]; @@ -2120,11 +2332,14 @@ static void integrity_recalc(struct work_struct *w) sector_t area, offset; sector_t metadata_block; unsigned metadata_offset; + sector_t logical_sector, n_sectors; __u8 *t; unsigned i; int r; unsigned super_counter = 0; + DEBUG_print("start recalculation... (position %llx)\n", le64_to_cpu(ic->sb->recalc_sector)); + spin_lock_irq(&ic->endio_wait.lock); next_chunk: @@ -2133,21 +2348,49 @@ next_chunk: goto unlock_ret; range.logical_sector = le64_to_cpu(ic->sb->recalc_sector); - if (unlikely(range.logical_sector >= ic->provided_data_sectors)) + if (unlikely(range.logical_sector >= ic->provided_data_sectors)) { + if (ic->mode == 'B') { + DEBUG_print("queue_delayed_work: bitmap_flush_work\n"); + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0); + } goto unlock_ret; + } get_area_and_offset(ic, range.logical_sector, &area, &offset); range.n_sectors = min((sector_t)RECALC_SECTORS, ic->provided_data_sectors - range.logical_sector); if (!ic->meta_dev) - range.n_sectors = min(range.n_sectors, (1U << ic->sb->log2_interleave_sectors) - (unsigned)offset); - - if (unlikely(!add_new_range(ic, &range, true))) - wait_and_add_new_range(ic, &range); + range.n_sectors = min(range.n_sectors, ((sector_t)1U << ic->sb->log2_interleave_sectors) - (unsigned)offset); + add_new_range_and_wait(ic, &range); spin_unlock_irq(&ic->endio_wait.lock); + logical_sector = range.logical_sector; + n_sectors = range.n_sectors; + + if (ic->mode == 'B') { + if (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, n_sectors, BITMAP_OP_TEST_ALL_CLEAR)) { + goto advance_and_next; + } + while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, + ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) { + logical_sector += ic->sectors_per_block; + n_sectors -= ic->sectors_per_block; + cond_resched(); + } + while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector + n_sectors - ic->sectors_per_block, + ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) { + n_sectors -= ic->sectors_per_block; + cond_resched(); + } + get_area_and_offset(ic, logical_sector, &area, &offset); + } + + DEBUG_print("recalculating: %lx, %lx\n", logical_sector, n_sectors); if (unlikely(++super_counter == RECALC_WRITE_SUPER)) { recalc_write_super(ic); + if (ic->mode == 'B') { + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval); + } super_counter = 0; } @@ -2162,7 +2405,7 @@ next_chunk: io_req.client = ic->io; io_loc.bdev = ic->dev->bdev; io_loc.sector = get_data_sector(ic, area, offset); - io_loc.count = range.n_sectors; + io_loc.count = n_sectors; r = dm_io(&io_req, 1, &io_loc, NULL); if (unlikely(r)) { @@ -2171,8 +2414,8 @@ next_chunk: } t = ic->recalc_tags; - for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) { - integrity_sector_checksum(ic, range.logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t); + for (i = 0; i < n_sectors; i += ic->sectors_per_block) { + integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t); t += ic->tag_size; } @@ -2184,6 +2427,9 @@ next_chunk: goto err; } +advance_and_next: + cond_resched(); + spin_lock_irq(&ic->endio_wait.lock); remove_range_unlocked(ic, &range); ic->sb->recalc_sector = cpu_to_le64(range.logical_sector + range.n_sectors); @@ -2199,6 +2445,103 @@ unlock_ret: recalc_write_super(ic); } +static void 
bitmap_block_work(struct work_struct *w) +{ + struct bitmap_block_status *bbs = container_of(w, struct bitmap_block_status, work); + struct dm_integrity_c *ic = bbs->ic; + struct bio *bio; + struct bio_list bio_queue; + struct bio_list waiting; + + bio_list_init(&waiting); + + spin_lock(&bbs->bio_queue_lock); + bio_queue = bbs->bio_queue; + bio_list_init(&bbs->bio_queue); + spin_unlock(&bbs->bio_queue_lock); + + while ((bio = bio_list_pop(&bio_queue))) { + struct dm_integrity_io *dio; + + dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); + + if (block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) { + remove_range(ic, &dio->range); + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->wait_wq, &dio->work); + } else { + block_bitmap_op(ic, ic->journal, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_SET); + bio_list_add(&waiting, bio); + } + } + + if (bio_list_empty(&waiting)) + return; + + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, + bbs->idx * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), + BITMAP_BLOCK_SIZE >> SECTOR_SHIFT, NULL); + + while ((bio = bio_list_pop(&waiting))) { + struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); + + block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_SET); + + remove_range(ic, &dio->range); + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->wait_wq, &dio->work); + } + + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval); +} + +static void bitmap_flush_work(struct work_struct *work) +{ + struct dm_integrity_c *ic = container_of(work, struct dm_integrity_c, bitmap_flush_work.work); + struct dm_integrity_range range; + unsigned long limit; + struct bio *bio; + + dm_integrity_flush_buffers(ic); + + range.logical_sector = 0; + range.n_sectors = ic->provided_data_sectors; + + spin_lock_irq(&ic->endio_wait.lock); + add_new_range_and_wait(ic, &range); + spin_unlock_irq(&ic->endio_wait.lock); + + dm_integrity_flush_buffers(ic); + if (ic->meta_dev) + blkdev_issue_flush(ic->dev->bdev, GFP_NOIO, NULL); + + limit = ic->provided_data_sectors; + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { + limit = le64_to_cpu(ic->sb->recalc_sector) + >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit) + << (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + } + /*DEBUG_print("zeroing journal\n");*/ + block_bitmap_op(ic, ic->journal, 0, limit, BITMAP_OP_CLEAR); + block_bitmap_op(ic, ic->may_write_bitmap, 0, limit, BITMAP_OP_CLEAR); + + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + + spin_lock_irq(&ic->endio_wait.lock); + remove_range_unlocked(ic, &range); + while (unlikely((bio = bio_list_pop(&ic->synchronous_bios)) != NULL)) { + bio_endio(bio); + spin_unlock_irq(&ic->endio_wait.lock); + spin_lock_irq(&ic->endio_wait.lock); + } + spin_unlock_irq(&ic->endio_wait.lock); +} + + static void init_journal(struct dm_integrity_c *ic, unsigned start_section, unsigned n_sections, unsigned char commit_seq) { @@ -2395,9 +2738,37 @@ clear_journal: init_journal_node(&ic->journal_tree[i]); } +static void dm_integrity_enter_synchronous_mode(struct dm_integrity_c *ic) +{ + DEBUG_print("dm_integrity_enter_synchronous_mode\n"); + + if (ic->mode == 'B') { + ic->bitmap_flush_interval = msecs_to_jiffies(10) + 1; + ic->synchronous_mode = 1; + + 
cancel_delayed_work_sync(&ic->bitmap_flush_work); + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0); + flush_workqueue(ic->commit_wq); + } +} + +static int dm_integrity_reboot(struct notifier_block *n, unsigned long code, void *x) +{ + struct dm_integrity_c *ic = container_of(n, struct dm_integrity_c, reboot_notifier); + + DEBUG_print("dm_integrity_reboot\n"); + + dm_integrity_enter_synchronous_mode(ic); + + return NOTIFY_DONE; +} + static void dm_integrity_postsuspend(struct dm_target *ti) { struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + int r; + + WARN_ON(unregister_reboot_notifier(&ic->reboot_notifier)); del_timer_sync(&ic->autocommit_timer); @@ -2406,6 +2777,9 @@ static void dm_integrity_postsuspend(struct dm_target *ti) if (ic->recalc_wq) drain_workqueue(ic->recalc_wq); + if (ic->mode == 'B') + cancel_delayed_work_sync(&ic->bitmap_flush_work); + queue_work(ic->commit_wq, &ic->commit_work); drain_workqueue(ic->commit_wq); @@ -2416,6 +2790,18 @@ static void dm_integrity_postsuspend(struct dm_target *ti) dm_integrity_flush_buffers(ic); } + if (ic->mode == 'B') { + dm_integrity_flush_buffers(ic); +#if 1 + /* set to 0 to test bitmap replay code */ + init_journal(ic, 0, ic->journal_sections, 0); + ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP); + r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); +#endif + } + WRITE_ONCE(ic->suspending, 0); BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); @@ -2426,11 +2812,70 @@ static void dm_integrity_postsuspend(struct dm_target *ti) static void dm_integrity_resume(struct dm_target *ti) { struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + int r; + DEBUG_print("resume\n"); + + if (ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) { + DEBUG_print("resume dirty_bitmap\n"); + rw_journal_sectors(ic, REQ_OP_READ, 0, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + if (ic->mode == 'B') { + if (ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) { + block_bitmap_copy(ic, ic->recalc_bitmap, ic->journal); + block_bitmap_copy(ic, ic->may_write_bitmap, ic->journal); + if (!block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, + BITMAP_OP_TEST_ALL_CLEAR)) { + ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); + ic->sb->recalc_sector = cpu_to_le64(0); + } + } else { + DEBUG_print("non-matching blocks_per_bitmap_bit: %u, %u\n", + ic->sb->log2_blocks_per_bitmap_bit, ic->log2_blocks_per_bitmap_bit); + ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit; + block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET); + block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET); + block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_SET); + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); + ic->sb->recalc_sector = cpu_to_le64(0); + } + } else { + if (!(ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit && + block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_TEST_ALL_CLEAR))) { + ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); + ic->sb->recalc_sector = cpu_to_le64(0); + } + init_journal(ic, 0, ic->journal_sections, 0); + replay_journal(ic); + ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP); + } + r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + if 
(unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); + } else { + replay_journal(ic); + if (ic->mode == 'B') { + int mode; + ic->sb->flags |= cpu_to_le32(SB_FLAG_DIRTY_BITMAP); + ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit; + r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); + + mode = ic->recalculate_flag ? BITMAP_OP_SET : BITMAP_OP_CLEAR; + block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, mode); + block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, mode); + block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, mode); + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + } + } - replay_journal(ic); - - if (ic->recalc_wq && ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { + DEBUG_print("testing recalc: %x\n", ic->sb->flags); + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { __u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector); + DEBUG_print("recalc pos: %lx / %lx\n", (long)recalc_pos, ic->provided_data_sectors); if (recalc_pos < ic->provided_data_sectors) { queue_work(ic->recalc_wq, &ic->recalc_work); } else if (recalc_pos > ic->provided_data_sectors) { @@ -2438,6 +2883,16 @@ static void dm_integrity_resume(struct dm_target *ti) recalc_write_super(ic); } } + + ic->reboot_notifier.notifier_call = dm_integrity_reboot; + ic->reboot_notifier.next = NULL; + ic->reboot_notifier.priority = INT_MAX - 1; /* be notified after md and before hardware drivers */ + WARN_ON(register_reboot_notifier(&ic->reboot_notifier)); + +#if 0 + /* set to 1 to stress test synchronous mode */ + dm_integrity_enter_synchronous_mode(ic); +#endif } static void dm_integrity_status(struct dm_target *ti, status_type_t type, @@ -2462,10 +2917,14 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100; watermark_percentage += ic->journal_entries / 2; do_div(watermark_percentage, ic->journal_entries); - arg_count = 5; + arg_count = 3; arg_count += !!ic->meta_dev; arg_count += ic->sectors_per_block != 1; arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)); + arg_count += ic->mode == 'J'; + arg_count += ic->mode == 'J'; + arg_count += ic->mode == 'B'; + arg_count += ic->mode == 'B'; arg_count += !!ic->internal_hash_alg.alg_string; arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; @@ -2475,13 +2934,19 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" meta_device:%s", ic->meta_dev->name); if (ic->sectors_per_block != 1) DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT); - if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) + if (ic->recalculate_flag) DMEMIT(" recalculate"); DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors); - DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage); - DMEMIT(" commit_time:%u", ic->autocommit_msec); + if (ic->mode == 'J') { + DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage); + DMEMIT(" commit_time:%u", ic->autocommit_msec); + } + if (ic->mode == 'B') { + DMEMIT(" sectors_per_bit:%llu", (unsigned long long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit); + 
DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval)); + } #define EMIT_ALG(a, n) \ do { \ @@ -2562,7 +3027,7 @@ static int calculate_device_limits(struct dm_integrity_c *ic) if (last_sector < ic->start || last_sector >= ic->meta_device_sectors) return -EINVAL; } else { - __u64 meta_size = ic->provided_data_sectors * ic->tag_size; + __u64 meta_size = (ic->provided_data_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size; meta_size = (meta_size + ((1U << (ic->log2_buffer_sectors + SECTOR_SHIFT)) - 1)) >> (ic->log2_buffer_sectors + SECTOR_SHIFT); meta_size <<= ic->log2_buffer_sectors; @@ -2659,37 +3124,37 @@ static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic) blk_queue_max_integrity_segments(disk->queue, UINT_MAX); } -static void dm_integrity_free_page_list(struct dm_integrity_c *ic, struct page_list *pl) +static void dm_integrity_free_page_list(struct page_list *pl) { unsigned i; if (!pl) return; - for (i = 0; i < ic->journal_pages; i++) - if (pl[i].page) - __free_page(pl[i].page); + for (i = 0; pl[i].page; i++) + __free_page(pl[i].page); kvfree(pl); } -static struct page_list *dm_integrity_alloc_page_list(struct dm_integrity_c *ic) +static struct page_list *dm_integrity_alloc_page_list(unsigned n_pages) { - size_t page_list_desc_size = ic->journal_pages * sizeof(struct page_list); struct page_list *pl; unsigned i; - pl = kvmalloc(page_list_desc_size, GFP_KERNEL | __GFP_ZERO); + pl = kvmalloc_array(n_pages + 1, sizeof(struct page_list), GFP_KERNEL | __GFP_ZERO); if (!pl) return NULL; - for (i = 0; i < ic->journal_pages; i++) { + for (i = 0; i < n_pages; i++) { pl[i].page = alloc_page(GFP_KERNEL); if (!pl[i].page) { - dm_integrity_free_page_list(ic, pl); + dm_integrity_free_page_list(pl); return NULL; } if (i) pl[i - 1].next = &pl[i]; } + pl[i].page = NULL; + pl[i].next = NULL; return pl; } @@ -2702,7 +3167,8 @@ static void dm_integrity_free_journal_scatterlist(struct dm_integrity_c *ic, str kvfree(sl); } -static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, struct page_list *pl) +static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, + struct page_list *pl) { struct scatterlist **sl; unsigned i; @@ -2721,7 +3187,8 @@ static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_int unsigned idx; page_list_location(ic, i, 0, &start_index, &start_offset); - page_list_location(ic, i, ic->journal_section_sectors - 1, &end_index, &end_offset); + page_list_location(ic, i, ic->journal_section_sectors - 1, + &end_index, &end_offset); n_pages = (end_index - start_index + 1); @@ -2842,7 +3309,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error) } ic->journal_pages = journal_pages; - ic->journal = dm_integrity_alloc_page_list(ic); + ic->journal = dm_integrity_alloc_page_list(ic->journal_pages); if (!ic->journal) { *error = "Could not allocate memory for journal"; r = -ENOMEM; @@ -2874,7 +3341,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error) DEBUG_print("cipher %s, block size %u iv size %u\n", ic->journal_crypt_alg.alg_string, blocksize, ivsize); - ic->journal_io = dm_integrity_alloc_page_list(ic); + ic->journal_io = dm_integrity_alloc_page_list(ic->journal_pages); if (!ic->journal_io) { *error = "Could not allocate memory for journal io"; r = -ENOMEM; @@ -2898,7 +3365,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error) goto bad; } - ic->journal_xor = dm_integrity_alloc_page_list(ic); + 
ic->journal_xor = dm_integrity_alloc_page_list(ic->journal_pages); if (!ic->journal_xor) { *error = "Could not allocate memory for journal xor"; r = -ENOMEM; @@ -2922,7 +3389,8 @@ static int create_journal(struct dm_integrity_c *ic, char **error) sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids); memset(crypt_iv, 0x00, ivsize); - skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv); + skcipher_request_set_crypt(req, sg, sg, + PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv); init_completion(&comp.comp); comp.in_flight = (atomic_t)ATOMIC_INIT(1); if (do_crypt(true, req, &comp)) @@ -3063,7 +3531,7 @@ bad: * device * offset from the start of the device * tag size - * D - direct writes, J - journal writes, R - recovery mode + * D - direct writes, J - journal writes, B - bitmap mode, R - recovery mode * number of optional arguments * optional arguments: * journal_sectors @@ -3071,10 +3539,14 @@ bad: * buffer_sectors * journal_watermark * commit_time + * meta_device + * block_size + * sectors_per_bit + * bitmap_flush_interval * internal_hash * journal_crypt * journal_mac - * block_size + * recalculate */ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) { @@ -3087,10 +3559,13 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) {0, 9, "Invalid number of feature args"}, }; unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; - bool recalculate; bool should_write_sb; __u64 threshold; unsigned long long start; + __s8 log2_sectors_per_bitmap_bit = -1; + __s8 log2_blocks_per_bitmap_bit; + __u64 bits_in_journal; + __u64 n_bitmap_bits; #define DIRECT_ARGUMENTS 4 @@ -3114,6 +3589,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) init_waitqueue_head(&ic->copy_to_journal_wait); init_completion(&ic->crypto_backoff); atomic64_set(&ic->number_of_mismatches, 0); + ic->bitmap_flush_interval = BITMAP_FLUSH_INTERVAL; r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev); if (r) { @@ -3136,10 +3612,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) } } - if (!strcmp(argv[3], "J") || !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) + if (!strcmp(argv[3], "J") || !strcmp(argv[3], "B") || + !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) { ic->mode = argv[3][0]; - else { - ti->error = "Invalid mode (expecting J, D, R)"; + } else { + ti->error = "Invalid mode (expecting J, B, D, R)"; r = -EINVAL; goto bad; } @@ -3149,7 +3626,6 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) buffer_sectors = DEFAULT_BUFFER_SECTORS; journal_watermark = DEFAULT_JOURNAL_WATERMARK; sync_msec = DEFAULT_SYNC_MSEC; - recalculate = false; ic->sectors_per_block = 1; as.argc = argc - DIRECT_ARGUMENTS; @@ -3161,6 +3637,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) while (extra_args--) { const char *opt_string; unsigned val; + unsigned long long llval; opt_string = dm_shift_arg(&as); if (!opt_string) { r = -EINVAL; @@ -3182,7 +3659,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) dm_put_device(ti, ic->meta_dev); ic->meta_dev = NULL; } - r = dm_get_device(ti, strchr(opt_string, ':') + 1, dm_table_get_mode(ti->table), &ic->meta_dev); + r = dm_get_device(ti, strchr(opt_string, ':') + 1, + dm_table_get_mode(ti->table), &ic->meta_dev); if (r) { ti->error = "Device lookup failed"; goto bad; @@ -3196,6 
+3674,14 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } ic->sectors_per_block = val >> SECTOR_SHIFT; + } else if (sscanf(opt_string, "sectors_per_bit:%llu%c", &llval, &dummy) == 1) { + log2_sectors_per_bitmap_bit = !llval ? 0 : __ilog2_u64(llval); + } else if (sscanf(opt_string, "bitmap_flush_interval:%u%c", &val, &dummy) == 1) { + if (val >= (uint64_t)UINT_MAX * 1000 / HZ) { + r = -EINVAL; + ti->error = "Invalid bitmap_flush_interval argument"; + } + ic->bitmap_flush_interval = msecs_to_jiffies(val); } else if (!strncmp(opt_string, "internal_hash:", strlen("internal_hash:"))) { r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error, "Invalid internal_hash argument"); @@ -3212,7 +3698,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) if (r) goto bad; } else if (!strcmp(opt_string, "recalculate")) { - recalculate = true; + ic->recalculate_flag = true; } else { r = -EINVAL; ti->error = "Invalid argument"; @@ -3228,7 +3714,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) if (!journal_sectors) { journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS, - ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR); + ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR); } if (!buffer_sectors) @@ -3263,6 +3749,12 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) else ic->log2_tag_size = -1; + if (ic->mode == 'B' && !ic->internal_hash) { + r = -EINVAL; + ti->error = "Bitmap mode can be only used with internal hash"; + goto bad; + } + ic->autocommit_jiffies = msecs_to_jiffies(sync_msec); ic->autocommit_msec = sync_msec; timer_setup(&ic->autocommit_timer, autocommit_fn, 0); @@ -3308,7 +3800,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) } INIT_WORK(&ic->commit_work, integrity_commit); - if (ic->mode == 'J') { + if (ic->mode == 'J' || ic->mode == 'B') { ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1); if (!ic->writer_wq) { ti->error = "Cannot allocate workqueue"; @@ -3349,7 +3841,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) should_write_sb = true; } - if (!ic->sb->version || ic->sb->version > SB_VERSION_2) { + if (!ic->sb->version || ic->sb->version > SB_VERSION_3) { r = -EINVAL; ti->error = "Unknown version"; goto bad; @@ -3409,6 +3901,27 @@ try_smaller_buffer: ti->error = "The device is too small"; goto bad; } + + if (log2_sectors_per_bitmap_bit < 0) + log2_sectors_per_bitmap_bit = __fls(DEFAULT_SECTORS_PER_BITMAP_BIT); + if (log2_sectors_per_bitmap_bit < ic->sb->log2_sectors_per_block) + log2_sectors_per_bitmap_bit = ic->sb->log2_sectors_per_block; + + bits_in_journal = ((__u64)ic->journal_section_sectors * ic->journal_sections) << (SECTOR_SHIFT + 3); + if (bits_in_journal > UINT_MAX) + bits_in_journal = UINT_MAX; + while (bits_in_journal < (ic->provided_data_sectors + ((sector_t)1 << log2_sectors_per_bitmap_bit) - 1) >> log2_sectors_per_bitmap_bit) + log2_sectors_per_bitmap_bit++; + + log2_blocks_per_bitmap_bit = log2_sectors_per_bitmap_bit - ic->sb->log2_sectors_per_block; + ic->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit; + if (should_write_sb) { + ic->sb->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit; + } + n_bitmap_bits = ((ic->provided_data_sectors >> ic->sb->log2_sectors_per_block) + + (((sector_t)1 << log2_blocks_per_bitmap_bit) - 1)) >> log2_blocks_per_bitmap_bit; + ic->n_bitmap_blocks = 
@@ -3433,25 +3946,21 @@ try_smaller_buffer:
 	DEBUG_print("	journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections));
 	DEBUG_print("	journal_entries %u\n", ic->journal_entries);
 	DEBUG_print("	log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors);
-	DEBUG_print("	device_sectors 0x%llx\n", (unsigned long long)ic->device_sectors);
+	DEBUG_print("	data_device_sectors 0x%llx\n", i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT);
 	DEBUG_print("	initial_sectors 0x%x\n", ic->initial_sectors);
 	DEBUG_print("	metadata_run 0x%x\n", ic->metadata_run);
 	DEBUG_print("	log2_metadata_run %d\n", ic->log2_metadata_run);
 	DEBUG_print("	provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors,
 		    (unsigned long long)ic->provided_data_sectors);
 	DEBUG_print("	log2_buffer_sectors %u\n", ic->log2_buffer_sectors);
+	DEBUG_print("	bits_in_journal %llu\n", (unsigned long long)bits_in_journal);
 
-	if (recalculate && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) {
+	if (ic->recalculate_flag && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) {
 		ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
 		ic->sb->recalc_sector = cpu_to_le64(0);
 	}
-	if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
-		if (!ic->internal_hash) {
-			r = -EINVAL;
-			ti->error = "Recalculate is only valid with internal hash";
-			goto bad;
-		}
+	if (ic->internal_hash) {
 		ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1);
 		if (!ic->recalc_wq ) {
 			ti->error = "Cannot allocate workqueue";
@@ -3488,6 +3997,45 @@ try_smaller_buffer:
 		r = create_journal(ic, &ti->error);
 		if (r)
 			goto bad;
+
+	}
+
+	if (ic->mode == 'B') {
+		unsigned i;
+		unsigned n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE);
+
+		ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
+		if (!ic->recalc_bitmap) {
+			r = -ENOMEM;
+			goto bad;
+		}
+		ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
+		if (!ic->may_write_bitmap) {
+			r = -ENOMEM;
+			goto bad;
+		}
+		ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL);
+		if (!ic->bbs) {
+			r = -ENOMEM;
+			goto bad;
+		}
+		INIT_DELAYED_WORK(&ic->bitmap_flush_work, bitmap_flush_work);
+		for (i = 0; i < ic->n_bitmap_blocks; i++) {
+			struct bitmap_block_status *bbs = &ic->bbs[i];
+			unsigned sector, pl_index, pl_offset;
+
+			INIT_WORK(&bbs->work, bitmap_block_work);
+			bbs->ic = ic;
+			bbs->idx = i;
+			bio_list_init(&bbs->bio_queue);
+			spin_lock_init(&bbs->bio_queue_lock);
+
+			sector = i * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT);
+			pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
+			pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
+
+			bbs->bitmap = lowmem_page_address(ic->journal[pl_index].page) + pl_offset;
+		}
 	}
 
 	if (should_write_sb) {
@@ -3512,6 +4060,17 @@ try_smaller_buffer:
 		if (r)
 			goto bad;
 	}
+	if (ic->mode == 'B') {
+		unsigned max_io_len = ((sector_t)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit) * (BITMAP_BLOCK_SIZE * 8);
+		if (!max_io_len)
+			max_io_len = 1U << 31;
+		DEBUG_print("max_io_len: old %u, new %u\n", ti->max_io_len, max_io_len);
+		if (!ti->max_io_len || ti->max_io_len > max_io_len) {
+			r = dm_set_target_max_io_len(ti, max_io_len);
+			if (r)
+				goto bad;
+		}
+	}
 
 	if (!ic->internal_hash)
 		dm_integrity_set(ti, ic);
@@ -3520,6 +4079,7 @@ try_smaller_buffer:
 	ti->flush_supported = true;
 
 	return 0;
+
 bad:
 	dm_integrity_dtr(ti);
 	return r;
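With the new 'B' mode wired into the constructor, a bitmap-mode table line would look roughly like the following. This is an illustration only: the device name, target length, tag size and option values are made up, but the positional layout (device, offset, tag size, mode, number of optional args, options) follows the parameter comment above, and mode 'B' must be combined with internal_hash per the check added in the constructor.

	# hypothetical example; sizes and option values are placeholders
	dmsetup create integ --table \
	  "0 1953125 integrity /dev/sdb 0 4 B 3 internal_hash:crc32c sectors_per_bit:131072 bitmap_flush_interval:10000"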
@@ -3542,10 +4102,9 @@ static void dm_integrity_dtr(struct dm_target *ti)
 		destroy_workqueue(ic->writer_wq);
 	if (ic->recalc_wq)
 		destroy_workqueue(ic->recalc_wq);
-	if (ic->recalc_buffer)
-		vfree(ic->recalc_buffer);
-	if (ic->recalc_tags)
-		kvfree(ic->recalc_tags);
+	vfree(ic->recalc_buffer);
+	kvfree(ic->recalc_tags);
+	kvfree(ic->bbs);
 	if (ic->bufio)
 		dm_bufio_client_destroy(ic->bufio);
 	mempool_exit(&ic->journal_io_mempool);
@@ -3555,9 +4114,11 @@ static void dm_integrity_dtr(struct dm_target *ti)
 		dm_put_device(ti, ic->dev);
 	if (ic->meta_dev)
 		dm_put_device(ti, ic->meta_dev);
-	dm_integrity_free_page_list(ic, ic->journal);
-	dm_integrity_free_page_list(ic, ic->journal_io);
-	dm_integrity_free_page_list(ic, ic->journal_xor);
+	dm_integrity_free_page_list(ic->journal);
+	dm_integrity_free_page_list(ic->journal_io);
+	dm_integrity_free_page_list(ic->journal_xor);
+	dm_integrity_free_page_list(ic->recalc_bitmap);
+	dm_integrity_free_page_list(ic->may_write_bitmap);
 	if (ic->journal_scatterlist)
 		dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist);
 	if (ic->journal_io_scatterlist)
@@ -3595,7 +4156,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
 
 static struct target_type integrity_target = {
 	.name			= "integrity",
-	.version		= {1, 2, 0},
+	.version		= {1, 3, 0},
 	.module			= THIS_MODULE,
 	.features		= DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
 	.ctr			= dm_integrity_ctr,
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index c740153b4e52..1e03bc89e20f 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -2069,7 +2069,7 @@ int __init dm_early_create(struct dm_ioctl *dmi,
 	/* alloc table */
 	r = dm_table_create(&t, get_mode(dmi), dmi->target_count, md);
 	if (r)
-		goto err_destroy_dm;
+		goto err_hash_remove;
 
 	/* add targets */
 	for (i = 0; i < dmi->target_count; i++) {
@@ -2116,6 +2116,10 @@ int __init dm_early_create(struct dm_ioctl *dmi,
 
 err_destroy_table:
 	dm_table_destroy(t);
+err_hash_remove:
+	(void) __hash_remove(__get_name_cell(dmi->name));
+	/* release reference from __get_name_cell */
+	dm_put(md);
 err_destroy_dm:
 	dm_put(md);
 	dm_destroy(md);
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 2ee5e357a0a7..dbcc1e41cd57 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -544,8 +544,23 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
 	return DM_MAPIO_REMAPPED;
 }
 
-static void multipath_release_clone(struct request *clone)
+static void multipath_release_clone(struct request *clone,
+				    union map_info *map_context)
 {
+	if (unlikely(map_context)) {
+		/*
+		 * non-NULL map_context means caller is still map
+		 * method; must undo multipath_clone_and_map()
+		 */
+		struct dm_mpath_io *mpio = get_mpio(map_context);
+		struct pgpath *pgpath = mpio->pgpath;
+
+		if (pgpath && pgpath->pg->ps.type->end_io)
+			pgpath->pg->ps.type->end_io(&pgpath->pg->ps,
+						    &pgpath->path,
+						    mpio->nr_bytes);
+	}
+
 	blk_put_request(clone);
 }
 
@@ -882,6 +897,7 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
 	if (attached_handler_name || m->hw_handler_name) {
 		INIT_DELAYED_WORK(&p->activate_path, activate_path_work);
 		r = setup_scsi_dh(p->path.dev->bdev, m, &attached_handler_name, &ti->error);
+		kfree(attached_handler_name);
 		if (r) {
 			dm_put_device(ti, p->path.dev);
 			goto bad;
@@ -896,7 +912,6 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
 	return p;
 
 bad:
-	kfree(attached_handler_name);
 	free_pgpath(p);
 	return ERR_PTR(r);
 }
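The extra map_context argument lets multipath tell the two release cases apart: when the clone is being torn down from inside the map path, the path selector's per-path accounting still has to be unwound before the request is freed. A minimal sketch of that contract for a hypothetical request-based target (example_undo_accounting is a placeholder, not a real dm helper):

	/* sketch of the updated ->release_clone_rq() contract */
	static void example_release_clone_rq(struct request *clone,
					     union map_info *map_context)
	{
		if (map_context) {
			/* caller is still inside ->clone_and_map_rq(): unwind it */
			example_undo_accounting(map_context);
		}
		/* NULL map_context: clone already completed, just free it */
		blk_put_request(clone);
	}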
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index b66745bd08bb..5f7063f05ae0 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -168,7 +168,7 @@ static void dm_end_request(struct request *clone, blk_status_t error)
 	struct request *rq = tio->orig;
 
 	blk_rq_unprep_clone(clone);
-	tio->ti->type->release_clone_rq(clone);
+	tio->ti->type->release_clone_rq(clone, NULL);
 
 	rq_end_stats(md, rq);
 	blk_mq_end_request(rq, error);
@@ -201,7 +201,7 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_
 	rq_end_stats(md, rq);
 	if (tio->clone) {
 		blk_rq_unprep_clone(tio->clone);
-		tio->ti->type->release_clone_rq(tio->clone);
+		tio->ti->type->release_clone_rq(tio->clone, NULL);
 	}
 
 	dm_mq_delay_requeue_request(rq, delay_ms);
@@ -398,7 +398,7 @@ static int map_request(struct dm_rq_target_io *tio)
 	case DM_MAPIO_REMAPPED:
 		if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
 			/* -ENOMEM */
-			ti->type->release_clone_rq(clone);
+			ti->type->release_clone_rq(clone, &tio->info);
 			return DM_MAPIO_REQUEUE;
 		}
 
@@ -408,7 +408,7 @@ static int map_request(struct dm_rq_target_io *tio)
 		ret = dm_dispatch_clone_request(clone, rq);
 		if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
 			blk_rq_unprep_clone(clone);
-			tio->ti->type->release_clone_rq(clone);
+			tio->ti->type->release_clone_rq(clone, &tio->info);
 			tio->clone = NULL;
 			return DM_MAPIO_REQUEUE;
 		}
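On the core side the rule is mechanical: completion and requeue paths pass NULL because the map has already been consumed, while the two failure paths inside map_request() pass &tio->info because the target's map has not been unwound yet. Condensed to the two call shapes used above:

	/* completion / post-dispatch requeue: nothing left to unwind */
	tio->ti->type->release_clone_rq(clone, NULL);

	/* failure while still mapping: hand the target its map_info back */
	ti->type->release_clone_rq(clone, &tio->info);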
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index a168963b757d..3107f2b1988b 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -13,6 +13,7 @@
 #include <linux/init.h>
 #include <linux/kdev_t.h>
 #include <linux/list.h>
+#include <linux/list_bl.h>
 #include <linux/mempool.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -44,11 +45,11 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
 struct dm_exception_table {
 	uint32_t hash_mask;
 	unsigned hash_shift;
-	struct list_head *table;
+	struct hlist_bl_head *table;
 };
 
 struct dm_snapshot {
-	struct mutex lock;
+	struct rw_semaphore lock;
 
 	struct dm_dev *origin;
 	struct dm_dev *cow;
@@ -76,7 +77,9 @@ struct dm_snapshot {
 
 	atomic_t pending_exceptions_count;
 
-	/* Protected by "lock" */
+	spinlock_t pe_allocation_lock;
+
+	/* Protected by "pe_allocation_lock" */
 	sector_t exception_start_sequence;
 
 	/* Protected by kcopyd single-threaded callback */
@@ -457,9 +460,9 @@ static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
 		if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
 			continue;
 
-		mutex_lock(&s->lock);
+		down_read(&s->lock);
 		active = s->active;
-		mutex_unlock(&s->lock);
+		up_read(&s->lock);
 
 		if (active) {
 			if (snap_src)
@@ -618,6 +621,36 @@ static void unregister_snapshot(struct dm_snapshot *s)
  * The lowest hash_shift bits of the chunk number are ignored, allowing
  * some consecutive chunks to be grouped together.
  */
+static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk);
+
+/* Lock to protect access to the completed and pending exception hash tables. */
+struct dm_exception_table_lock {
+	struct hlist_bl_head *complete_slot;
+	struct hlist_bl_head *pending_slot;
+};
+
+static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk,
+					 struct dm_exception_table_lock *lock)
+{
+	struct dm_exception_table *complete = &s->complete;
+	struct dm_exception_table *pending = &s->pending;
+
+	lock->complete_slot = &complete->table[exception_hash(complete, chunk)];
+	lock->pending_slot = &pending->table[exception_hash(pending, chunk)];
+}
+
+static void dm_exception_table_lock(struct dm_exception_table_lock *lock)
+{
+	hlist_bl_lock(lock->complete_slot);
+	hlist_bl_lock(lock->pending_slot);
+}
+
+static void dm_exception_table_unlock(struct dm_exception_table_lock *lock)
+{
+	hlist_bl_unlock(lock->pending_slot);
+	hlist_bl_unlock(lock->complete_slot);
+}
+
 static int dm_exception_table_init(struct dm_exception_table *et,
 				   uint32_t size, unsigned hash_shift)
 {
@@ -625,12 +658,12 @@ static int dm_exception_table_init(struct dm_exception_table *et,
 	et->hash_shift = hash_shift;
 	et->hash_mask = size - 1;
-	et->table = dm_vcalloc(size, sizeof(struct list_head));
+	et->table = dm_vcalloc(size, sizeof(struct hlist_bl_head));
 	if (!et->table)
 		return -ENOMEM;
 
 	for (i = 0; i < size; i++)
-		INIT_LIST_HEAD(et->table + i);
+		INIT_HLIST_BL_HEAD(et->table + i);
 
 	return 0;
 }
@@ -638,15 +671,16 @@ static int dm_exception_table_init(struct dm_exception_table *et,
 static void dm_exception_table_exit(struct dm_exception_table *et,
 				    struct kmem_cache *mem)
 {
-	struct list_head *slot;
-	struct dm_exception *ex, *next;
+	struct hlist_bl_head *slot;
+	struct dm_exception *ex;
+	struct hlist_bl_node *pos, *n;
 	int i, size;
 
 	size = et->hash_mask + 1;
 	for (i = 0; i < size; i++) {
 		slot = et->table + i;
 
-		list_for_each_entry_safe (ex, next, slot, hash_list)
+		hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list)
 			kmem_cache_free(mem, ex);
 	}
 
@@ -660,7 +694,7 @@ static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
 
 static void dm_remove_exception(struct dm_exception *e)
 {
-	list_del(&e->hash_list);
+	hlist_bl_del(&e->hash_list);
 }
 
 /*
@@ -670,11 +704,12 @@ static void dm_remove_exception(struct dm_exception *e)
 static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
 						chunk_t chunk)
 {
-	struct list_head *slot;
+	struct hlist_bl_head *slot;
+	struct hlist_bl_node *pos;
 	struct dm_exception *e;
 
 	slot = &et->table[exception_hash(et, chunk)];
-	list_for_each_entry (e, slot, hash_list)
+	hlist_bl_for_each_entry(e, pos, slot, hash_list)
 		if (chunk >= e->old_chunk &&
 		    chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
 			return e;
@@ -721,7 +756,8 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
 static void dm_insert_exception(struct dm_exception_table *eh,
 				struct dm_exception *new_e)
 {
-	struct list_head *l;
+	struct hlist_bl_head *l;
+	struct hlist_bl_node *pos;
 	struct dm_exception *e = NULL;
 
 	l = &eh->table[exception_hash(eh, new_e->old_chunk)];
@@ -731,7 +767,7 @@ static void dm_insert_exception(struct dm_exception_table *eh,
 		goto out;
 
 	/* List is ordered by old_chunk */
-	list_for_each_entry_reverse(e, l, hash_list) {
+	hlist_bl_for_each_entry(e, pos, l, hash_list) {
 		/* Insert after an existing chunk? */
 		if (new_e->old_chunk == (e->old_chunk +
 					 dm_consecutive_chunk_count(e) + 1) &&
@@ -752,12 +788,24 @@ static void dm_insert_exception(struct dm_exception_table *eh,
 			return;
 		}
 
-		if (new_e->old_chunk > e->old_chunk)
+		if (new_e->old_chunk < e->old_chunk)
 			break;
 	}
 
 out:
-	list_add(&new_e->hash_list, e ? &e->hash_list : l);
+	if (!e) {
+		/*
+		 * Either the table doesn't support consecutive chunks or slot
+		 * l is empty.
+		 */
+		hlist_bl_add_head(&new_e->hash_list, l);
+	} else if (new_e->old_chunk < e->old_chunk) {
+		/* Add before an existing exception */
+		hlist_bl_add_before(&new_e->hash_list, &e->hash_list);
+	} else {
+		/* Add to l's tail: e is the last exception in this slot */
+		hlist_bl_add_behind(&new_e->hash_list, &e->hash_list);
+	}
 }
 
 /*
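The switch from list_head to hlist_bl is what makes fine-grained locking possible here: hlist_bl stores a bit spinlock in the lowest bit of each bucket head, so the lock lives in the slot itself and protection becomes per-bucket rather than per-snapshot. A minimal standalone use of the same API (the struct and field names here are illustrative only):

	#include <linux/list_bl.h>

	struct item {
		struct hlist_bl_node node;
		unsigned long key;
	};

	static void add_item(struct hlist_bl_head *bucket, struct item *it)
	{
		hlist_bl_lock(bucket);		/* takes the bit spinlock in bit 0 */
		hlist_bl_add_head(&it->node, bucket);
		hlist_bl_unlock(bucket);
	}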
@@ -766,6 +814,7 @@ out:
  */
 static int dm_add_exception(void *context, chunk_t old, chunk_t new)
 {
+	struct dm_exception_table_lock lock;
 	struct dm_snapshot *s = context;
 	struct dm_exception *e;
 
@@ -778,7 +827,17 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
 	/* Consecutive_count is implicitly initialised to zero */
 	e->new_chunk = new;
 
+	/*
+	 * Although there is no need to lock access to the exception tables
+	 * here, if we don't then hlist_bl_add_head(), called by
+	 * dm_insert_exception(), will complain about accessing the
+	 * corresponding list without locking it first.
+	 */
+	dm_exception_table_lock_init(s, old, &lock);
+
+	dm_exception_table_lock(&lock);
 	dm_insert_exception(&s->complete, e);
+	dm_exception_table_unlock(&lock);
 
 	return 0;
 }
@@ -807,7 +866,7 @@ static int calc_max_buckets(void)
 	/* use a fixed size of 2MB */
 	unsigned long mem = 2 * 1024 * 1024;
-	mem /= sizeof(struct list_head);
+	mem /= sizeof(struct hlist_bl_head);
 
 	return mem;
 }
@@ -927,7 +986,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s)
 	int r;
 	chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
 
-	mutex_lock(&s->lock);
+	down_write(&s->lock);
 
 	/*
 	 * Process chunks (and associated exceptions) in reverse order
@@ -942,7 +1001,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s)
 	b = __release_queued_bios_after_merge(s);
 out:
-	mutex_unlock(&s->lock);
+	up_write(&s->lock);
 	if (b)
 		flush_bios(b);
 
@@ -1001,9 +1060,9 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
 		if (linear_chunks < 0) {
 			DMERR("Read error in exception store: "
 			      "shutting down merge");
-			mutex_lock(&s->lock);
+			down_write(&s->lock);
 			s->merge_failed = 1;
-			mutex_unlock(&s->lock);
+			up_write(&s->lock);
 		}
 		goto shut;
 	}
@@ -1044,10 +1103,10 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
 		previous_count = read_pending_exceptions_done_count();
 	}
 
-	mutex_lock(&s->lock);
+	down_write(&s->lock);
 	s->first_merging_chunk = old_chunk;
 	s->num_merging_chunks = linear_chunks;
-	mutex_unlock(&s->lock);
+	up_write(&s->lock);
 
 	/* Wait until writes to all 'linear_chunks' drain */
 	for (i = 0; i < linear_chunks; i++)
@@ -1089,10 +1148,10 @@ static void merge_callback(int read_err, unsigned long write_err, void *context)
 	return;
 
 shut:
-	mutex_lock(&s->lock);
+	down_write(&s->lock);
 	s->merge_failed = 1;
 	b = __release_queued_bios_after_merge(s);
-	mutex_unlock(&s->lock);
+	up_write(&s->lock);
 	error_bios(b);
 
 	merge_shutdown(s);
@@ -1188,10 +1247,11 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	s->snapshot_overflowed = 0;
 	s->active = 0;
 	atomic_set(&s->pending_exceptions_count, 0);
+	spin_lock_init(&s->pe_allocation_lock);
 	s->exception_start_sequence = 0;
 	s->exception_complete_sequence = 0;
 	s->out_of_order_tree = RB_ROOT;
-	mutex_init(&s->lock);
+	init_rwsem(&s->lock);
 	INIT_LIST_HEAD(&s->list);
 	spin_lock_init(&s->pe_lock);
 	s->state_bits = 0;
@@ -1357,9 +1417,9 @@ static void snapshot_dtr(struct dm_target *ti)
 	/* Check whether exception handover must be cancelled */
 	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
 	if (snap_src && snap_dest && (s == snap_src)) {
-		mutex_lock(&snap_dest->lock);
+		down_write(&snap_dest->lock);
 		snap_dest->valid = 0;
-		mutex_unlock(&snap_dest->lock);
+		up_write(&snap_dest->lock);
 		DMERR("Cancelling snapshot handover.");
 	}
 	up_read(&_origins_lock);
@@ -1390,8 +1450,6 @@ static void snapshot_dtr(struct dm_target *ti)
 
 	dm_exception_store_destroy(s->store);
 
-	mutex_destroy(&s->lock);
-
 	dm_put_device(ti, s->cow);
 	dm_put_device(ti, s->origin);
 
@@ -1467,6 +1525,13 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err)
 	dm_table_event(s->ti->table);
 }
 
+static void invalidate_snapshot(struct dm_snapshot *s, int err)
+{
+	down_write(&s->lock);
+	__invalidate_snapshot(s, err);
+	up_write(&s->lock);
+}
+
 static void pending_complete(void *context, int success)
 {
 	struct dm_snap_pending_exception *pe = context;
@@ -1475,43 +1540,63 @@ static void pending_complete(void *context, int success)
 	struct bio *origin_bios = NULL;
 	struct bio *snapshot_bios = NULL;
 	struct bio *full_bio = NULL;
+	struct dm_exception_table_lock lock;
 	int error = 0;
 
+	dm_exception_table_lock_init(s, pe->e.old_chunk, &lock);
+
 	if (!success) {
 		/* Read/write error - snapshot is unusable */
-		mutex_lock(&s->lock);
-		__invalidate_snapshot(s, -EIO);
+		invalidate_snapshot(s, -EIO);
 		error = 1;
+
+		dm_exception_table_lock(&lock);
 		goto out;
 	}
 
 	e = alloc_completed_exception(GFP_NOIO);
 	if (!e) {
-		mutex_lock(&s->lock);
-		__invalidate_snapshot(s, -ENOMEM);
+		invalidate_snapshot(s, -ENOMEM);
 		error = 1;
+
+		dm_exception_table_lock(&lock);
 		goto out;
 	}
 	*e = pe->e;
 
-	mutex_lock(&s->lock);
+	down_read(&s->lock);
+	dm_exception_table_lock(&lock);
 	if (!s->valid) {
+		up_read(&s->lock);
 		free_completed_exception(e);
 		error = 1;
+
 		goto out;
 	}
 
-	/* Check for conflicting reads */
-	__check_for_conflicting_io(s, pe->e.old_chunk);
-
 	/*
-	 * Add a proper exception, and remove the
-	 * in-flight exception from the list.
+	 * Add a proper exception. After inserting the completed exception all
+	 * subsequent snapshot reads to this chunk will be redirected to the
+	 * COW device. This ensures that we do not starve. Moreover, as long
+	 * as the pending exception exists, neither origin writes nor snapshot
+	 * merging can overwrite the chunk in origin.
	 */
 	dm_insert_exception(&s->complete, e);
+	up_read(&s->lock);
+
+	/* Wait for conflicting reads to drain */
+	if (__chunk_is_tracked(s, pe->e.old_chunk)) {
+		dm_exception_table_unlock(&lock);
+		__check_for_conflicting_io(s, pe->e.old_chunk);
+		dm_exception_table_lock(&lock);
+	}
 
 out:
+	/* Remove the in-flight exception from the list */
 	dm_remove_exception(&pe->e);
+
+	dm_exception_table_unlock(&lock);
+
 	snapshot_bios = bio_list_get(&pe->snapshot_bios);
 	origin_bios = bio_list_get(&pe->origin_bios);
 	full_bio = pe->full_bio;
@@ -1519,8 +1604,6 @@ out:
 		full_bio->bi_end_io = pe->full_bio_end_io;
 	increment_pending_exceptions_done_count();
 
-	mutex_unlock(&s->lock);
-
 	/* Submit any pending write bios */
 	if (error) {
 		if (full_bio)
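The key reordering in pending_complete() is that the completed exception is now inserted before conflicting reads are drained: once it is in the table, new reads to the chunk go to the COW device, so the drain cannot starve. A condensed view of the resulting ordering, as a sketch rather than verbatim code:

	/*
	 * pending_complete(), reduced to its locking/ordering skeleton:
	 *
	 *   down_read(&s->lock);
	 *   dm_exception_table_lock(&lock);
	 *   dm_insert_exception(&s->complete, e);   new reads now hit the COW
	 *   up_read(&s->lock);
	 *
	 *   if (__chunk_is_tracked(s, chunk)) {     old reads still in flight
	 *           dm_exception_table_unlock(&lock);
	 *           __check_for_conflicting_io(s, chunk);
	 *           dm_exception_table_lock(&lock);
	 *   }
	 *
	 *   dm_remove_exception(&pe->e);            drop the pending entry last
	 *   dm_exception_table_unlock(&lock);
	 */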
@@ -1660,43 +1743,59 @@ __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
 }
 
 /*
- * Looks to see if this snapshot already has a pending exception
- * for this chunk, otherwise it allocates a new one and inserts
- * it into the pending table.
+ * Inserts a pending exception into the pending table.
  *
- * NOTE: a write lock must be held on snap->lock before calling
- * this.
+ * NOTE: a write lock must be held on the chunk's pending exception table slot
+ * before calling this.
  */
 static struct dm_snap_pending_exception *
-__find_pending_exception(struct dm_snapshot *s,
-			 struct dm_snap_pending_exception *pe, chunk_t chunk)
+__insert_pending_exception(struct dm_snapshot *s,
+			   struct dm_snap_pending_exception *pe, chunk_t chunk)
 {
-	struct dm_snap_pending_exception *pe2;
-
-	pe2 = __lookup_pending_exception(s, chunk);
-	if (pe2) {
-		free_pending_exception(pe);
-		return pe2;
-	}
-
 	pe->e.old_chunk = chunk;
 	bio_list_init(&pe->origin_bios);
 	bio_list_init(&pe->snapshot_bios);
 	pe->started = 0;
 	pe->full_bio = NULL;
 
+	spin_lock(&s->pe_allocation_lock);
 	if (s->store->type->prepare_exception(s->store, &pe->e)) {
+		spin_unlock(&s->pe_allocation_lock);
 		free_pending_exception(pe);
 		return NULL;
 	}
 
 	pe->exception_sequence = s->exception_start_sequence++;
+	spin_unlock(&s->pe_allocation_lock);
 
 	dm_insert_exception(&s->pending, &pe->e);
 
 	return pe;
 }
 
+/*
+ * Looks to see if this snapshot already has a pending exception
+ * for this chunk, otherwise it allocates a new one and inserts
+ * it into the pending table.
+ *
+ * NOTE: a write lock must be held on the chunk's pending exception table slot
+ * before calling this.
+ */
+static struct dm_snap_pending_exception *
+__find_pending_exception(struct dm_snapshot *s,
+			 struct dm_snap_pending_exception *pe, chunk_t chunk)
+{
+	struct dm_snap_pending_exception *pe2;
+
+	pe2 = __lookup_pending_exception(s, chunk);
+	if (pe2) {
+		free_pending_exception(pe);
+		return pe2;
+	}
+
+	return __insert_pending_exception(s, pe, chunk);
+}
+
 static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
 			    struct bio *bio, chunk_t chunk)
 {
@@ -1714,6 +1813,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
 	int r = DM_MAPIO_REMAPPED;
 	chunk_t chunk;
 	struct dm_snap_pending_exception *pe = NULL;
+	struct dm_exception_table_lock lock;
 
 	init_tracked_chunk(bio);
 
@@ -1723,13 +1823,15 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
 	}
 
 	chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
+	dm_exception_table_lock_init(s, chunk, &lock);
 
 	/* Full snapshots are not usable */
 	/* To get here the table must be live so s->active is always set. */
 	if (!s->valid)
 		return DM_MAPIO_KILL;
 
-	mutex_lock(&s->lock);
+	down_read(&s->lock);
+	dm_exception_table_lock(&lock);
 
 	if (!s->valid || (unlikely(s->snapshot_overflowed) &&
 	    bio_data_dir(bio) == WRITE)) {
@@ -1752,15 +1854,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
 	if (bio_data_dir(bio) == WRITE) {
 		pe = __lookup_pending_exception(s, chunk);
 		if (!pe) {
-			mutex_unlock(&s->lock);
+			dm_exception_table_unlock(&lock);
 			pe = alloc_pending_exception(s);
-			mutex_lock(&s->lock);
-
-			if (!s->valid || s->snapshot_overflowed) {
-				free_pending_exception(pe);
-				r = DM_MAPIO_KILL;
-				goto out_unlock;
-			}
+			dm_exception_table_lock(&lock);
 
 			e = dm_lookup_exception(&s->complete, chunk);
 			if (e) {
@@ -1771,13 +1867,22 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
 
 			pe = __find_pending_exception(s, pe, chunk);
 			if (!pe) {
+				dm_exception_table_unlock(&lock);
+				up_read(&s->lock);
+
+				down_write(&s->lock);
+
 				if (s->store->userspace_supports_overflow) {
-					s->snapshot_overflowed = 1;
-					DMERR("Snapshot overflowed: Unable to allocate exception.");
+					if (s->valid && !s->snapshot_overflowed) {
+						s->snapshot_overflowed = 1;
+						DMERR("Snapshot overflowed: Unable to allocate exception.");
+					}
 				} else
 					__invalidate_snapshot(s, -ENOMEM);
+				up_write(&s->lock);
+
 				r = DM_MAPIO_KILL;
-				goto out_unlock;
+				goto out;
 			}
 		}
 
@@ -1789,7 +1894,10 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
 		    bio->bi_iter.bi_size ==
 		    (s->store->chunk_size << SECTOR_SHIFT)) {
 			pe->started = 1;
-			mutex_unlock(&s->lock);
+
+			dm_exception_table_unlock(&lock);
+			up_read(&s->lock);
+
 			start_full_bio(pe, bio);
 			goto out;
 		}
@@ -1797,9 +1905,12 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
 		bio_list_add(&pe->snapshot_bios, bio);
 
 		if (!pe->started) {
-			/* this is protected by snap->lock */
+			/* this is protected by the exception table lock */
 			pe->started = 1;
-			mutex_unlock(&s->lock);
+
+			dm_exception_table_unlock(&lock);
+			up_read(&s->lock);
+
 			start_copy(pe);
 			goto out;
 		}
@@ -1809,7 +1920,8 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
 	}
 
 out_unlock:
-	mutex_unlock(&s->lock);
+	dm_exception_table_unlock(&lock);
+	up_read(&s->lock);
 out:
 	return r;
 }
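Note the lock-escalation pattern snapshot_map() now uses: the fast path holds s->lock only for read plus the per-slot bit lock, and only the allocation-failure path (which mutates snapshot-wide state) drops both and retakes the semaphore for write, rechecking the flags it is about to set. Condensed, with the surrounding details elided:

	/* condensed from snapshot_map() above; not verbatim */
	down_read(&s->lock);
	dm_exception_table_lock(&lock);
	/* ... per-chunk work under the slot lock ... */

	/* exception store exhausted: escalate to the write lock */
	dm_exception_table_unlock(&lock);
	up_read(&s->lock);

	down_write(&s->lock);
	if (s->valid && !s->snapshot_overflowed)	/* recheck after reacquire */
		s->snapshot_overflowed = 1;
	up_write(&s->lock);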
@@ -1845,7 +1957,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
 
 	chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
 
-	mutex_lock(&s->lock);
+	down_write(&s->lock);
 
 	/* Full merging snapshots are redirected to the origin */
 	if (!s->valid)
@@ -1876,12 +1988,12 @@ redirect_to_origin:
 	bio_set_dev(bio, s->origin->bdev);
 
 	if (bio_data_dir(bio) == WRITE) {
-		mutex_unlock(&s->lock);
+		up_write(&s->lock);
 		return do_origin(s->origin, bio);
 	}
 
out_unlock:
-	mutex_unlock(&s->lock);
+	up_write(&s->lock);
 
 	return r;
 }
@@ -1913,7 +2025,7 @@ static int snapshot_preresume(struct dm_target *ti)
 	down_read(&_origins_lock);
 	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
 	if (snap_src && snap_dest) {
-		mutex_lock(&snap_src->lock);
+		down_read(&snap_src->lock);
 		if (s == snap_src) {
 			DMERR("Unable to resume snapshot source until "
 			      "handover completes.");
@@ -1923,7 +2035,7 @@ static int snapshot_preresume(struct dm_target *ti)
 			      "source is suspended.");
 			r = -EINVAL;
 		}
-		mutex_unlock(&snap_src->lock);
+		up_read(&snap_src->lock);
 	}
 	up_read(&_origins_lock);
 
@@ -1969,11 +2081,11 @@ static void snapshot_resume(struct dm_target *ti)
 	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
 	if (snap_src && snap_dest) {
-		mutex_lock(&snap_src->lock);
-		mutex_lock_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
+		down_write(&snap_src->lock);
+		down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
 		__handover_exceptions(snap_src, snap_dest);
-		mutex_unlock(&snap_dest->lock);
-		mutex_unlock(&snap_src->lock);
+		up_write(&snap_dest->lock);
+		up_write(&snap_src->lock);
 	}
 	up_read(&_origins_lock);
 
@@ -1988,9 +2100,9 @@ static void snapshot_resume(struct dm_target *ti)
 	/* Now we have correct chunk size, reregister */
 	reregister_snapshot(s);
 
-	mutex_lock(&s->lock);
+	down_write(&s->lock);
 	s->active = 1;
-	mutex_unlock(&s->lock);
+	up_write(&s->lock);
 }
 
 static uint32_t get_origin_minimum_chunksize(struct block_device *bdev)
@@ -2030,7 +2142,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type,
 
 	switch (type) {
 	case STATUSTYPE_INFO:
 
-		mutex_lock(&snap->lock);
+		down_write(&snap->lock);
 
 		if (!snap->valid)
 			DMEMIT("Invalid");
@@ -2055,7 +2167,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type,
 				DMEMIT("Unknown");
 		}
 
-		mutex_unlock(&snap->lock);
+		up_write(&snap->lock);
 
 		break;
 
@@ -2107,9 +2219,10 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
 	int r = DM_MAPIO_REMAPPED;
 	struct dm_snapshot *snap;
 	struct dm_exception *e;
-	struct dm_snap_pending_exception *pe;
+	struct dm_snap_pending_exception *pe, *pe2;
 	struct dm_snap_pending_exception *pe_to_start_now = NULL;
 	struct dm_snap_pending_exception *pe_to_start_last = NULL;
+	struct dm_exception_table_lock lock;
 	chunk_t chunk;
 
 	/* Do all the snapshots on this origin */
@@ -2121,52 +2234,59 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
 		if (dm_target_is_snapshot_merge(snap->ti))
 			continue;
 
-		mutex_lock(&snap->lock);
-
-		/* Only deal with valid and active snapshots */
-		if (!snap->valid || !snap->active)
-			goto next_snapshot;
-
 		/* Nothing to do if writing beyond end of snapshot */
 		if (sector >= dm_table_get_size(snap->ti->table))
-			goto next_snapshot;
+			continue;
 
 		/*
 		 * Remember, different snapshots can have
 		 * different chunk sizes.
 		 */
 		chunk = sector_to_chunk(snap->store, sector);
+		dm_exception_table_lock_init(snap, chunk, &lock);
 
-		/*
-		 * Check exception table to see if block
-		 * is already remapped in this snapshot
-		 * and trigger an exception if not.
-		 */
-		e = dm_lookup_exception(&snap->complete, chunk);
-		if (e)
+		down_read(&snap->lock);
+		dm_exception_table_lock(&lock);
+
+		/* Only deal with valid and active snapshots */
+		if (!snap->valid || !snap->active)
 			goto next_snapshot;
 
 		pe = __lookup_pending_exception(snap, chunk);
 		if (!pe) {
-			mutex_unlock(&snap->lock);
-			pe = alloc_pending_exception(snap);
-			mutex_lock(&snap->lock);
-
-			if (!snap->valid) {
-				free_pending_exception(pe);
-				goto next_snapshot;
-			}
-
+			/*
+			 * Check exception table to see if block is already
+			 * remapped in this snapshot and trigger an exception
+			 * if not.
+			 */
 			e = dm_lookup_exception(&snap->complete, chunk);
-			if (e) {
-				free_pending_exception(pe);
+			if (e)
 				goto next_snapshot;
-			}
 
-			pe = __find_pending_exception(snap, pe, chunk);
-			if (!pe) {
-				__invalidate_snapshot(snap, -ENOMEM);
-				goto next_snapshot;
+			dm_exception_table_unlock(&lock);
+			pe = alloc_pending_exception(snap);
+			dm_exception_table_lock(&lock);
+
+			pe2 = __lookup_pending_exception(snap, chunk);
+
+			if (!pe2) {
+				e = dm_lookup_exception(&snap->complete, chunk);
+				if (e) {
+					free_pending_exception(pe);
+					goto next_snapshot;
+				}
+
+				pe = __insert_pending_exception(snap, pe, chunk);
+				if (!pe) {
+					dm_exception_table_unlock(&lock);
+					up_read(&snap->lock);
+
+					invalidate_snapshot(snap, -ENOMEM);
+					continue;
+				}
+			} else {
+				free_pending_exception(pe);
+				pe = pe2;
 			}
 		}
 
@@ -2193,7 +2313,8 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
 		}
 
 next_snapshot:
-		mutex_unlock(&snap->lock);
+		dm_exception_table_unlock(&lock);
+		up_read(&snap->lock);
 
 		if (pe_to_start_now) {
 			start_copy(pe_to_start_now);
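__origin_write() now uses the classic optimistic-allocation pattern: alloc_pending_exception() may sleep, so the slot lock is dropped around it and the lookup is repeated after reacquiring, in case a concurrent writer inserted the pending exception first. The core of that pattern, lifted from the hunk above:

	/* optimistic allocation with recheck, as in __origin_write() */
	pe = __lookup_pending_exception(snap, chunk);
	if (!pe) {
		dm_exception_table_unlock(&lock);
		pe = alloc_pending_exception(snap);	/* may sleep */
		dm_exception_table_lock(&lock);

		pe2 = __lookup_pending_exception(snap, chunk);
		if (pe2) {				/* somebody beat us to it */
			free_pending_exception(pe);
			pe = pe2;
		}
	}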
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 314d17ca6466..64dd0b34fcf4 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -136,7 +136,8 @@ static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq,
 	return DM_MAPIO_KILL;
 }
 
-static void io_err_release_clone_rq(struct request *clone)
+static void io_err_release_clone_rq(struct request *clone,
+				    union map_info *map_context)
 {
 }
 
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index ed3caceaed07..7f0840601737 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -202,6 +202,13 @@ struct dm_pool_metadata {
 	bool fail_io:1;
 
 	/*
+	 * Set once a thin-pool has been accessed through one of the interfaces
+	 * that imply the pool is in-service (e.g. thin devices created/deleted,
+	 * thin-pool message, metadata snapshots, etc).
+	 */
+	bool in_service:1;
+
+	/*
 	 * Reading the space map roots can fail, so we read it into these
 	 * buffers before the superblock is locked and updated.
 	 */
@@ -367,6 +374,32 @@ static int subtree_equal(void *context, const void *value1_le, const void *value
 
 /*----------------------------------------------------------------*/
 
+/*
+ * Variant that is used for in-core only changes or code that
+ * shouldn't put the pool in service on its own (e.g. commit).
+ */
+static inline void __pmd_write_lock(struct dm_pool_metadata *pmd)
+	__acquires(pmd->root_lock)
+{
+	down_write(&pmd->root_lock);
+}
+#define pmd_write_lock_in_core(pmd) __pmd_write_lock((pmd))
+
+static inline void pmd_write_lock(struct dm_pool_metadata *pmd)
+{
+	__pmd_write_lock(pmd);
+	if (unlikely(!pmd->in_service))
+		pmd->in_service = true;
+}
+
+static inline void pmd_write_unlock(struct dm_pool_metadata *pmd)
+	__releases(pmd->root_lock)
+{
+	up_write(&pmd->root_lock);
+}
+
+/*----------------------------------------------------------------*/
+
 static int superblock_lock_zero(struct dm_pool_metadata *pmd,
 				struct dm_block **sblock)
 {
@@ -790,6 +823,9 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
 	 */
 	BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
 
+	if (unlikely(!pmd->in_service))
+		return 0;
+
 	r = __write_changed_details(pmd);
 	if (r < 0)
 		return r;
@@ -853,6 +889,7 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
 	pmd->time = 0;
 	INIT_LIST_HEAD(&pmd->thin_devices);
 	pmd->fail_io = false;
+	pmd->in_service = false;
 	pmd->bdev = bdev;
 	pmd->data_block_size = data_block_size;
 
@@ -903,7 +940,6 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
 			DMWARN("%s: __commit_transaction() failed, error = %d",
 			       __func__, r);
 	}
-
 	if (!pmd->fail_io)
 		__destroy_persistent_data_objects(pmd);
 
@@ -1032,10 +1068,10 @@ int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
 {
 	int r = -EINVAL;
 
-	down_write(&pmd->root_lock);
+	pmd_write_lock(pmd);
 	if (!pmd->fail_io)
 		r = __create_thin(pmd, dev);
-	up_write(&pmd->root_lock);
+	pmd_write_unlock(pmd);
 
 	return r;
 }
@@ -1123,10 +1159,10 @@ int dm_pool_create_snap(struct dm_pool_metadata *pmd,
 {
 	int r = -EINVAL;
 
-	down_write(&pmd->root_lock);
+	pmd_write_lock(pmd);
 	if (!pmd->fail_io)
 		r = __create_snap(pmd, dev, origin);
-	up_write(&pmd->root_lock);
+	pmd_write_unlock(pmd);
 
 	return r;
 }
@@ -1166,10 +1202,10 @@ int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
 {
 	int r = -EINVAL;
 
-	down_write(&pmd->root_lock);
+	pmd_write_lock(pmd);
 	if (!pmd->fail_io)
 		r = __delete_device(pmd, dev);
-	up_write(&pmd->root_lock);
+	pmd_write_unlock(pmd);
 
 	return r;
 }
@@ -1180,7 +1216,7 @@ int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
 {
 	int r = -EINVAL;
 
-	down_write(&pmd->root_lock);
+	pmd_write_lock(pmd);
 	if (pmd->fail_io)
 		goto out;
 
@@ -1194,7 +1230,7 @@ int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
 	r = 0;
 
 out:
-	up_write(&pmd->root_lock);
+	pmd_write_unlock(pmd);
 
 	return r;
 }
@@ -1225,7 +1261,12 @@ static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
 	 * We commit to ensure the btree roots which we increment in a
 	 * moment are up to date.
 	 */
-	__commit_transaction(pmd);
+	r = __commit_transaction(pmd);
+	if (r < 0) {
+		DMWARN("%s: __commit_transaction() failed, error = %d",
+		       __func__, r);
+		return r;
+	}
 
 	/*
 	 * Copy the superblock.
@@ -1283,10 +1324,10 @@ int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
 {
 	int r = -EINVAL;
 
-	down_write(&pmd->root_lock);
+	pmd_write_lock(pmd);
 	if (!pmd->fail_io)
 		r = __reserve_metadata_snap(pmd);
-	up_write(&pmd->root_lock);
+	pmd_write_unlock(pmd);
 
 	return r;
 }
@@ -1331,10 +1372,10 @@ int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
 {
 	int r = -EINVAL;
 
-	down_write(&pmd->root_lock);
+	pmd_write_lock(pmd);
 	if (!pmd->fail_io)
 		r = __release_metadata_snap(pmd);
-	up_write(&pmd->root_lock);
+	pmd_write_unlock(pmd);
 
 	return r;
 }
@@ -1377,19 +1418,19 @@ int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
 {
 	int r = -EINVAL;
 
-	down_write(&pmd->root_lock);
+	pmd_write_lock_in_core(pmd);
 	if (!pmd->fail_io)
 		r = __open_device(pmd, dev, 0, td);
-	up_write(&pmd->root_lock);
+	pmd_write_unlock(pmd);
 
 	return r;
 }
 
 int dm_pool_close_thin_device(struct dm_thin_device *td)
 {
-	down_write(&td->pmd->root_lock);
+	pmd_write_lock_in_core(td->pmd);
 	__close_device(td);
-	up_write(&td->pmd->root_lock);
+	pmd_write_unlock(td->pmd);
 
 	return 0;
 }
@@ -1570,10 +1611,10 @@ int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
 {
 	int r = -EINVAL;
 
-	down_write(&td->pmd->root_lock);
+	pmd_write_lock(td->pmd);
 	if (!td->pmd->fail_io)
 		r = __insert(td, block, data_block);
-	up_write(&td->pmd->root_lock);
+	pmd_write_unlock(td->pmd);
 
 	return r;
 }
@@ -1657,10 +1698,10 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
 {
 	int r = -EINVAL;
 
-	down_write(&td->pmd->root_lock);
+	pmd_write_lock(td->pmd);
 	if (!td->pmd->fail_io)
 		r = __remove(td, block);
-	up_write(&td->pmd->root_lock);
+	pmd_write_unlock(td->pmd);
 
 	return r;
 }
@@ -1670,10 +1711,10 @@ int dm_thin_remove_range(struct dm_thin_device *td,
 {
 	int r = -EINVAL;
 
-	down_write(&td->pmd->root_lock);
+	pmd_write_lock(td->pmd);
 	if (!td->pmd->fail_io)
 		r = __remove_range(td, begin, end);
-	up_write(&td->pmd->root_lock);
+	pmd_write_unlock(td->pmd);
 
 	return r;
 }
@@ -1696,13 +1737,13 @@ int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_
 {
 	int r = 0;
 
-	down_write(&pmd->root_lock);
+	pmd_write_lock(pmd);
 	for (; b != e; b++) {
 		r = dm_sm_inc_block(pmd->data_sm, b);
 		if (r)
 			break;
 	}
-	up_write(&pmd->root_lock);
+	pmd_write_unlock(pmd);
 
 	return r;
 }
@@ -1711,13 +1752,13 @@ int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_
 {
 	int r = 0;
 
-	down_write(&pmd->root_lock);
+	pmd_write_lock(pmd);
 	for (; b != e; b++) {
 		r = dm_sm_dec_block(pmd->data_sm, b);
 		if (r)
 			break;
 	}
-	up_write(&pmd->root_lock);
+	pmd_write_unlock(pmd);
 
 	return r;
 }
@@ -1765,10 +1806,10 @@ int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
 {
 	int r = -EINVAL;
 
-	down_write(&pmd->root_lock);
+	pmd_write_lock(pmd);
 	if (!pmd->fail_io)
 		r = dm_sm_new_block(pmd->data_sm, result);
-	up_write(&pmd->root_lock);
+	pmd_write_unlock(pmd);
 
 	return r;
 }
@@ -1777,12 +1818,16 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
 {
 	int r = -EINVAL;
 
-	down_write(&pmd->root_lock);
+	/*
+	 * Care is taken to not have commit be what
+	 * triggers putting the thin-pool in-service.
+	 */
+	__pmd_write_lock(pmd);
 	if (pmd->fail_io)
 		goto out;
 
 	r = __commit_transaction(pmd);
-	if (r <= 0)
+	if (r < 0)
 		goto out;
 
 	/*
@@ -1790,7 +1835,7 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
 	 */
 	r = __begin_transaction(pmd);
out:
-	up_write(&pmd->root_lock);
+	pmd_write_unlock(pmd);
 
 	return r;
 }
@@ -1806,7 +1851,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
 {
 	int r = -EINVAL;
 
-	down_write(&pmd->root_lock);
+	pmd_write_lock(pmd);
 	if (pmd->fail_io)
 		goto out;
 
@@ -1817,7 +1862,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
 	pmd->fail_io = true;
 
out:
-	up_write(&pmd->root_lock);
+	pmd_write_unlock(pmd);
 
 	return r;
 }
@@ -1948,10 +1993,10 @@ int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
 {
 	int r = -EINVAL;
 
-	down_write(&pmd->root_lock);
+	pmd_write_lock(pmd);
 	if (!pmd->fail_io)
 		r = __resize_space_map(pmd->data_sm, new_count);
-	up_write(&pmd->root_lock);
+	pmd_write_unlock(pmd);
 
 	return r;
 }
@@ -1960,29 +2005,29 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou
 {
 	int r = -EINVAL;
 
-	down_write(&pmd->root_lock);
+	pmd_write_lock(pmd);
 	if (!pmd->fail_io) {
 		r = __resize_space_map(pmd->metadata_sm, new_count);
 		if (!r)
 			__set_metadata_reserve(pmd);
 	}
-	up_write(&pmd->root_lock);
+	pmd_write_unlock(pmd);
 
 	return r;
 }
 
 void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
 {
-	down_write(&pmd->root_lock);
+	pmd_write_lock_in_core(pmd);
 	dm_bm_set_read_only(pmd->bm);
-	up_write(&pmd->root_lock);
+	pmd_write_unlock(pmd);
 }
 
 void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
 {
-	down_write(&pmd->root_lock);
+	pmd_write_lock_in_core(pmd);
 	dm_bm_set_read_write(pmd->bm);
-	up_write(&pmd->root_lock);
+	pmd_write_unlock(pmd);
 }
 
 int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
@@ -1992,9 +2037,9 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 {
 	int r;
 
-	down_write(&pmd->root_lock);
+	pmd_write_lock_in_core(pmd);
 	r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context);
-	up_write(&pmd->root_lock);
+	pmd_write_unlock(pmd);
 
 	return r;
 }
@@ -2005,7 +2050,7 @@ int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
 	struct dm_block *sblock;
 	struct thin_disk_superblock *disk_super;
 
-	down_write(&pmd->root_lock);
+	pmd_write_lock(pmd);
 	pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
 
 	r = superblock_lock(pmd, &sblock);
@@ -2019,7 +2064,7 @@ int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
 	dm_bm_unlock(sblock);
out:
-	up_write(&pmd->root_lock);
+	pmd_write_unlock(pmd);
 
 	return r;
 }
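The net effect of the thin-metadata changes is that commits become free until the pool is actually used: pmd_write_lock() marks the pool in-service as a side effect of any real metadata mutation, while the _in_core variant (and the commit path itself) deliberately does not, and __commit_transaction() returns early for a pool that was only opened and inspected. Reduced to intent:

	/* the three lock flavours introduced above, reduced to their intent */
	pmd_write_lock(pmd);		/* mutates metadata: marks pool in-service */
	pmd_write_lock_in_core(pmd);	/* in-core only: does NOT mark in-service */
	pmd_write_unlock(pmd);

	/* and the early-out this enables in __commit_transaction(): */
	if (unlikely(!pmd->in_service))
		return 0;		/* nothing on-disk to commit yet */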
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index f7822875589e..1cb137f0ef9d 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -190,7 +190,6 @@ struct writeback_struct {
 	struct dm_writecache *wc;
 	struct wc_entry **wc_list;
 	unsigned wc_list_n;
-	unsigned page_offset;
 	struct page *page;
 	struct wc_entry *wc_list_inline[WB_LIST_INLINE];
 	struct bio bio;
@@ -546,21 +545,20 @@ static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
 		e = container_of(node, struct wc_entry, rb_node);
 		if (read_original_sector(wc, e) == block)
 			break;
+
 		node = (read_original_sector(wc, e) >= block ?
 			e->rb_node.rb_left : e->rb_node.rb_right);
 		if (unlikely(!node)) {
-			if (!(flags & WFE_RETURN_FOLLOWING)) {
+			if (!(flags & WFE_RETURN_FOLLOWING))
 				return NULL;
-			}
 			if (read_original_sector(wc, e) >= block) {
-				break;
+				return e;
 			} else {
 				node = rb_next(&e->rb_node);
-				if (unlikely(!node)) {
+				if (unlikely(!node))
 					return NULL;
-				}
 				e = container_of(node, struct wc_entry, rb_node);
-				break;
+				return e;
 			}
 		}
 	}
@@ -571,7 +569,7 @@ static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
 			node = rb_prev(&e->rb_node);
 		else
 			node = rb_next(&e->rb_node);
-		if (!node)
+		if (unlikely(!node))
 			return e;
 		e2 = container_of(node, struct wc_entry, rb_node);
 		if (read_original_sector(wc, e2) != block)
@@ -804,7 +802,7 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_
 			writecache_free_entry(wc, e);
 		}
 
-		if (!node)
+		if (unlikely(!node))
 			break;
 
 		e = container_of(node, struct wc_entry, rb_node);
@@ -1478,10 +1476,9 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
 		bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
 		wb = container_of(bio, struct writeback_struct, bio);
 		wb->wc = wc;
-		wb->bio.bi_end_io = writecache_writeback_endio;
-		bio_set_dev(&wb->bio, wc->dev->bdev);
-		wb->bio.bi_iter.bi_sector = read_original_sector(wc, e);
-		wb->page_offset = PAGE_SIZE;
+		bio->bi_end_io = writecache_writeback_endio;
+		bio_set_dev(bio, wc->dev->bdev);
+		bio->bi_iter.bi_sector = read_original_sector(wc, e);
 		if (max_pages <= WB_LIST_INLINE ||
 		    unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
 							   GFP_NOIO | __GFP_NORETRY |
@@ -1507,12 +1504,12 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
 			wb->wc_list[wb->wc_list_n++] = f;
 			e = f;
 		}
-		bio_set_op_attrs(&wb->bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
+		bio_set_op_attrs(bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
 		if (writecache_has_error(wc)) {
 			bio->bi_status = BLK_STS_IOERR;
-			bio_endio(&wb->bio);
+			bio_endio(bio);
 		} else {
-			submit_bio(&wb->bio);
+			submit_bio(bio);
 		}
 
 		__writeback_throttle(wc, wbl);
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index fa68336560c3..d8334cd45d7c 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -1169,6 +1169,9 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
 			goto out;
 		}
 
+		if (!nr_blkz)
+			break;
+
 		/* Process report */
 		for (i = 0; i < nr_blkz; i++) {
 			ret = dmz_init_zone(zmd, zone, &blkz[i]);
@@ -1204,6 +1207,8 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
 	/* Get zone information from disk */
 	ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone),
 				  &blkz, &nr_blkz, GFP_NOIO);
+	if (!nr_blkz)
+		ret = -EIO;
 	if (ret) {
 		dmz_dev_err(zmd->dev, "Get zone %u report failed",
 			    dmz_id(zmd, zone));
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 8865c1709e16..51d029bbb740 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -643,7 +643,8 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path)
 	q = bdev_get_queue(dev->bdev);
 	dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
-	aligned_capacity = dev->capacity & ~(blk_queue_zone_sectors(q) - 1);
+	aligned_capacity = dev->capacity &
+				~((sector_t)blk_queue_zone_sectors(q) - 1);
 	if (ti->begin ||
 	    ((ti->len != dev->capacity) && (ti->len != aligned_capacity))) {
 		ti->error = "Partial mapping not supported";
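The dm-zoned-target.c cast is worth spelling out: blk_queue_zone_sectors() returns a 32-bit value, so without the (sector_t) cast the complement is computed in 32 bits and then zero-extended, silently clearing the upper half of a large capacity. A worked example with made-up numbers:

	/* why the (sector_t) cast above matters */
	unsigned int zone_sectors = 524288;		/* 256 MiB zones */
	sector_t capacity = 0x100000000ULL;		/* 2 TiB worth of 512-byte sectors */

	sector_t bad  = capacity & ~(zone_sectors - 1);
		/* ~ is evaluated as u32, then zero-extended: bad == 0 */
	sector_t good = capacity & ~((sector_t)zone_sectors - 1);
		/* mask is 64-bit: good == 0x100000000 */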
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 043f0761e4a0..1fb1333fefec 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -781,7 +781,8 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
 }
 
 static struct table_device *find_table_device(struct list_head *l, dev_t dev,
-					      fmode_t mode) {
+					      fmode_t mode)
+{
 	struct table_device *td;
 
 	list_for_each_entry(td, l, list)
@@ -792,7 +793,8 @@ static struct table_device *find_table_device(struct list_head *l, dev_t dev,
 }
 
 int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
-			struct dm_dev **result) {
+			struct dm_dev **result)
+{
 	int r;
 	struct table_device *td;
 
@@ -1906,7 +1908,6 @@ static void cleanup_mapped_device(struct mapped_device *md)
 static struct mapped_device *alloc_dev(int minor)
 {
 	int r, numa_node_id = dm_get_numa_node();
-	struct dax_device *dax_dev = NULL;
 	struct mapped_device *md;
 	void *old_md;
 
@@ -1969,11 +1970,10 @@ static struct mapped_device *alloc_dev(int minor)
 	sprintf(md->disk->disk_name, "dm-%d", minor);
 
 	if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
-		dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
-		if (!dax_dev)
+		md->dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
+		if (!md->dax_dev)
 			goto bad;
 	}
-	md->dax_dev = dax_dev;
 
 	add_disk_no_queue_reg(md->disk);
 	format_dev_t(md->name, MKDEV(_major, minor));
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index 0a3b8ae4a29c..b8a62188f6be 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -190,6 +190,8 @@ static int sm_find_free(void *addr, unsigned begin, unsigned end,
 
 static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm)
 {
+	memset(ll, 0, sizeof(struct ll_disk));
+
 	ll->tm = tm;
 
 	ll->bitmap_info.tm = tm;
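Finally, the one-line memset in sm_ll_init() is the usual defensive zero-init idiom: sm_ll_init() is shared by the space-map constructors, and zeroing the whole structure up front means any field a particular init path does not set reads as 0/NULL instead of stack garbage:

	/* defensive zero-init: unset fields and callbacks read as 0/NULL */
	memset(ll, 0, sizeof(struct ll_disk));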