From 762a80d9fc9f690a3a35983f3b4619a220650808 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 2 Aug 2011 12:32:00 +0100
Subject: dm snapshot: flush disk cache when merging

This patch makes dm-snapshot flush disk cache when writing metadata for
merging snapshot.

Without cache flushing the disk may reorder metadata write and other
data writes and there is a possibility of data corruption in case of
power fault.

Cc: stable@kernel.org
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap-persistent.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 135c2f1fdbfc..e4ecadf0548a 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -753,7 +753,7 @@ static int persistent_commit_merge(struct dm_exception_store *store,
 	for (i = 0; i < nr_merged; i++)
 		clear_exception(ps, ps->current_committed - 1 - i);
 
-	r = area_io(ps, WRITE);
+	r = area_io(ps, WRITE_FLUSH_FUA);
 	if (r < 0)
 		return r;
 
-- 
cgit v1.2.3


From 286f367dad40beb3234a18c17391d03ba939a7f3 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 2 Aug 2011 12:32:00 +0100
Subject: dm mpath: fix potential NULL pointer in feature arg processing

Avoid dereferencing a NULL pointer if the number of feature arguments
supplied is fewer than indicated.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Cc: stable@kernel.org
---
 drivers/md/dm-mpath.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'drivers')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index c3547016f0f1..adf851a081bd 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -807,6 +807,11 @@ static int parse_features(struct arg_set *as, struct multipath *m)
 	if (!argc)
 		return 0;
 
+	if (argc > as->argc) {
+		ti->error = "not enough arguments for features";
+		return -EINVAL;
+	}
+
 	do {
 		param_name = shift(as);
 		argc--;
-- 
cgit v1.2.3


From bb91bc7bacb906c9f3a9b22744c53fa7564b51ba Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 2 Aug 2011 12:32:01 +0100
Subject: dm io: flush cpu cache with vmapped io

For normal kernel pages, CPU cache is synchronized by the dma layer.
However, this is not done for pages allocated with vmalloc. If we do I/O
to/from vmallocated pages, we must synchronize CPU cache explicitly.

Prior to doing I/O on vmallocated page we must call
flush_kernel_vmap_range to flush dirty cache on the virtual address.
After finished read we must call invalidate_kernel_vmap_range to
invalidate cache on the virtual address, so that accesses to the virtual
address return newly read data and not stale data from CPU cache.

This patch fixes metadata corruption on dm-snapshots on PA-RISC and
possibly other architectures with caches indexed by virtual address.

Cc: stable <stable@kernel.org>
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-io.c | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 2067288f61f9..ad2eba40e319 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -38,6 +38,8 @@ struct io {
 	struct dm_io_client *client;
 	io_notify_fn callback;
 	void *context;
+	void *vma_invalidate_address;
+	unsigned long vma_invalidate_size;
 } __attribute__((aligned(DM_IO_MAX_REGIONS)));
 
 static struct kmem_cache *_dm_io_cache;
@@ -116,6 +118,10 @@ static void dec_count(struct io *io, unsigned int region, int error)
 		set_bit(region, &io->error_bits);
 
 	if (atomic_dec_and_test(&io->count)) {
+		if (io->vma_invalidate_size)
+			invalidate_kernel_vmap_range(io->vma_invalidate_address,
+						     io->vma_invalidate_size);
+
 		if (io->sleeper)
 			wake_up_process(io->sleeper);
 
@@ -159,6 +165,9 @@ struct dpages {
 
 	unsigned context_u;
 	void *context_ptr;
+
+	void *vma_invalidate_address;
+	unsigned long vma_invalidate_size;
 };
 
 /*
@@ -377,6 +386,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 	io->sleeper = current;
 	io->client = client;
 
+	io->vma_invalidate_address = dp->vma_invalidate_address;
+	io->vma_invalidate_size = dp->vma_invalidate_size;
+
 	dispatch_io(rw, num_regions, where, dp, io, 1);
 
 	while (1) {
@@ -415,13 +427,21 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
 	io->callback = fn;
 	io->context = context;
 
+	io->vma_invalidate_address = dp->vma_invalidate_address;
+	io->vma_invalidate_size = dp->vma_invalidate_size;
+
 	dispatch_io(rw, num_regions, where, dp, io, 0);
 	return 0;
 }
 
-static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
+static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
+		   unsigned long size)
 {
 	/* Set up dpages based on memory type */
+
+	dp->vma_invalidate_address = NULL;
+	dp->vma_invalidate_size = 0;
+
 	switch (io_req->mem.type) {
 	case DM_IO_PAGE_LIST:
 		list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
@@ -432,6 +452,11 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
 		break;
 
 	case DM_IO_VMA:
+		flush_kernel_vmap_range(io_req->mem.ptr.vma, size);
+		if ((io_req->bi_rw & RW_MASK) == READ) {
+			dp->vma_invalidate_address = io_req->mem.ptr.vma;
+			dp->vma_invalidate_size = size;
+		}
 		vm_dp_init(dp, io_req->mem.ptr.vma);
 		break;
 
@@ -460,7 +485,7 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions,
 	int r;
 	struct dpages dp;
 
-	r = dp_init(io_req, &dp);
+	r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT);
 	if (r)
 		return r;
 
-- 
cgit v1.2.3


From d15b774c2920d55e3d58275c97fbe3adc3afde38 Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Tue, 2 Aug 2011 12:32:01 +0100
Subject: dm: fix idr leak on module removal

Destroy _minor_idr when unloading the core dm module.  (Found by kmemleak.)

Cc: stable@kernel.org
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0cf68b478878..41abc6dd481b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -37,6 +37,8 @@ static const char *_name = DM_NAME;
 static unsigned int major = 0;
 static unsigned int _major = 0;
 
+static DEFINE_IDR(_minor_idr);
+
 static DEFINE_SPINLOCK(_minor_lock);
 /*
  * For bio-based dm.
@@ -313,6 +315,12 @@ static void __exit dm_exit(void)
 
 	while (i--)
 		_exits[i]();
+
+	/*
+	 * Should be empty by this point.
+	 */
+	idr_remove_all(&_minor_idr);
+	idr_destroy(&_minor_idr);
 }
 
 /*
@@ -1705,8 +1713,6 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 /*-----------------------------------------------------------------
  * An IDR is used to keep track of allocated minor numbers.
  *---------------------------------------------------------------*/
-static DEFINE_IDR(_minor_idr);
-
 static void free_minor(int minor)
 {
 	spin_lock(&_minor_lock);
-- 
cgit v1.2.3


From 283a8328ca5b987e547848de8ff0e28edcfb9e08 Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Tue, 2 Aug 2011 12:32:01 +0100
Subject: dm: suppress endian warnings

Suppress sparse warnings about cpu_to_le32() by using __le32 types for
on-disk data etc.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c           |  6 ++--
 drivers/md/dm-log.c             | 20 ++++++++----
 drivers/md/dm-snap-persistent.c | 71 ++++++++++++++++++++++-------------------
 3 files changed, 54 insertions(+), 43 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index bae6c4e23d3f..f5406766ece3 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -239,7 +239,7 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
 			      struct dm_crypt_request *dmreq)
 {
 	memset(iv, 0, cc->iv_size);
-	*(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
+	*(__le32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
 
 	return 0;
 }
@@ -248,7 +248,7 @@ static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
 				struct dm_crypt_request *dmreq)
 {
 	memset(iv, 0, cc->iv_size);
-	*(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
+	*(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);
 
 	return 0;
 }
@@ -415,7 +415,7 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
 	struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private;
 
 	memset(iv, 0, cc->iv_size);
-	*(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
+	*(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);
 	crypto_cipher_encrypt_one(essiv_tfm, iv, iv);
 
 	return 0;
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 948e3f4925bf..5f06fb687408 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -197,15 +197,21 @@ EXPORT_SYMBOL(dm_dirty_log_destroy);
 #define MIRROR_DISK_VERSION 2
 #define LOG_OFFSET 2
 
-struct log_header {
-	uint32_t magic;
+struct log_header_disk {
+	__le32 magic;
 
 	/*
 	 * Simple, incrementing version. no backward
 	 * compatibility.
 	 */
+	__le32 version;
+	__le64 nr_regions;
+} __packed;
+
+struct log_header_core {
+	uint32_t magic;
 	uint32_t version;
-	sector_t nr_regions;
+	uint64_t nr_regions;
 };
 
 struct log_c {
@@ -239,10 +245,10 @@ struct log_c {
 	int log_dev_failed;
 	int log_dev_flush_failed;
 	struct dm_dev *log_dev;
-	struct log_header header;
+	struct log_header_core header;
 
 	struct dm_io_region header_location;
-	struct log_header *disk_header;
+	struct log_header_disk *disk_header;
 };
 
 /*
@@ -271,14 +277,14 @@ static inline void log_clear_bit(struct log_c *l,
 /*----------------------------------------------------------------
  * Header IO
  *--------------------------------------------------------------*/
-static void header_to_disk(struct log_header *core, struct log_header *disk)
+static void header_to_disk(struct log_header_core *core, struct log_header_disk *disk)
 {
 	disk->magic = cpu_to_le32(core->magic);
 	disk->version = cpu_to_le32(core->version);
 	disk->nr_regions = cpu_to_le64(core->nr_regions);
 }
 
-static void header_from_disk(struct log_header *core, struct log_header *disk)
+static void header_from_disk(struct log_header_core *core, struct log_header_disk *disk)
 {
 	core->magic = le32_to_cpu(disk->magic);
 	core->version = le32_to_cpu(disk->version);
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index e4ecadf0548a..39becbec4dfe 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -58,25 +58,30 @@
 #define NUM_SNAPSHOT_HDR_CHUNKS 1
 
 struct disk_header {
-	uint32_t magic;
+	__le32 magic;
 
 	/*
 	 * Is this snapshot valid.  There is no way of recovering
 	 * an invalid snapshot.
 	 */
-	uint32_t valid;
+	__le32 valid;
 
 	/*
 	 * Simple, incrementing version. no backward
 	 * compatibility.
 	 */
-	uint32_t version;
+	__le32 version;
 
 	/* In sectors */
-	uint32_t chunk_size;
-};
+	__le32 chunk_size;
+} __packed;
 
 struct disk_exception {
+	__le64 old_chunk;
+	__le64 new_chunk;
+} __packed;
+
+struct core_exception {
 	uint64_t old_chunk;
 	uint64_t new_chunk;
 };
@@ -396,32 +401,32 @@ static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
 }
 
 static void read_exception(struct pstore *ps,
-			   uint32_t index, struct disk_exception *result)
+			   uint32_t index, struct core_exception *result)
 {
-	struct disk_exception *e = get_exception(ps, index);
+	struct disk_exception *de = get_exception(ps, index);
 
 	/* copy it */
-	result->old_chunk = le64_to_cpu(e->old_chunk);
-	result->new_chunk = le64_to_cpu(e->new_chunk);
+	result->old_chunk = le64_to_cpu(de->old_chunk);
+	result->new_chunk = le64_to_cpu(de->new_chunk);
 }
 
 static void write_exception(struct pstore *ps,
-			    uint32_t index, struct disk_exception *de)
+			    uint32_t index, struct core_exception *e)
 {
-	struct disk_exception *e = get_exception(ps, index);
+	struct disk_exception *de = get_exception(ps, index);
 
 	/* copy it */
-	e->old_chunk = cpu_to_le64(de->old_chunk);
-	e->new_chunk = cpu_to_le64(de->new_chunk);
+	de->old_chunk = cpu_to_le64(e->old_chunk);
+	de->new_chunk = cpu_to_le64(e->new_chunk);
 }
 
 static void clear_exception(struct pstore *ps, uint32_t index)
 {
-	struct disk_exception *e = get_exception(ps, index);
+	struct disk_exception *de = get_exception(ps, index);
 
 	/* clear it */
-	e->old_chunk = 0;
-	e->new_chunk = 0;
+	de->old_chunk = 0;
+	de->new_chunk = 0;
 }
 
 /*
@@ -437,13 +442,13 @@ static int insert_exceptions(struct pstore *ps,
 {
 	int r;
 	unsigned int i;
-	struct disk_exception de;
+	struct core_exception e;
 
 	/* presume the area is full */
 	*full = 1;
 
 	for (i = 0; i < ps->exceptions_per_area; i++) {
-		read_exception(ps, i, &de);
+		read_exception(ps, i, &e);
 
 		/*
 		 * If the new_chunk is pointing at the start of
@@ -451,7 +456,7 @@ static int insert_exceptions(struct pstore *ps,
 		 * is we know that we've hit the end of the
 		 * exceptions.  Therefore the area is not full.
 		 */
-		if (de.new_chunk == 0LL) {
+		if (e.new_chunk == 0LL) {
 			ps->current_committed = i;
 			*full = 0;
 			break;
@@ -460,13 +465,13 @@ static int insert_exceptions(struct pstore *ps,
 		/*
 		 * Keep track of the start of the free chunks.
 		 */
-		if (ps->next_free <= de.new_chunk)
-			ps->next_free = de.new_chunk + 1;
+		if (ps->next_free <= e.new_chunk)
+			ps->next_free = e.new_chunk + 1;
 
 		/*
 		 * Otherwise we add the exception to the snapshot.
 		 */
-		r = callback(callback_context, de.old_chunk, de.new_chunk);
+		r = callback(callback_context, e.old_chunk, e.new_chunk);
 		if (r)
 			return r;
 	}
@@ -641,12 +646,12 @@ static void persistent_commit_exception(struct dm_exception_store *store,
 {
 	unsigned int i;
 	struct pstore *ps = get_info(store);
-	struct disk_exception de;
+	struct core_exception ce;
 	struct commit_callback *cb;
 
-	de.old_chunk = e->old_chunk;
-	de.new_chunk = e->new_chunk;
-	write_exception(ps, ps->current_committed++, &de);
+	ce.old_chunk = e->old_chunk;
+	ce.new_chunk = e->new_chunk;
+	write_exception(ps, ps->current_committed++, &ce);
 
 	/*
 	 * Add the callback to the back of the array.  This code
@@ -701,7 +706,7 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
 				    chunk_t *last_new_chunk)
 {
 	struct pstore *ps = get_info(store);
-	struct disk_exception de;
+	struct core_exception ce;
 	int nr_consecutive;
 	int r;
 
@@ -722,9 +727,9 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
 		ps->current_committed = ps->exceptions_per_area;
 	}
 
-	read_exception(ps, ps->current_committed - 1, &de);
-	*last_old_chunk = de.old_chunk;
-	*last_new_chunk = de.new_chunk;
+	read_exception(ps, ps->current_committed - 1, &ce);
+	*last_old_chunk = ce.old_chunk;
+	*last_new_chunk = ce.new_chunk;
 
 	/*
 	 * Find number of consecutive chunks within the current area,
@@ -733,9 +738,9 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
 	for (nr_consecutive = 1; nr_consecutive < ps->current_committed;
 	     nr_consecutive++) {
 		read_exception(ps, ps->current_committed - 1 - nr_consecutive,
-			       &de);
-		if (de.old_chunk != *last_old_chunk - nr_consecutive ||
-		    de.new_chunk != *last_new_chunk - nr_consecutive)
+			       &ce);
+		if (ce.old_chunk != *last_old_chunk - nr_consecutive ||
+		    ce.new_chunk != *last_new_chunk - nr_consecutive)
 			break;
 	}
 
-- 
cgit v1.2.3


From 936688d7eb0f39be96c5791be1a04994cc8d6aa0 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 2 Aug 2011 12:32:01 +0100
Subject: dm table: fix discard support

Remove 'discards_supported' from the dm_table structure.  The same
information can be easily discovered from the table's target(s) in
dm_table_supports_discards().

Before this fix dm_table_supports_discards() would skip checking the
individual targets' 'discards_supported' flag if any one target in the
table didn't set num_discard_requests > 0.  Now the per-target
'discards_supported' flag is effective at insuring the final DM device
advertises discard support.  But, to be clear, targets that don't
support discards (!num_discard_requests) will not receive discard
requests.

Also DMWARN if a target sets 'discards_supported' override but forgets
to set 'num_discard_requests'.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c | 15 +++++++--------
 drivers/md/dm.c       |  3 ++-
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index bfe9c2333cea..3909fa259f55 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -54,7 +54,6 @@ struct dm_table {
 	sector_t *highs;
 	struct dm_target *targets;
 
-	unsigned discards_supported:1;
 	unsigned integrity_supported:1;
 
 	/*
@@ -209,7 +208,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
 	INIT_LIST_HEAD(&t->devices);
 	INIT_LIST_HEAD(&t->target_callbacks);
 	atomic_set(&t->holders, 0);
-	t->discards_supported = 1;
 
 	if (!num_targets)
 		num_targets = KEYS_PER_NODE;
@@ -791,8 +789,9 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 
 	t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
 
-	if (!tgt->num_discard_requests)
-		t->discards_supported = 0;
+	if (!tgt->num_discard_requests && tgt->discards_supported)
+		DMWARN("%s: %s: ignoring discards_supported because num_discard_requests is zero.",
+		       dm_device_name(t->md), type);
 
 	return 0;
 
@@ -1359,19 +1358,19 @@ bool dm_table_supports_discards(struct dm_table *t)
 	struct dm_target *ti;
 	unsigned i = 0;
 
-	if (!t->discards_supported)
-		return 0;
-
 	/*
 	 * Unless any target used by the table set discards_supported,
 	 * require at least one underlying device to support discards.
 	 * t->devices includes internal dm devices such as mirror logs
 	 * so we need to use iterate_devices here, which targets
-	 * supporting discard must provide.
+	 * supporting discard selectively must provide.
 	 */
 	while (i < dm_table_get_num_targets(t)) {
 		ti = dm_table_get_target(t, i++);
 
+		if (!ti->num_discard_requests)
+			continue;
+
 		if (ti->discards_supported)
 			return 1;
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 41abc6dd481b..aeb0fa1ccfe4 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1179,7 +1179,8 @@ static int __clone_and_map_discard(struct clone_info *ci)
 
 		/*
 		 * Even though the device advertised discard support,
-		 * reconfiguration might have changed that since the
+		 * that does not mean every target supports it, and
+		 * reconfiguration might also have changed that since the
 		 * check was performed.
 		 */
 		if (!ti->num_discard_requests)
-- 
cgit v1.2.3


From c8f543e0786785d8c7118fc2878e42bc0193a88d Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Tue, 2 Aug 2011 12:32:01 +0100
Subject: dm log: clean up bit little endian bitops

Using __test_and_{set,clear}_bit_le() with ignoring its return value
can be replaced with __{set,clear}_bit_le().

This also removes unnecessary casts.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 5f06fb687408..306ce123d6b4 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -257,20 +257,20 @@ struct log_c {
  */
 static inline int log_test_bit(uint32_t *bs, unsigned bit)
 {
-	return test_bit_le(bit, (unsigned long *) bs) ? 1 : 0;
+	return test_bit_le(bit, bs) ? 1 : 0;
 }
 
 static inline void log_set_bit(struct log_c *l,
 			       uint32_t *bs, unsigned bit)
 {
-	__test_and_set_bit_le(bit, (unsigned long *) bs);
+	__set_bit_le(bit, bs);
 	l->touched_cleaned = 1;
 }
 
 static inline void log_clear_bit(struct log_c *l,
 				 uint32_t *bs, unsigned bit)
 {
-	__test_and_clear_bit_le(bit, (unsigned long *) bs);
+	__clear_bit_le(bit, bs);
 	l->touched_dirtied = 1;
 }
 
@@ -745,8 +745,7 @@ static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
 		return 0;
 
 	do {
-		*region = find_next_zero_bit_le(
-					     (unsigned long *) lc->sync_bits,
+		*region = find_next_zero_bit_le(lc->sync_bits,
 					     lc->region_count,
 					     lc->sync_search);
 		lc->sync_search = *region + 1;
-- 
cgit v1.2.3


From 6c9b27ab08aaf46426515b8b858ad9c60731c7a1 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill@shutemov.name>
Date: Tue, 2 Aug 2011 12:32:02 +0100
Subject: dm log: userspace use list_move

Replace list_del() followed by list_add() with list_move().

Signed-off-by: Kirill A. Shutemov <kirill@shutemov.name>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log-userspace-base.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index aa2e0c374ab3..1021c8986011 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -394,8 +394,7 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
 			group[count] = fe->region;
 			count++;
 
-			list_del(&fe->list);
-			list_add(&fe->list, &tmp_list);
+			list_move(&fe->list, &tmp_list);
 
 			type = fe->type;
 			if (count >= MAX_FLUSH_GROUP_COUNT)
-- 
cgit v1.2.3


From e29e65aacbd9e628378084905cbcf62a9fa4a8cc Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 2 Aug 2011 12:32:02 +0100
Subject: dm: use vzalloc

Use vzalloc() instead of vmalloc()+memset().

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log.c             | 3 +--
 drivers/md/dm-snap-persistent.c | 3 +--
 drivers/md/dm-table.c           | 4 +---
 3 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 306ce123d6b4..3b52bb72bd1f 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -492,7 +492,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 	memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size);
 	lc->sync_count = (sync == NOSYNC) ? region_count : 0;
 
-	lc->recovering_bits = vmalloc(bitset_size);
+	lc->recovering_bits = vzalloc(bitset_size);
 	if (!lc->recovering_bits) {
 		DMWARN("couldn't allocate sync bitset");
 		vfree(lc->sync_bits);
@@ -504,7 +504,6 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		kfree(lc);
 		return -ENOMEM;
 	}
-	memset(lc->recovering_bits, 0, bitset_size);
 	lc->sync_search = 0;
 	log->context = lc;
 
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 39becbec4dfe..1a0acb8abdf6 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -174,10 +174,9 @@ static int alloc_area(struct pstore *ps)
 	if (!ps->area)
 		goto err_area;
 
-	ps->zero_area = vmalloc(len);
+	ps->zero_area = vzalloc(len);
 	if (!ps->zero_area)
 		goto err_zero_area;
-	memset(ps->zero_area, 0, len);
 
 	ps->header_area = vmalloc(len);
 	if (!ps->header_area)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 3909fa259f55..8dc67555e736 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -153,9 +153,7 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size)
 		return NULL;
 
 	size = nmemb * elem_size;
-	addr = vmalloc(size);
-	if (addr)
-		memset(addr, 0, size);
+	addr = vzalloc(size);
 
 	return addr;
 }
-- 
cgit v1.2.3


From 4622afb3f50e03ce6da42002e7ed3675dfafc187 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 2 Aug 2011 12:32:02 +0100
Subject: dm kcopyd: remove offset field from job structure

The offset field in struct kcopyd_job is always zero so remove it.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-kcopyd.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 320401dec104..e7926fa1eef2 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -224,7 +224,6 @@ struct kcopyd_job {
 	unsigned int num_dests;
 	struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS];
 
-	sector_t offset;
 	unsigned int nr_pages;
 	struct page_list *pages;
 
@@ -380,7 +379,7 @@ static int run_io_job(struct kcopyd_job *job)
 		.bi_rw = job->rw,
 		.mem.type = DM_IO_PAGE_LIST,
 		.mem.ptr.pl = job->pages,
-		.mem.offset = job->offset,
+		.mem.offset = 0,
 		.notify.fn = complete_io,
 		.notify.context = job,
 		.client = job->kc->io_client,
@@ -398,8 +397,7 @@ static int run_pages_job(struct kcopyd_job *job)
 {
 	int r;
 
-	job->nr_pages = dm_div_up(job->dests[0].count + job->offset,
-				  PAGE_SIZE >> 9);
+	job->nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9);
 	r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
 	if (!r) {
 		/* this job is ready for io */
@@ -602,7 +600,6 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 	job->num_dests = num_dests;
 	memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
 
-	job->offset = 0;
 	job->nr_pages = 0;
 	job->pages = NULL;
 
-- 
cgit v1.2.3


From 5bf45a3dcdba9ff43959f7b5b44523fab254c19c Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 2 Aug 2011 12:32:02 +0100
Subject: dm kcopyd: remove nr_pages field from job structure

The nr_pages field in struct kcopyd_job is only used temporarily in
run_pages_job() to count the number of required pages.
We can use a local variable instead.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-kcopyd.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index e7926fa1eef2..98725e119324 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -224,7 +224,6 @@ struct kcopyd_job {
 	unsigned int num_dests;
 	struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS];
 
-	unsigned int nr_pages;
 	struct page_list *pages;
 
 	/*
@@ -396,9 +395,9 @@ static int run_io_job(struct kcopyd_job *job)
 static int run_pages_job(struct kcopyd_job *job)
 {
 	int r;
+	unsigned nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9);
 
-	job->nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9);
-	r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
+	r = kcopyd_get_pages(job->kc, nr_pages, &job->pages);
 	if (!r) {
 		/* this job is ready for io */
 		push(&job->kc->io_jobs, job);
@@ -600,7 +599,6 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 	job->num_dests = num_dests;
 	memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
 
-	job->nr_pages = 0;
 	job->pages = NULL;
 
 	job->fn = fn;
-- 
cgit v1.2.3


From aa3f0794d279cd154ac100f92ff3904ea1f56de2 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 2 Aug 2011 12:32:03 +0100
Subject: dm snapshot: remove unused definitions

Remove a couple of unused #defines.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 9ecff5f3023a..9d6daa0eaff6 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -29,16 +29,6 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
 #define dm_target_is_snapshot_merge(ti) \
 	((ti)->type->name == dm_snapshot_merge_target_name)
 
-/*
- * The percentage increment we will wake up users at
- */
-#define WAKE_UP_PERCENT 5
-
-/*
- * kcopyd priority of snapshot operations
- */
-#define SNAPSHOT_COPY_PRIORITY 2
-
 /*
  * The size of the mempool used to track chunks in use.
  */
-- 
cgit v1.2.3


From a2d2b0345a0f30c169b7d08b8cebdd4853fcb0f8 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Tue, 2 Aug 2011 12:32:03 +0100
Subject: dm snapshot: style cleanups

Coding style cleanups.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
---
 drivers/md/dm-snap-persistent.c |  4 ++--
 drivers/md/dm-snap.c            | 14 ++++++--------
 2 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 1a0acb8abdf6..d1f1d7017103 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -567,7 +567,7 @@ static int persistent_read_metadata(struct dm_exception_store *store,
 	ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) /
 				  sizeof(struct disk_exception);
 	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
-			sizeof(*ps->callbacks));
+				   sizeof(*ps->callbacks));
 	if (!ps->callbacks)
 		return -ENOMEM;
 
@@ -674,7 +674,7 @@ static void persistent_commit_exception(struct dm_exception_store *store,
 	 * If we completely filled the current area, then wipe the next one.
 	 */
 	if ((ps->current_committed == ps->exceptions_per_area) &&
-	     zero_disk_area(ps, ps->current_area + 1))
+	    zero_disk_area(ps, ps->current_area + 1))
 		ps->valid = 0;
 
 	/*
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 9d6daa0eaff6..94dee05dd28e 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1045,8 +1045,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	s = kmalloc(sizeof(*s), GFP_KERNEL);
 	if (!s) {
-		ti->error = "Cannot allocate snapshot context private "
-		    "structure";
+		ti->error = "Cannot allocate private snapshot structure";
 		r = -ENOMEM;
 		goto bad;
 	}
@@ -1405,7 +1404,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
 	 */
 	dm_insert_exception(&s->complete, e);
 
- out:
+out:
 	dm_remove_exception(&pe->e);
 	snapshot_bios = bio_list_get(&pe->snapshot_bios);
 	origin_bios = bio_list_get(&pe->origin_bios);
@@ -1470,8 +1469,7 @@ static void start_copy(struct dm_snap_pending_exception *pe)
 	dest.count = src.count;
 
 	/* Hand over to kcopyd */
-	dm_kcopyd_copy(s->kcopyd_client,
-		    &src, 1, &dest, 0, copy_callback, pe);
+	dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
 }
 
 static struct dm_snap_pending_exception *
@@ -1618,9 +1616,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
 		map_context->ptr = track_chunk(s, chunk);
 	}
 
- out_unlock:
+out_unlock:
 	up_write(&s->lock);
- out:
+out:
 	return r;
 }
 
@@ -1964,7 +1962,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
 			pe_to_start_now = pe;
 		}
 
- next_snapshot:
+next_snapshot:
 		up_write(&snap->lock);
 
 		if (pe_to_start_now) {
-- 
cgit v1.2.3


From 13c87583ea4e867211fc3e7edab750c353c47c95 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Tue, 2 Aug 2011 12:32:03 +0100
Subject: dm raid: cleanup parameter handling

Re-order the parameters so they are handled consistently in the same order
where defined, parsed and output.

Only include rebuild parameters in the STATUSTYPE_TABLE output if they were
supplied in the original table line.

Correct the parameter count when outputting rebuild: there are two words,
not one.

Use case-independent checks for keywords (as in other device-mapper targets).

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid.c | 42 +++++++++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 19 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index e5d8904fc8f6..efa960ff5ba4 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -43,13 +43,14 @@ struct raid_dev {
 /*
  * Flags for rs->print_flags field.
  */
-#define DMPF_DAEMON_SLEEP      0x1
-#define DMPF_MAX_WRITE_BEHIND  0x2
-#define DMPF_SYNC              0x4
-#define DMPF_NOSYNC            0x8
-#define DMPF_STRIPE_CACHE      0x10
-#define DMPF_MIN_RECOVERY_RATE 0x20
-#define DMPF_MAX_RECOVERY_RATE 0x40
+#define DMPF_SYNC              0x1
+#define DMPF_NOSYNC            0x2
+#define DMPF_REBUILD           0x4
+#define DMPF_DAEMON_SLEEP      0x8
+#define DMPF_MIN_RECOVERY_RATE 0x10
+#define DMPF_MAX_RECOVERY_RATE 0x20
+#define DMPF_MAX_WRITE_BEHIND  0x40
+#define DMPF_STRIPE_CACHE      0x80
 
 struct raid_set {
 	struct dm_target *ti;
@@ -275,13 +276,13 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 		set_bit(In_sync, &rs->dev[i].rdev.flags);
 
 	for (i = 0; i < num_raid_params; i++) {
-		if (!strcmp(argv[i], "nosync")) {
+		if (!strcasecmp(argv[i], "nosync")) {
 			rs->md.recovery_cp = MaxSector;
 			rs->print_flags |= DMPF_NOSYNC;
 			rs->md.flags |= MD_SYNC_STATE_FORCED;
 			continue;
 		}
-		if (!strcmp(argv[i], "sync")) {
+		if (!strcasecmp(argv[i], "sync")) {
 			rs->md.recovery_cp = 0;
 			rs->print_flags |= DMPF_SYNC;
 			rs->md.flags |= MD_SYNC_STATE_FORCED;
@@ -300,7 +301,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 			return -EINVAL;
 		}
 
-		if (!strcmp(key, "rebuild")) {
+		if (!strcasecmp(key, "rebuild")) {
 			if (++rebuild_cnt > rs->raid_type->parity_devs) {
 				rs->ti->error = "Too many rebuild drives given";
 				return -EINVAL;
@@ -311,7 +312,8 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 			}
 			clear_bit(In_sync, &rs->dev[value].rdev.flags);
 			rs->dev[value].rdev.recovery_offset = 0;
-		} else if (!strcmp(key, "max_write_behind")) {
+			rs->print_flags |= DMPF_REBUILD;
+		} else if (!strcasecmp(key, "max_write_behind")) {
 			rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
 
 			/*
@@ -324,14 +326,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				return -EINVAL;
 			}
 			rs->md.bitmap_info.max_write_behind = value;
-		} else if (!strcmp(key, "daemon_sleep")) {
+		} else if (!strcasecmp(key, "daemon_sleep")) {
 			rs->print_flags |= DMPF_DAEMON_SLEEP;
 			if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
 				rs->ti->error = "daemon sleep period out of range";
 				return -EINVAL;
 			}
 			rs->md.bitmap_info.daemon_sleep = value;
-		} else if (!strcmp(key, "stripe_cache")) {
+		} else if (!strcasecmp(key, "stripe_cache")) {
 			rs->print_flags |= DMPF_STRIPE_CACHE;
 
 			/*
@@ -348,14 +350,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				rs->ti->error = "Bad stripe_cache size";
 				return -EINVAL;
 			}
-		} else if (!strcmp(key, "min_recovery_rate")) {
+		} else if (!strcasecmp(key, "min_recovery_rate")) {
 			rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
 			if (value > INT_MAX) {
 				rs->ti->error = "min_recovery_rate out of range";
 				return -EINVAL;
 			}
 			rs->md.sync_speed_min = (int)value;
-		} else if (!strcmp(key, "max_recovery_rate")) {
+		} else if (!strcasecmp(key, "max_recovery_rate")) {
 			rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
 			if (value > INT_MAX) {
 				rs->ti->error = "max_recovery_rate out of range";
@@ -547,11 +549,12 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 	case STATUSTYPE_TABLE:
 		/* The string you would use to construct this array */
 		for (i = 0; i < rs->md.raid_disks; i++)
-			if (rs->dev[i].data_dev &&
+			if ((rs->print_flags & DMPF_REBUILD) &&
+			    rs->dev[i].data_dev &&
 			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
-				raid_param_cnt++; /* for rebuilds */
+				raid_param_cnt += 2; /* for rebuilds */
 
-		raid_param_cnt += (hweight64(rs->print_flags) * 2);
+		raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2);
 		if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
 			raid_param_cnt--;
 
@@ -565,7 +568,8 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 			DMEMIT(" nosync");
 
 		for (i = 0; i < rs->md.raid_disks; i++)
-			if (rs->dev[i].data_dev &&
+			if ((rs->print_flags & DMPF_REBUILD) &&
+			    rs->dev[i].data_dev &&
 			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
 				DMEMIT(" rebuild %u", i);
 
-- 
cgit v1.2.3


From 2ca4c92f58f9386e080b26f9ccd78c9ca5825a42 Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Tue, 2 Aug 2011 12:32:03 +0100
Subject: dm ioctl: prevent empty message

Detect invalid empty messages in core dm instead of requiring every target to
check this.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'drivers')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 4cacdad2270a..1622a6bc0bfd 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1402,6 +1402,11 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
 		goto out;
 	}
 
+	if (!argc) {
+		DMWARN("Empty message received.");
+		goto out;
+	}
+
 	table = dm_get_live_table(md);
 	if (!table)
 		goto out_argv;
-- 
cgit v1.2.3


From 3e8dbb7f3966c80d77272f8b4f430babc99f6595 Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Tue, 2 Aug 2011 12:32:03 +0100
Subject: dm raid: tidy includes

A dm target only needs to use include/linux dm headers.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index efa960ff5ba4..d4e95b2e39f6 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -9,9 +9,10 @@
 
 #include "md.h"
 #include "raid5.h"
-#include "dm.h"
 #include "bitmap.h"
 
+#include <linux/device-mapper.h>
+
 #define DM_MSG_PREFIX "raid"
 
 /*
-- 
cgit v1.2.3


From 08649012545cfb116798260352547cf4d47064ec Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 2 Aug 2011 12:32:04 +0100
Subject: dm table: clean dm_get_device and move exports

There is no need for __table_get_device to be factored out.
Also move the exports to the end of their respective functions.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c | 33 +++++++++++++--------------------
 1 file changed, 13 insertions(+), 20 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 8dc67555e736..d58b200a3531 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -157,6 +157,7 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size)
 
 	return addr;
 }
+EXPORT_SYMBOL(dm_vcalloc);
 
 /*
  * highs, and targets are managed as dynamic arrays during a
@@ -277,6 +278,7 @@ void dm_table_get(struct dm_table *t)
 {
 	atomic_inc(&t->holders);
 }
+EXPORT_SYMBOL(dm_table_get);
 
 void dm_table_put(struct dm_table *t)
 {
@@ -286,6 +288,7 @@ void dm_table_put(struct dm_table *t)
 	smp_mb__before_atomic_dec();
 	atomic_dec(&t->holders);
 }
+EXPORT_SYMBOL(dm_table_put);
 
 /*
  * Checks to see if we need to extend highs or targets.
@@ -451,13 +454,14 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
  * Add a device to the list, or just increment the usage count if
  * it's already present.
  */
-static int __table_get_device(struct dm_table *t, struct dm_target *ti,
-		      const char *path, fmode_t mode, struct dm_dev **result)
+int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
+		  struct dm_dev **result)
 {
 	int r;
 	dev_t uninitialized_var(dev);
 	struct dm_dev_internal *dd;
 	unsigned int major, minor;
+	struct dm_table *t = ti->table;
 
 	BUG_ON(!t);
 
@@ -505,6 +509,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
 	*result = &dd->dm_dev;
 	return 0;
 }
+EXPORT_SYMBOL(dm_get_device);
 
 int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
 			 sector_t start, sector_t len, void *data)
@@ -543,15 +548,8 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
 }
 EXPORT_SYMBOL_GPL(dm_set_device_limits);
 
-int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
-		  struct dm_dev **result)
-{
-	return __table_get_device(ti->table, ti, path, mode, result);
-}
-
-
 /*
- * Decrement a devices use count and remove it if necessary.
+ * Decrement a device's use count and remove it if necessary.
  */
 void dm_put_device(struct dm_target *ti, struct dm_dev *d)
 {
@@ -564,6 +562,7 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d)
 		kfree(dd);
 	}
 }
+EXPORT_SYMBOL(dm_put_device);
 
 /*
  * Checks to see if the target joins onto the end of the table.
@@ -1074,11 +1073,13 @@ void dm_table_event(struct dm_table *t)
 		t->event_fn(t->event_context);
 	mutex_unlock(&_event_lock);
 }
+EXPORT_SYMBOL(dm_table_event);
 
 sector_t dm_table_get_size(struct dm_table *t)
 {
 	return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
 }
+EXPORT_SYMBOL(dm_table_get_size);
 
 struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
 {
@@ -1234,6 +1235,7 @@ fmode_t dm_table_get_mode(struct dm_table *t)
 {
 	return t->mode;
 }
+EXPORT_SYMBOL(dm_table_get_mode);
 
 static void suspend_targets(struct dm_table *t, unsigned postsuspend)
 {
@@ -1342,6 +1344,7 @@ struct mapped_device *dm_table_get_md(struct dm_table *t)
 {
 	return t->md;
 }
+EXPORT_SYMBOL(dm_table_get_md);
 
 static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
 				  sector_t start, sector_t len, void *data)
@@ -1379,13 +1382,3 @@ bool dm_table_supports_discards(struct dm_table *t)
 
 	return 0;
 }
-
-EXPORT_SYMBOL(dm_vcalloc);
-EXPORT_SYMBOL(dm_get_device);
-EXPORT_SYMBOL(dm_put_device);
-EXPORT_SYMBOL(dm_table_event);
-EXPORT_SYMBOL(dm_table_get_size);
-EXPORT_SYMBOL(dm_table_get_mode);
-EXPORT_SYMBOL(dm_table_get_md);
-EXPORT_SYMBOL(dm_table_put);
-EXPORT_SYMBOL(dm_table_get);
-- 
cgit v1.2.3


From d5b9dd04bd74b774b8e8d93ced7a0d15ad403fa9 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 2 Aug 2011 12:32:04 +0100
Subject: dm: ignore merge_bvec for snapshots when safe

Add a new flag DMF_MERGE_IS_OPTIONAL to struct mapped_device to indicate
whether the device can accept bios larger than the size its merge
function returns.  When set, use this to send large bios to snapshots
which can split them if necessary.  Snapshot I/O may be significantly
fragmented and this approach seems to improve peformance.

Before the patch, dm_set_device_limits restricted bio size to page size
if the underlying device had a merge function and the target didn't
provide a merge function.  After the patch, dm_set_device_limits
restricts bio size to page size if the underlying device has a merge
function, doesn't have DMF_MERGE_IS_OPTIONAL flag and the target doesn't
provide a merge function.

The snapshot target can't provide a merge function because when the merge
function is called, it is impossible to determine where the bio will be
remapped.  Previously this led us to impose a 4k limit, which we can
now remove if the snapshot store is located on a device without a merge
function.  Together with another patch for optimizing full chunk writes,
it improves performance from 29MB/s to 40MB/s when writing to the
filesystem on snapshot store.

If the snapshot store is placed on a non-dm device with a merge function
(such as md-raid), device mapper still limits all bios to page size.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c |  3 +--
 drivers/md/dm.c       | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/dm.h       |  2 ++
 3 files changed, 64 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index d58b200a3531..03516c7f5be1 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -540,8 +540,7 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
 	 * If not we'll force DM to use PAGE_SIZE or
 	 * smaller I/O, just to be safe.
 	 */
-
-	if (q->merge_bvec_fn && !ti->type->merge)
+	if (dm_queue_merge_is_compulsory(q) && !ti->type->merge)
 		blk_limits_max_hw_sectors(limits,
 					  (unsigned int) (PAGE_SIZE >> 9));
 	return 0;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index aeb0fa1ccfe4..1000eaf984ef 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -111,6 +111,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
 #define DMF_FREEING 3
 #define DMF_DELETING 4
 #define DMF_NOFLUSH_SUSPENDING 5
+#define DMF_MERGE_IS_OPTIONAL 6
 
 /*
  * Work processed by per-device workqueue.
@@ -1992,6 +1993,59 @@ static void __set_size(struct mapped_device *md, sector_t size)
 	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
 }
 
+/*
+ * Return 1 if the queue has a compulsory merge_bvec_fn function.
+ *
+ * If this function returns 0, then the device is either a non-dm
+ * device without a merge_bvec_fn, or it is a dm device that is
+ * able to split any bios it receives that are too big.
+ */
+int dm_queue_merge_is_compulsory(struct request_queue *q)
+{
+	struct mapped_device *dev_md;
+
+	if (!q->merge_bvec_fn)
+		return 0;
+
+	if (q->make_request_fn == dm_request) {
+		dev_md = q->queuedata;
+		if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
+			return 0;
+	}
+
+	return 1;
+}
+
+static int dm_device_merge_is_compulsory(struct dm_target *ti,
+					 struct dm_dev *dev, sector_t start,
+					 sector_t len, void *data)
+{
+	struct block_device *bdev = dev->bdev;
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	return dm_queue_merge_is_compulsory(q);
+}
+
+/*
+ * Return 1 if it is acceptable to ignore merge_bvec_fn based
+ * on the properties of the underlying devices.
+ */
+static int dm_table_merge_is_optional(struct dm_table *table)
+{
+	unsigned i = 0;
+	struct dm_target *ti;
+
+	while (i < dm_table_get_num_targets(table)) {
+		ti = dm_table_get_target(table, i++);
+
+		if (ti->type->iterate_devices &&
+		    ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
+			return 0;
+	}
+
+	return 1;
+}
+
 /*
  * Returns old map, which caller must destroy.
  */
@@ -2002,6 +2056,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 	struct request_queue *q = md->queue;
 	sector_t size;
 	unsigned long flags;
+	int merge_is_optional;
 
 	size = dm_table_get_size(t);
 
@@ -2027,10 +2082,16 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 
 	__bind_mempools(md, t);
 
+	merge_is_optional = dm_table_merge_is_optional(t);
+
 	write_lock_irqsave(&md->map_lock, flags);
 	old_map = md->map;
 	md->map = t;
 	dm_table_set_restrictions(t, q, limits);
+	if (merge_is_optional)
+		set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
+	else
+		clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
 	write_unlock_irqrestore(&md->map_lock, flags);
 
 	return old_map;
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 1aaf16746da8..6745dbd278a4 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -66,6 +66,8 @@ int dm_table_alloc_md_mempools(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
 
+int dm_queue_merge_is_compulsory(struct request_queue *q);
+
 void dm_lock_md_type(struct mapped_device *md);
 void dm_unlock_md_type(struct mapped_device *md);
 void dm_set_md_type(struct mapped_device *md, unsigned type);
-- 
cgit v1.2.3


From a6e50b409d3f9e0833e69c3c9cca822e8fa4adbb Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 2 Aug 2011 12:32:04 +0100
Subject: dm snapshot: skip reading origin when overwriting complete chunk

If we write a full chunk in the snapshot, skip reading the origin device
because the whole chunk will be overwritten anyway.

This patch changes the snapshot write logic when a full chunk is written.
In this case:
  1. allocate the exception
  2. dispatch the bio (but don't report the bio completion to device mapper)
  3. write the exception record
  4. report bio completed

Callbacks must be done through the kcopyd thread, because callbacks must not
race with each other.  So we create two new functions:

  dm_kcopyd_prepare_callback: allocate a job structure and prepare the callback.
  (This function must not be called from interrupt context.)

  dm_kcopyd_do_callback: submit callback.
  (This function may be called from interrupt context.)

Performance test (on snapshots with 4k chunk size):
  without the patch:
    non-direct-io sequential write (dd):    17.7MB/s
    direct-io sequential write (dd):        20.9MB/s
    non-direct-io random write (mkfs.ext2): 0.44s

  with the patch:
    non-direct-io sequential write (dd):    26.5MB/s
    direct-io sequential write (dd):        33.2MB/s
    non-direct-io random write (mkfs.ext2): 0.27s

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-kcopyd.c    | 31 ++++++++++++++++++++++++
 drivers/md/dm-snap.c      | 60 ++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/dm-kcopyd.h | 15 ++++++++++++
 3 files changed, 103 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 98725e119324..f82147029636 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -617,6 +617,37 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 }
 EXPORT_SYMBOL(dm_kcopyd_copy);
 
+void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
+				 dm_kcopyd_notify_fn fn, void *context)
+{
+	struct kcopyd_job *job;
+
+	job = mempool_alloc(kc->job_pool, GFP_NOIO);
+
+	memset(job, 0, sizeof(struct kcopyd_job));
+	job->kc = kc;
+	job->fn = fn;
+	job->context = context;
+
+	atomic_inc(&kc->nr_jobs);
+
+	return job;
+}
+EXPORT_SYMBOL(dm_kcopyd_prepare_callback);
+
+void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err)
+{
+	struct kcopyd_job *job = j;
+	struct dm_kcopyd_client *kc = job->kc;
+
+	job->read_err = read_err;
+	job->write_err = write_err;
+
+	push(&kc->complete_jobs, job);
+	wake(kc);
+}
+EXPORT_SYMBOL(dm_kcopyd_do_callback);
+
 /*
  * Cancels a kcopyd job, eg. someone might be deactivating a
  * mirror.
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 94dee05dd28e..6f758870fc19 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -170,6 +170,13 @@ struct dm_snap_pending_exception {
 	 * kcopyd.
 	 */
 	int started;
+
+	/*
+	 * For writing a complete chunk, bypassing the copy.
+	 */
+	struct bio *full_bio;
+	bio_end_io_t *full_bio_end_io;
+	void *full_bio_private;
 };
 
 /*
@@ -1369,6 +1376,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
 	struct dm_snapshot *s = pe->snap;
 	struct bio *origin_bios = NULL;
 	struct bio *snapshot_bios = NULL;
+	struct bio *full_bio = NULL;
 	int error = 0;
 
 	if (!success) {
@@ -1408,6 +1416,11 @@ out:
 	dm_remove_exception(&pe->e);
 	snapshot_bios = bio_list_get(&pe->snapshot_bios);
 	origin_bios = bio_list_get(&pe->origin_bios);
+	full_bio = pe->full_bio;
+	if (full_bio) {
+		full_bio->bi_end_io = pe->full_bio_end_io;
+		full_bio->bi_private = pe->full_bio_private;
+	}
 	free_pending_exception(pe);
 
 	increment_pending_exceptions_done_count();
@@ -1415,10 +1428,15 @@ out:
 	up_write(&s->lock);
 
 	/* Submit any pending write bios */
-	if (error)
+	if (error) {
+		if (full_bio)
+			bio_io_error(full_bio);
 		error_bios(snapshot_bios);
-	else
+	} else {
+		if (full_bio)
+			bio_endio(full_bio, 0);
 		flush_bios(snapshot_bios);
+	}
 
 	retry_origin_bios(s, origin_bios);
 }
@@ -1472,6 +1490,32 @@ static void start_copy(struct dm_snap_pending_exception *pe)
 	dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
 }
 
+static void full_bio_end_io(struct bio *bio, int error)
+{
+	void *callback_data = bio->bi_private;
+
+	dm_kcopyd_do_callback(callback_data, 0, error ? 1 : 0);
+}
+
+static void start_full_bio(struct dm_snap_pending_exception *pe,
+			   struct bio *bio)
+{
+	struct dm_snapshot *s = pe->snap;
+	void *callback_data;
+
+	pe->full_bio = bio;
+	pe->full_bio_end_io = bio->bi_end_io;
+	pe->full_bio_private = bio->bi_private;
+
+	callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
+						   copy_callback, pe);
+
+	bio->bi_end_io = full_bio_end_io;
+	bio->bi_private = callback_data;
+
+	generic_make_request(bio);
+}
+
 static struct dm_snap_pending_exception *
 __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
 {
@@ -1507,6 +1551,7 @@ __find_pending_exception(struct dm_snapshot *s,
 	bio_list_init(&pe->origin_bios);
 	bio_list_init(&pe->snapshot_bios);
 	pe->started = 0;
+	pe->full_bio = NULL;
 
 	if (s->store->type->prepare_exception(s->store, &pe->e)) {
 		free_pending_exception(pe);
@@ -1600,10 +1645,19 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
 		}
 
 		remap_exception(s, &pe->e, bio, chunk);
-		bio_list_add(&pe->snapshot_bios, bio);
 
 		r = DM_MAPIO_SUBMITTED;
 
+		if (!pe->started &&
+		    bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) {
+			pe->started = 1;
+			up_write(&s->lock);
+			start_full_bio(pe, bio);
+			goto out;
+		}
+
+		bio_list_add(&pe->snapshot_bios, bio);
+
 		if (!pe->started) {
 			/* this is protected by snap->lock */
 			pe->started = 1;
diff --git a/include/linux/dm-kcopyd.h b/include/linux/dm-kcopyd.h
index 298d587e349b..5e54458e920f 100644
--- a/include/linux/dm-kcopyd.h
+++ b/include/linux/dm-kcopyd.h
@@ -42,5 +42,20 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 		   unsigned num_dests, struct dm_io_region *dests,
 		   unsigned flags, dm_kcopyd_notify_fn fn, void *context);
 
+/*
+ * Prepare a callback and submit it via the kcopyd thread.
+ *
+ * dm_kcopyd_prepare_callback allocates a callback structure and returns it.
+ * It must not be called from interrupt context.
+ * The returned value should be passed into dm_kcopyd_do_callback.
+ *
+ * dm_kcopyd_do_callback submits the callback.
+ * It may be called from interrupt context.
+ * The callback is issued from the kcopyd thread.
+ */
+void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
+				 dm_kcopyd_notify_fn fn, void *context);
+void dm_kcopyd_do_callback(void *job, int read_err, unsigned long write_err);
+
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_DM_KCOPYD_H */
-- 
cgit v1.2.3


From 498f0103ea13123e007660def9072a0b7dd1c599 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 2 Aug 2011 12:32:04 +0100
Subject: dm table: share target argument parsing functions

Move multipath target argument parsing code into dm-table so other
targets can share it.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c         |   7 +-
 drivers/md/dm-mpath.c         | 152 ++++++++++++------------------------------
 drivers/md/dm-table.c         |  57 ++++++++++++++++
 include/linux/device-mapper.h |  43 ++++++++++++
 4 files changed, 147 insertions(+), 112 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index f5406766ece3..b79e7472a9b3 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -30,7 +30,6 @@
 #include <linux/device-mapper.h>
 
 #define DM_MSG_PREFIX "crypt"
-#define MESG_STR(x) x, sizeof(x)
 
 /*
  * context holding the current state of a multi-part conversion
@@ -1770,12 +1769,12 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
 	if (argc < 2)
 		goto error;
 
-	if (!strnicmp(argv[0], MESG_STR("key"))) {
+	if (!strcasecmp(argv[0], "key")) {
 		if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) {
 			DMWARN("not suspended during key manipulation.");
 			return -EINVAL;
 		}
-		if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) {
+		if (argc == 3 && !strcasecmp(argv[1], "set")) {
 			ret = crypt_set_key(cc, argv[2]);
 			if (ret)
 				return ret;
@@ -1783,7 +1782,7 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
 				ret = cc->iv_gen_ops->init(cc);
 			return ret;
 		}
-		if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) {
+		if (argc == 2 && !strcasecmp(argv[1], "wipe")) {
 			if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
 				ret = cc->iv_gen_ops->wipe(cc);
 				if (ret)
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index adf851a081bd..5e0090ef4182 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -22,7 +22,6 @@
 #include <linux/atomic.h>
 
 #define DM_MSG_PREFIX "multipath"
-#define MESG_STR(x) x, sizeof(x)
 #define DM_PG_INIT_DELAY_MSECS 2000
 #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
 
@@ -505,80 +504,29 @@ static void trigger_event(struct work_struct *work)
  *      <#paths> <#per-path selector args>
  *         [<path> [<arg>]* ]+ ]+
  *---------------------------------------------------------------*/
-struct param {
-	unsigned min;
-	unsigned max;
-	char *error;
-};
-
-static int read_param(struct param *param, char *str, unsigned *v, char **error)
-{
-	if (!str ||
-	    (sscanf(str, "%u", v) != 1) ||
-	    (*v < param->min) ||
-	    (*v > param->max)) {
-		*error = param->error;
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-struct arg_set {
-	unsigned argc;
-	char **argv;
-};
-
-static char *shift(struct arg_set *as)
-{
-	char *r;
-
-	if (as->argc) {
-		as->argc--;
-		r = *as->argv;
-		as->argv++;
-		return r;
-	}
-
-	return NULL;
-}
-
-static void consume(struct arg_set *as, unsigned n)
-{
-	BUG_ON (as->argc < n);
-	as->argc -= n;
-	as->argv += n;
-}
-
-static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
+static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
 			       struct dm_target *ti)
 {
 	int r;
 	struct path_selector_type *pst;
 	unsigned ps_argc;
 
-	static struct param _params[] = {
+	static struct dm_arg _args[] = {
 		{0, 1024, "invalid number of path selector args"},
 	};
 
-	pst = dm_get_path_selector(shift(as));
+	pst = dm_get_path_selector(dm_shift_arg(as));
 	if (!pst) {
 		ti->error = "unknown path selector type";
 		return -EINVAL;
 	}
 
-	r = read_param(_params, shift(as), &ps_argc, &ti->error);
+	r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
 	if (r) {
 		dm_put_path_selector(pst);
 		return -EINVAL;
 	}
 
-	if (ps_argc > as->argc) {
-		dm_put_path_selector(pst);
-		ti->error = "not enough arguments for path selector";
-		return -EINVAL;
-	}
-
 	r = pst->create(&pg->ps, ps_argc, as->argv);
 	if (r) {
 		dm_put_path_selector(pst);
@@ -587,12 +535,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
 	}
 
 	pg->ps.type = pst;
-	consume(as, ps_argc);
+	dm_consume_args(as, ps_argc);
 
 	return 0;
 }
 
-static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
+static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
 			       struct dm_target *ti)
 {
 	int r;
@@ -609,7 +557,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
 	if (!p)
 		return ERR_PTR(-ENOMEM);
 
-	r = dm_get_device(ti, shift(as), dm_table_get_mode(ti->table),
+	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
 			  &p->path.dev);
 	if (r) {
 		ti->error = "error getting device";
@@ -660,16 +608,16 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
 	return ERR_PTR(r);
 }
 
-static struct priority_group *parse_priority_group(struct arg_set *as,
+static struct priority_group *parse_priority_group(struct dm_arg_set *as,
 						   struct multipath *m)
 {
-	static struct param _params[] = {
+	static struct dm_arg _args[] = {
 		{1, 1024, "invalid number of paths"},
 		{0, 1024, "invalid number of selector args"}
 	};
 
 	int r;
-	unsigned i, nr_selector_args, nr_params;
+	unsigned i, nr_selector_args, nr_args;
 	struct priority_group *pg;
 	struct dm_target *ti = m->ti;
 
@@ -693,26 +641,26 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
 	/*
 	 * read the paths
 	 */
-	r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error);
+	r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
 	if (r)
 		goto bad;
 
-	r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error);
+	r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
 	if (r)
 		goto bad;
 
-	nr_params = 1 + nr_selector_args;
+	nr_args = 1 + nr_selector_args;
 	for (i = 0; i < pg->nr_pgpaths; i++) {
 		struct pgpath *pgpath;
-		struct arg_set path_args;
+		struct dm_arg_set path_args;
 
-		if (as->argc < nr_params) {
+		if (as->argc < nr_args) {
 			ti->error = "not enough path parameters";
 			r = -EINVAL;
 			goto bad;
 		}
 
-		path_args.argc = nr_params;
+		path_args.argc = nr_args;
 		path_args.argv = as->argv;
 
 		pgpath = parse_path(&path_args, &pg->ps, ti);
@@ -723,7 +671,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
 
 		pgpath->pg = pg;
 		list_add_tail(&pgpath->list, &pg->pgpaths);
-		consume(as, nr_params);
+		dm_consume_args(as, nr_args);
 	}
 
 	return pg;
@@ -733,28 +681,23 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
 	return ERR_PTR(r);
 }
 
-static int parse_hw_handler(struct arg_set *as, struct multipath *m)
+static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
 {
 	unsigned hw_argc;
 	int ret;
 	struct dm_target *ti = m->ti;
 
-	static struct param _params[] = {
+	static struct dm_arg _args[] = {
 		{0, 1024, "invalid number of hardware handler args"},
 	};
 
-	if (read_param(_params, shift(as), &hw_argc, &ti->error))
+	if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
 		return -EINVAL;
 
 	if (!hw_argc)
 		return 0;
 
-	if (hw_argc > as->argc) {
-		ti->error = "not enough arguments for hardware handler";
-		return -EINVAL;
-	}
-
-	m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL);
+	m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
 	request_module("scsi_dh_%s", m->hw_handler_name);
 	if (scsi_dh_handler_exist(m->hw_handler_name) == 0) {
 		ti->error = "unknown hardware handler type";
@@ -778,7 +721,7 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m)
 		for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
 			j = sprintf(p, "%s", as->argv[i]);
 	}
-	consume(as, hw_argc - 1);
+	dm_consume_args(as, hw_argc - 1);
 
 	return 0;
 fail:
@@ -787,52 +730,45 @@ fail:
 	return ret;
 }
 
-static int parse_features(struct arg_set *as, struct multipath *m)
+static int parse_features(struct dm_arg_set *as, struct multipath *m)
 {
 	int r;
 	unsigned argc;
 	struct dm_target *ti = m->ti;
-	const char *param_name;
+	const char *arg_name;
 
-	static struct param _params[] = {
+	static struct dm_arg _args[] = {
 		{0, 5, "invalid number of feature args"},
 		{1, 50, "pg_init_retries must be between 1 and 50"},
 		{0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
 	};
 
-	r = read_param(_params, shift(as), &argc, &ti->error);
+	r = dm_read_arg_group(_args, as, &argc, &ti->error);
 	if (r)
 		return -EINVAL;
 
 	if (!argc)
 		return 0;
 
-	if (argc > as->argc) {
-		ti->error = "not enough arguments for features";
-		return -EINVAL;
-	}
-
 	do {
-		param_name = shift(as);
+		arg_name = dm_shift_arg(as);
 		argc--;
 
-		if (!strnicmp(param_name, MESG_STR("queue_if_no_path"))) {
+		if (!strcasecmp(arg_name, "queue_if_no_path")) {
 			r = queue_if_no_path(m, 1, 0);
 			continue;
 		}
 
-		if (!strnicmp(param_name, MESG_STR("pg_init_retries")) &&
+		if (!strcasecmp(arg_name, "pg_init_retries") &&
 		    (argc >= 1)) {
-			r = read_param(_params + 1, shift(as),
-				       &m->pg_init_retries, &ti->error);
+			r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
 			argc--;
 			continue;
 		}
 
-		if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) &&
+		if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
 		    (argc >= 1)) {
-			r = read_param(_params + 2, shift(as),
-				       &m->pg_init_delay_msecs, &ti->error);
+			r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
 			argc--;
 			continue;
 		}
@@ -847,15 +783,15 @@ static int parse_features(struct arg_set *as, struct multipath *m)
 static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 			 char **argv)
 {
-	/* target parameters */
-	static struct param _params[] = {
+	/* target arguments */
+	static struct dm_arg _args[] = {
 		{0, 1024, "invalid number of priority groups"},
 		{0, 1024, "invalid initial priority group number"},
 	};
 
 	int r;
 	struct multipath *m;
-	struct arg_set as;
+	struct dm_arg_set as;
 	unsigned pg_count = 0;
 	unsigned next_pg_num;
 
@@ -876,11 +812,11 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 	if (r)
 		goto bad;
 
-	r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error);
+	r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
 	if (r)
 		goto bad;
 
-	r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error);
+	r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
 	if (r)
 		goto bad;
 
@@ -1510,10 +1446,10 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
 	}
 
 	if (argc == 1) {
-		if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) {
+		if (!strcasecmp(argv[0], "queue_if_no_path")) {
 			r = queue_if_no_path(m, 1, 0);
 			goto out;
-		} else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) {
+		} else if (!strcasecmp(argv[0], "fail_if_no_path")) {
 			r = queue_if_no_path(m, 0, 0);
 			goto out;
 		}
@@ -1524,18 +1460,18 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
 		goto out;
 	}
 
-	if (!strnicmp(argv[0], MESG_STR("disable_group"))) {
+	if (!strcasecmp(argv[0], "disable_group")) {
 		r = bypass_pg_num(m, argv[1], 1);
 		goto out;
-	} else if (!strnicmp(argv[0], MESG_STR("enable_group"))) {
+	} else if (!strcasecmp(argv[0], "enable_group")) {
 		r = bypass_pg_num(m, argv[1], 0);
 		goto out;
-	} else if (!strnicmp(argv[0], MESG_STR("switch_group"))) {
+	} else if (!strcasecmp(argv[0], "switch_group")) {
 		r = switch_pg_num(m, argv[1]);
 		goto out;
-	} else if (!strnicmp(argv[0], MESG_STR("reinstate_path")))
+	} else if (!strcasecmp(argv[0], "reinstate_path"))
 		action = reinstate_path;
-	else if (!strnicmp(argv[0], MESG_STR("fail_path")))
+	else if (!strcasecmp(argv[0], "fail_path"))
 		action = fail_path;
 	else {
 		DMWARN("Unrecognised multipath message received.");
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 03516c7f5be1..259ce99302fc 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -797,6 +797,63 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 	return r;
 }
 
+/*
+ * Target argument parsing helpers.
+ */
+static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
+			     unsigned *value, char **error, unsigned grouped)
+{
+	const char *arg_str = dm_shift_arg(arg_set);
+
+	if (!arg_str ||
+	    (sscanf(arg_str, "%u", value) != 1) ||
+	    (*value < arg->min) ||
+	    (*value > arg->max) ||
+	    (grouped && arg_set->argc < *value)) {
+		*error = arg->error;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
+		unsigned *value, char **error)
+{
+	return validate_next_arg(arg, arg_set, value, error, 0);
+}
+EXPORT_SYMBOL(dm_read_arg);
+
+int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set,
+		      unsigned *value, char **error)
+{
+	return validate_next_arg(arg, arg_set, value, error, 1);
+}
+EXPORT_SYMBOL(dm_read_arg_group);
+
+const char *dm_shift_arg(struct dm_arg_set *as)
+{
+	char *r;
+
+	if (as->argc) {
+		as->argc--;
+		r = *as->argv;
+		as->argv++;
+		return r;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(dm_shift_arg);
+
+void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
+{
+	BUG_ON(as->argc < num_args);
+	as->argc -= num_args;
+	as->argv += num_args;
+}
+EXPORT_SYMBOL(dm_consume_args);
+
 static int dm_table_set_type(struct dm_table *t)
 {
 	unsigned i;
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 4427e0454051..3fa1f3d90ce0 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -208,6 +208,49 @@ struct dm_target_callbacks {
 int dm_register_target(struct target_type *t);
 void dm_unregister_target(struct target_type *t);
 
+/*
+ * Target argument parsing.
+ */
+struct dm_arg_set {
+	unsigned argc;
+	char **argv;
+};
+
+/*
+ * The minimum and maximum value of a numeric argument, together with
+ * the error message to use if the number is found to be outside that range.
+ */
+struct dm_arg {
+	unsigned min;
+	unsigned max;
+	char *error;
+};
+
+/*
+ * Validate the next argument, either returning it as *value or, if invalid,
+ * returning -EINVAL and setting *error.
+ */
+int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
+		unsigned *value, char **error);
+
+/*
+ * Process the next argument as the start of a group containing between
+ * arg->min and arg->max further arguments. Either return the size as
+ * *num_args or, if invalid, return -EINVAL and set *error.
+ */
+int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set,
+		      unsigned *num_args, char **error);
+
+/*
+ * Return the current argument and shift to the next.
+ */
+const char *dm_shift_arg(struct dm_arg_set *as);
+
+/*
+ * Move through num_args arguments.
+ */
+void dm_consume_args(struct dm_arg_set *as, unsigned num_args);
+
 /*-----------------------------------------------------------------
  * Functions for creating and manipulating mapped devices.
  * Drop the reference with dm_put when you finish with the object.
-- 
cgit v1.2.3


From 30e4171bfe3d1c49689803338005cc0071dddaff Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 2 Aug 2011 12:32:05 +0100
Subject: dm flakey: use dm_target_offset and support discards

Use dm_target_offset() and support discards.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-flakey.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index ea790623c30b..70a69b2f93d2 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -79,6 +79,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	ti->num_flush_requests = 1;
+	ti->num_discard_requests = 1;
 	ti->private = fc;
 	return 0;
 
@@ -99,7 +100,7 @@ static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector)
 {
 	struct flakey_c *fc = ti->private;
 
-	return fc->start + (bi_sector - ti->begin);
+	return fc->start + dm_target_offset(ti, bi_sector);
 }
 
 static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
-- 
cgit v1.2.3


From dfd068b01f02653c6650f1c0eda443b2655d1471 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 2 Aug 2011 12:32:05 +0100
Subject: dm flakey: support feature args

Add the ability to specify arbitrary feature flags when creating a
flakey target.  This code uses the same target argument helpers that
the multipath target does.

Also remove the superfluous 'dm-flakey' prefixes from the error messages,
as they already contain the prefix 'flakey'.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-flakey.c | 83 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 64 insertions(+), 19 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 70a69b2f93d2..dd963dc3ca08 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -27,54 +27,99 @@ struct flakey_c {
 	unsigned down_interval;
 };
 
+static int parse_features(struct dm_arg_set *as, struct dm_target *ti)
+{
+	int r;
+	unsigned argc;
+	const char *arg_name;
+
+	static struct dm_arg _args[] = {
+		{0, 0, "Invalid number of feature args"},
+	};
+
+	/* No feature arguments supplied. */
+	if (!as->argc)
+		return 0;
+
+	r = dm_read_arg_group(_args, as, &argc, &ti->error);
+	if (r)
+		return -EINVAL;
+
+	while (argc && !r) {
+		arg_name = dm_shift_arg(as);
+		argc--;
+
+		ti->error = "Unrecognised flakey feature requested";
+		r = -EINVAL;
+	}
+
+	return r;
+}
+
 /*
- * Construct a flakey mapping: <dev_path> <offset> <up interval> <down interval>
+ * Construct a flakey mapping:
+ * <dev_path> <offset> <up interval> <down interval> [<#feature args> [<arg>]*]
  */
 static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
+	static struct dm_arg _args[] = {
+		{0, UINT_MAX, "Invalid up interval"},
+		{0, UINT_MAX, "Invalid down interval"},
+	};
+
+	int r;
 	struct flakey_c *fc;
-	unsigned long long tmp;
+	unsigned long long tmpll;
+	struct dm_arg_set as;
+	const char *devname;
 
-	if (argc != 4) {
-		ti->error = "dm-flakey: Invalid argument count";
+	as.argc = argc;
+	as.argv = argv;
+
+	if (argc < 4) {
+		ti->error = "Invalid argument count";
 		return -EINVAL;
 	}
 
 	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
 	if (!fc) {
-		ti->error = "dm-flakey: Cannot allocate linear context";
+		ti->error = "Cannot allocate linear context";
 		return -ENOMEM;
 	}
 	fc->start_time = jiffies;
 
-	if (sscanf(argv[1], "%llu", &tmp) != 1) {
-		ti->error = "dm-flakey: Invalid device sector";
+	devname = dm_shift_arg(&as);
+
+	if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) {
+		ti->error = "Invalid device sector";
 		goto bad;
 	}
-	fc->start = tmp;
+	fc->start = tmpll;
 
-	if (sscanf(argv[2], "%u", &fc->up_interval) != 1) {
-		ti->error = "dm-flakey: Invalid up interval";
+	r = dm_read_arg(_args, &as, &fc->up_interval, &ti->error);
+	if (r)
 		goto bad;
-	}
 
-	if (sscanf(argv[3], "%u", &fc->down_interval) != 1) {
-		ti->error = "dm-flakey: Invalid down interval";
+	r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error);
+	if (r)
 		goto bad;
-	}
 
 	if (!(fc->up_interval + fc->down_interval)) {
-		ti->error = "dm-flakey: Total (up + down) interval is zero";
+		ti->error = "Total (up + down) interval is zero";
 		goto bad;
 	}
 
 	if (fc->up_interval + fc->down_interval < fc->up_interval) {
-		ti->error = "dm-flakey: Interval overflow";
+		ti->error = "Interval overflow";
 		goto bad;
 	}
 
-	if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &fc->dev)) {
-		ti->error = "dm-flakey: Device lookup failed";
+	r = parse_features(&as, ti);
+	if (r)
+		goto bad;
+
+	if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &fc->dev)) {
+		ti->error = "Device lookup failed";
 		goto bad;
 	}
 
@@ -178,7 +223,7 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_
 
 static struct target_type flakey_target = {
 	.name   = "flakey",
-	.version = {1, 1, 0},
+	.version = {1, 2, 0},
 	.module = THIS_MODULE,
 	.ctr    = flakey_ctr,
 	.dtr    = flakey_dtr,
-- 
cgit v1.2.3


From b26f5e3d7127487e934758c1fbe05d683b082cb0 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 2 Aug 2011 12:32:05 +0100
Subject: dm flakey: add drop_writes

Add 'drop_writes' option to drop writes silently while the
device is 'down'.  Reads are not touched.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 Documentation/device-mapper/dm-flakey.txt | 29 +++++++++++---
 drivers/md/dm-flakey.c                    | 63 +++++++++++++++++++++++++++----
 2 files changed, 78 insertions(+), 14 deletions(-)

(limited to 'drivers')

diff --git a/Documentation/device-mapper/dm-flakey.txt b/Documentation/device-mapper/dm-flakey.txt
index c8efdfd19a65..1b66c868ee7e 100644
--- a/Documentation/device-mapper/dm-flakey.txt
+++ b/Documentation/device-mapper/dm-flakey.txt
@@ -1,17 +1,34 @@
 dm-flakey
 =========
 
-This target is the same as the linear target except that it returns I/O
-errors periodically.  It's been found useful in simulating failing
-devices for testing purposes.
+This target is the same as the linear target except that it exhibits
+unreliable behaviour periodically.  It's been found useful in simulating
+failing devices for testing purposes.
 
 Starting from the time the table is loaded, the device is available for
-<up interval> seconds, then returns errors for <down interval> seconds,
-and then this cycle repeats.
+<up interval> seconds, then exhibits unreliable behaviour for <down
+interval> seconds, and then this cycle repeats.
 
-Parameters: <dev path> <offset> <up interval> <down interval>
+Also, consider using this in combination with the dm-delay target too,
+which can delay reads and writes and/or send them to different
+underlying devices.
+
+Table parameters
+----------------
+  <dev path> <offset> <up interval> <down interval> \
+    [<num_features> [<feature arguments>]]
+
+Mandatory parameters:
     <dev path>: Full pathname to the underlying block-device, or a
                 "major:minor" device-number.
     <offset>: Starting sector within the device.
     <up interval>: Number of seconds device is available.
     <down interval>: Number of seconds device returns errors.
+
+Optional feature parameters:
+  If no feature parameters are present, during the periods of
+  unreliability, all I/O returns errors.
+
+  drop_writes:
+	All write I/O is silently ignored.
+	Read I/O is handled correctly.
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index dd963dc3ca08..e7c4c2a64f4b 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -25,16 +25,22 @@ struct flakey_c {
 	sector_t start;
 	unsigned up_interval;
 	unsigned down_interval;
+	unsigned long flags;
 };
 
-static int parse_features(struct dm_arg_set *as, struct dm_target *ti)
+enum feature_flag_bits {
+	DROP_WRITES
+};
+
+static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
+			  struct dm_target *ti)
 {
 	int r;
 	unsigned argc;
 	const char *arg_name;
 
 	static struct dm_arg _args[] = {
-		{0, 0, "Invalid number of feature args"},
+		{0, 1, "Invalid number of feature args"},
 	};
 
 	/* No feature arguments supplied. */
@@ -49,6 +55,18 @@ static int parse_features(struct dm_arg_set *as, struct dm_target *ti)
 		arg_name = dm_shift_arg(as);
 		argc--;
 
+		/*
+		 * drop_writes
+		 */
+		if (!strcasecmp(arg_name, "drop_writes")) {
+			if (test_and_set_bit(DROP_WRITES, &fc->flags)) {
+				ti->error = "Feature drop_writes duplicated";
+				return -EINVAL;
+			}
+
+			continue;
+		}
+
 		ti->error = "Unrecognised flakey feature requested";
 		r = -EINVAL;
 	}
@@ -59,6 +77,9 @@ static int parse_features(struct dm_arg_set *as, struct dm_target *ti)
 /*
  * Construct a flakey mapping:
  * <dev_path> <offset> <up interval> <down interval> [<#feature args> [<arg>]*]
+ *
+ *   Feature args:
+ *     [drop_writes]
  */
 static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
@@ -81,7 +102,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		return -EINVAL;
 	}
 
-	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+	fc = kzalloc(sizeof(*fc), GFP_KERNEL);
 	if (!fc) {
 		ti->error = "Cannot allocate linear context";
 		return -ENOMEM;
@@ -114,7 +135,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
-	r = parse_features(&as, ti);
+	r = parse_features(&as, fc, ti);
 	if (r)
 		goto bad;
 
@@ -162,12 +183,31 @@ static int flakey_map(struct dm_target *ti, struct bio *bio,
 {
 	struct flakey_c *fc = ti->private;
 	unsigned elapsed;
+	unsigned rw;
 
 	/* Are we alive ? */
 	elapsed = (jiffies - fc->start_time) / HZ;
-	if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval)
+	if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) {
+		rw = bio_data_dir(bio);
+
+		/*
+		 * Drop writes.  Map reads as normal.
+		 */
+		if (test_bit(DROP_WRITES, &fc->flags)) {
+			if (rw == WRITE) {
+				bio_endio(bio, 0);
+				return DM_MAPIO_SUBMITTED;
+			}
+			goto map_bio;
+		}
+
+		/*
+		 * Default setting errors all I/O.
+		 */
 		return -EIO;
+	}
 
+map_bio:
 	flakey_map_bio(ti, bio);
 
 	return DM_MAPIO_REMAPPED;
@@ -176,7 +216,9 @@ static int flakey_map(struct dm_target *ti, struct bio *bio,
 static int flakey_status(struct dm_target *ti, status_type_t type,
 			 char *result, unsigned int maxlen)
 {
+	unsigned sz = 0;
 	struct flakey_c *fc = ti->private;
+	unsigned drop_writes;
 
 	switch (type) {
 	case STATUSTYPE_INFO:
@@ -184,9 +226,14 @@ static int flakey_status(struct dm_target *ti, status_type_t type,
 		break;
 
 	case STATUSTYPE_TABLE:
-		snprintf(result, maxlen, "%s %llu %u %u", fc->dev->name,
-			 (unsigned long long)fc->start, fc->up_interval,
-			 fc->down_interval);
+		DMEMIT("%s %llu %u %u ", fc->dev->name,
+		       (unsigned long long)fc->start, fc->up_interval,
+		       fc->down_interval);
+
+		drop_writes = test_bit(DROP_WRITES, &fc->flags);
+		DMEMIT("%u ", drop_writes);
+		if (drop_writes)
+			DMEMIT("drop_writes ");
 		break;
 	}
 	return 0;
-- 
cgit v1.2.3


From a3998799fb4df0b0af8271a7d50c4269032397aa Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 2 Aug 2011 12:32:06 +0100
Subject: dm flakey: add corrupt_bio_byte feature

Add corrupt_bio_byte feature to simulate corruption by overwriting a byte at a
specified position with a specified value during intervals when the device is
"down".

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 Documentation/device-mapper/dm-flakey.txt |  19 ++++
 drivers/md/dm-flakey.c                    | 155 +++++++++++++++++++++++++++---
 2 files changed, 159 insertions(+), 15 deletions(-)

(limited to 'drivers')

diff --git a/Documentation/device-mapper/dm-flakey.txt b/Documentation/device-mapper/dm-flakey.txt
index 1b66c868ee7e..6ff5c2327227 100644
--- a/Documentation/device-mapper/dm-flakey.txt
+++ b/Documentation/device-mapper/dm-flakey.txt
@@ -32,3 +32,22 @@ Optional feature parameters:
   drop_writes:
 	All write I/O is silently ignored.
 	Read I/O is handled correctly.
+
+  corrupt_bio_byte <Nth_byte> <direction> <value> <flags>:
+	During <down interval>, replace <Nth_byte> of the data of
+	each matching bio with <value>.
+
+    <Nth_byte>: The offset of the byte to replace.
+		Counting starts at 1, to replace the first byte.
+    <direction>: Either 'r' to corrupt reads or 'w' to corrupt writes.
+		 'w' is incompatible with drop_writes.
+    <value>: The value (from 0-255) to write.
+    <flags>: Perform the replacement only if bio->bi_rw has all the
+	     selected flags set.
+
+Examples:
+  corrupt_bio_byte 32 r 1 0
+	- replaces the 32nd byte of READ bios with the value 1
+
+  corrupt_bio_byte 224 w 0 32
+	- replaces the 224th byte of REQ_META (=32) bios with the value 0
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index e7c4c2a64f4b..89f73ca22cfa 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2003 Sistina Software (UK) Limited.
- * Copyright (C) 2004, 2010 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004, 2010-2011 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
@@ -15,6 +15,9 @@
 
 #define DM_MSG_PREFIX "flakey"
 
+#define all_corrupt_bio_flags_match(bio, fc)	\
+	(((bio)->bi_rw & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags)
+
 /*
  * Flakey: Used for testing only, simulates intermittent,
  * catastrophic device failure.
@@ -26,6 +29,10 @@ struct flakey_c {
 	unsigned up_interval;
 	unsigned down_interval;
 	unsigned long flags;
+	unsigned corrupt_bio_byte;
+	unsigned corrupt_bio_rw;
+	unsigned corrupt_bio_value;
+	unsigned corrupt_bio_flags;
 };
 
 enum feature_flag_bits {
@@ -40,7 +47,10 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
 	const char *arg_name;
 
 	static struct dm_arg _args[] = {
-		{0, 1, "Invalid number of feature args"},
+		{0, 6, "Invalid number of feature args"},
+		{1, UINT_MAX, "Invalid corrupt bio byte"},
+		{0, 255, "Invalid corrupt value to write into bio byte (0-255)"},
+		{0, UINT_MAX, "Invalid corrupt bio flags mask"},
 	};
 
 	/* No feature arguments supplied. */
@@ -49,9 +59,9 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
 
 	r = dm_read_arg_group(_args, as, &argc, &ti->error);
 	if (r)
-		return -EINVAL;
+		return r;
 
-	while (argc && !r) {
+	while (argc) {
 		arg_name = dm_shift_arg(as);
 		argc--;
 
@@ -67,11 +77,61 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
 			continue;
 		}
 
+		/*
+		 * corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>
+		 */
+		if (!strcasecmp(arg_name, "corrupt_bio_byte")) {
+			if (!argc)
+				ti->error = "Feature corrupt_bio_byte requires parameters";
+
+			r = dm_read_arg(_args + 1, as, &fc->corrupt_bio_byte, &ti->error);
+			if (r)
+				return r;
+			argc--;
+
+			/*
+			 * Direction r or w?
+			 */
+			arg_name = dm_shift_arg(as);
+			if (!strcasecmp(arg_name, "w"))
+				fc->corrupt_bio_rw = WRITE;
+			else if (!strcasecmp(arg_name, "r"))
+				fc->corrupt_bio_rw = READ;
+			else {
+				ti->error = "Invalid corrupt bio direction (r or w)";
+				return -EINVAL;
+			}
+			argc--;
+
+			/*
+			 * Value of byte (0-255) to write in place of correct one.
+			 */
+			r = dm_read_arg(_args + 2, as, &fc->corrupt_bio_value, &ti->error);
+			if (r)
+				return r;
+			argc--;
+
+			/*
+			 * Only corrupt bios with these flags set.
+			 */
+			r = dm_read_arg(_args + 3, as, &fc->corrupt_bio_flags, &ti->error);
+			if (r)
+				return r;
+			argc--;
+
+			continue;
+		}
+
 		ti->error = "Unrecognised flakey feature requested";
-		r = -EINVAL;
+		return -EINVAL;
 	}
 
-	return r;
+	if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) {
+		ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set";
+		return -EINVAL;
+	}
+
+	return 0;
 }
 
 /*
@@ -80,6 +140,11 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
  *
  *   Feature args:
  *     [drop_writes]
+ *     [corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>]
+ *
+ *   Nth_byte starts from 1 for the first byte.
+ *   Direction is r for READ or w for WRITE.
+ *   bio_flags is ignored if 0.
  */
 static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
@@ -178,31 +243,64 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
 		bio->bi_sector = flakey_map_sector(ti, bio->bi_sector);
 }
 
+static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
+{
+	unsigned bio_bytes = bio_cur_bytes(bio);
+	char *data = bio_data(bio);
+
+	/*
+	 * Overwrite the Nth byte of the data returned.
+	 */
+	if (data && bio_bytes >= fc->corrupt_bio_byte) {
+		data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value;
+
+		DMDEBUG("Corrupting data bio=%p by writing %u to byte %u "
+			"(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n",
+			bio, fc->corrupt_bio_value, fc->corrupt_bio_byte,
+			(bio_data_dir(bio) == WRITE) ? 'w' : 'r',
+			bio->bi_rw, (unsigned long long)bio->bi_sector, bio_bytes);
+	}
+}
+
 static int flakey_map(struct dm_target *ti, struct bio *bio,
 		      union map_info *map_context)
 {
 	struct flakey_c *fc = ti->private;
 	unsigned elapsed;
-	unsigned rw;
 
 	/* Are we alive ? */
 	elapsed = (jiffies - fc->start_time) / HZ;
 	if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) {
-		rw = bio_data_dir(bio);
+		/*
+		 * Flag this bio as submitted while down.
+		 */
+		map_context->ll = 1;
+
+		/*
+		 * Map reads as normal.
+		 */
+		if (bio_data_dir(bio) == READ)
+			goto map_bio;
 
 		/*
-		 * Drop writes.  Map reads as normal.
+		 * Drop writes?
 		 */
 		if (test_bit(DROP_WRITES, &fc->flags)) {
-			if (rw == WRITE) {
-				bio_endio(bio, 0);
-				return DM_MAPIO_SUBMITTED;
-			}
+			bio_endio(bio, 0);
+			return DM_MAPIO_SUBMITTED;
+		}
+
+		/*
+		 * Corrupt matching writes.
+		 */
+		if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == WRITE)) {
+			if (all_corrupt_bio_flags_match(bio, fc))
+				corrupt_bio_data(bio, fc);
 			goto map_bio;
 		}
 
 		/*
-		 * Default setting errors all I/O.
+		 * By default, error all I/O.
 		 */
 		return -EIO;
 	}
@@ -213,6 +311,24 @@ map_bio:
 	return DM_MAPIO_REMAPPED;
 }
 
+static int flakey_end_io(struct dm_target *ti, struct bio *bio,
+			 int error, union map_info *map_context)
+{
+	struct flakey_c *fc = ti->private;
+	unsigned bio_submitted_while_down = map_context->ll;
+
+	/*
+	 * Corrupt successful READs while in down state.
+	 * If flags were specified, only corrupt those that match.
+	 */
+	if (!error && bio_submitted_while_down &&
+	    (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) &&
+	    all_corrupt_bio_flags_match(bio, fc))
+		corrupt_bio_data(bio, fc);
+
+	return error;
+}
+
 static int flakey_status(struct dm_target *ti, status_type_t type,
 			 char *result, unsigned int maxlen)
 {
@@ -231,9 +347,17 @@ static int flakey_status(struct dm_target *ti, status_type_t type,
 		       fc->down_interval);
 
 		drop_writes = test_bit(DROP_WRITES, &fc->flags);
-		DMEMIT("%u ", drop_writes);
+		DMEMIT("%u ", drop_writes + (fc->corrupt_bio_byte > 0) * 5);
+
 		if (drop_writes)
 			DMEMIT("drop_writes ");
+
+		if (fc->corrupt_bio_byte)
+			DMEMIT("corrupt_bio_byte %u %c %u %u ",
+			       fc->corrupt_bio_byte,
+			       (fc->corrupt_bio_rw == WRITE) ? 'w' : 'r',
+			       fc->corrupt_bio_value, fc->corrupt_bio_flags);
+
 		break;
 	}
 	return 0;
@@ -275,6 +399,7 @@ static struct target_type flakey_target = {
 	.ctr    = flakey_ctr,
 	.dtr    = flakey_dtr,
 	.map    = flakey_map,
+	.end_io = flakey_end_io,
 	.status = flakey_status,
 	.ioctl	= flakey_ioctl,
 	.merge	= flakey_merge,
-- 
cgit v1.2.3


From 0ddf9644cc26e74ed671525e61a17bdbebf18da6 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 2 Aug 2011 12:32:06 +0100
Subject: dm ioctl: fill in device parameters in more ioctls

Move parameter filling from find_device to __find_device_hash_cell.

This patch causes ioctls using __find_device_hash_cell
(DM_DEV_REMOVE_CMD, DM_DEV_SUSPEND_CMD - resume, DM_TABLE_CLEAR_CMD)
to return device parameters, bringing them into line with the other
ioctls.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c    | 63 +++++++++++++++++++++++++++---------------------
 include/linux/dm-ioctl.h |  4 +--
 2 files changed, 38 insertions(+), 29 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 1622a6bc0bfd..99d38a6925a4 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -719,24 +719,49 @@ static int dev_create(struct dm_ioctl *param, size_t param_size)
 static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
 {
 	struct mapped_device *md;
-	void *mdptr = NULL;
+	struct hash_cell *hc = NULL;
 
-	if (*param->uuid)
-		return __get_uuid_cell(param->uuid);
+	if (*param->uuid) {
+		hc = __get_uuid_cell(param->uuid);
+		if (!hc)
+			return NULL;
+		goto fill_params;
+	}
 
-	if (*param->name)
-		return __get_name_cell(param->name);
+	if (*param->name) {
+		hc = __get_name_cell(param->name);
+		if (!hc)
+			return NULL;
+		goto fill_params;
+	}
 
 	md = dm_get_md(huge_decode_dev(param->dev));
 	if (!md)
-		goto out;
+		return NULL;
 
-	mdptr = dm_get_mdptr(md);
-	if (!mdptr)
+	hc = dm_get_mdptr(md);
+	if (!hc) {
 		dm_put(md);
+		return NULL;
+	}
 
-out:
-	return mdptr;
+fill_params:
+	/*
+	 * Sneakily write in both the name and the uuid
+	 * while we have the cell.
+	 */
+	strlcpy(param->name, hc->name, sizeof(param->name));
+	if (hc->uuid)
+		strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
+	else
+		param->uuid[0] = '\0';
+
+	if (hc->new_map)
+		param->flags |= DM_INACTIVE_PRESENT_FLAG;
+	else
+		param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
+
+	return hc;
 }
 
 static struct mapped_device *find_device(struct dm_ioctl *param)
@@ -746,24 +771,8 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
 
 	down_read(&_hash_lock);
 	hc = __find_device_hash_cell(param);
-	if (hc) {
+	if (hc)
 		md = hc->md;
-
-		/*
-		 * Sneakily write in both the name and the uuid
-		 * while we have the cell.
-		 */
-		strlcpy(param->name, hc->name, sizeof(param->name));
-		if (hc->uuid)
-			strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
-		else
-			param->uuid[0] = '\0';
-
-		if (hc->new_map)
-			param->flags |= DM_INACTIVE_PRESENT_FLAG;
-		else
-			param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
-	}
 	up_read(&_hash_lock);
 
 	return md;
diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h
index 3708455ee6c3..0cb8eff76bd6 100644
--- a/include/linux/dm-ioctl.h
+++ b/include/linux/dm-ioctl.h
@@ -267,9 +267,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	20
+#define DM_VERSION_MINOR	21
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2011-02-02)"
+#define DM_VERSION_EXTRA	"-ioctl (2011-07-06)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
-- 
cgit v1.2.3


From ba2e19b0f4ccd6920fe175a86521ff18ede260cb Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 2 Aug 2011 12:32:06 +0100
Subject: dm ioctl: introduce __get_dev_cell

Move logic to find device based on major/minor number to a separate
function __get_dev_cell (similar to __get_uuid_cell and __get_name_cell).
This makes the function __find_device_hash_cell more straightforward.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 99d38a6925a4..4c350914f4a0 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -128,6 +128,24 @@ static struct hash_cell *__get_uuid_cell(const char *str)
 	return NULL;
 }
 
+static struct hash_cell *__get_dev_cell(uint64_t dev)
+{
+	struct mapped_device *md;
+	struct hash_cell *hc;
+
+	md = dm_get_md(huge_decode_dev(dev));
+	if (!md)
+		return NULL;
+
+	hc = dm_get_mdptr(md);
+	if (!hc) {
+		dm_put(md);
+		return NULL;
+	}
+
+	return hc;
+}
+
 /*-----------------------------------------------------------------
  * Inserting, removing and renaming a device.
  *---------------------------------------------------------------*/
@@ -718,34 +736,23 @@ static int dev_create(struct dm_ioctl *param, size_t param_size)
  */
 static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
 {
-	struct mapped_device *md;
 	struct hash_cell *hc = NULL;
 
 	if (*param->uuid) {
 		hc = __get_uuid_cell(param->uuid);
 		if (!hc)
 			return NULL;
-		goto fill_params;
-	}
-
-	if (*param->name) {
+	} else if (*param->name) {
 		hc = __get_name_cell(param->name);
 		if (!hc)
 			return NULL;
-		goto fill_params;
-	}
-
-	md = dm_get_md(huge_decode_dev(param->dev));
-	if (!md)
-		return NULL;
-
-	hc = dm_get_mdptr(md);
-	if (!hc) {
-		dm_put(md);
+	} else if (param->dev) {
+		hc = __get_dev_cell(param->dev);
+		if (!hc)
+			return NULL;
+	} else
 		return NULL;
-	}
 
-fill_params:
 	/*
 	 * Sneakily write in both the name and the uuid
 	 * while we have the cell.
-- 
cgit v1.2.3


From 759dea204cce9f1fc2a5d00ea25211299fc7a4a0 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 2 Aug 2011 12:32:06 +0100
Subject: dm ioctl: forbid multiple device specifiers

Exactly one of name, uuid or device must be specified when referencing
an existing device.  This removes the ambiguity (risking the wrong
device being updated) if two conflicting parameters were specified.
Previously one parameter got used and any others were ignored silently.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'drivers')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 4c350914f4a0..2e9a3ca37bdd 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -739,10 +739,16 @@ static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
 	struct hash_cell *hc = NULL;
 
 	if (*param->uuid) {
+		if (*param->name || param->dev)
+			return NULL;
+
 		hc = __get_uuid_cell(param->uuid);
 		if (!hc)
 			return NULL;
 	} else if (*param->name) {
+		if (param->dev)
+			return NULL;
+
 		hc = __get_name_cell(param->name);
 		if (!hc)
 			return NULL;
-- 
cgit v1.2.3


From c1084561bb85da3630540ebe951749a8cd8fc714 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Tue, 2 Aug 2011 12:32:07 +0100
Subject: dm raid: add region_size parameter

Allow the user to specify the region_size.

Ensures that the supplied value meets md's constraints, viz. the number of
regions does not exceed 2^21.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 Documentation/device-mapper/dm-raid.txt |  4 ++
 drivers/md/dm-raid.c                    | 82 +++++++++++++++++++++++++++++++--
 2 files changed, 83 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index 4f9dd3cecc11..be4419a30781 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -52,6 +52,10 @@ The target is named "raid" and it accepts the following parameters:
 	[max_recovery_rate <kB/sec/disk>]  Throttle RAID initialization
 	[max_write_behind <sectors>]       See '-write-behind=' (man mdadm)
 	[stripe_cache <sectors>]           Stripe cache size (higher RAIDs only)
+	[region_size <sectors>]
+		The region_size multiplied by the number of regions is the
+		logical size of the array.  The bitmap records the device
+		synchronisation state for each region.
 
 <#raid_devs>: The number of devices composing the array.
 	Each device consists of two entries.  The first is the device
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index d4e95b2e39f6..a8a1915a450d 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -52,7 +52,7 @@ struct raid_dev {
 #define DMPF_MAX_RECOVERY_RATE 0x20
 #define DMPF_MAX_WRITE_BEHIND  0x40
 #define DMPF_STRIPE_CACHE      0x80
-
+#define DMPF_REGION_SIZE       0X100
 struct raid_set {
 	struct dm_target *ti;
 
@@ -236,6 +236,67 @@ static int dev_parms(struct raid_set *rs, char **argv)
 	return 0;
 }
 
+/*
+ * validate_region_size
+ * @rs
+ * @region_size:  region size in sectors.  If 0, pick a size (4MiB default).
+ *
+ * Set rs->md.bitmap_info.chunksize (which really refers to 'region size').
+ * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap.
+ *
+ * Returns: 0 on success, -EINVAL on failure.
+ */
+static int validate_region_size(struct raid_set *rs, unsigned long region_size)
+{
+	unsigned long min_region_size = rs->ti->len / (1 << 21);
+
+	if (!region_size) {
+		/*
+		 * Choose a reasonable default.  All figures in sectors.
+		 */
+		if (min_region_size > (1 << 13)) {
+			DMINFO("Choosing default region size of %lu sectors",
+			       region_size);
+			region_size = min_region_size;
+		} else {
+			DMINFO("Choosing default region size of 4MiB");
+			region_size = 1 << 13; /* sectors */
+		}
+	} else {
+		/*
+		 * Validate user-supplied value.
+		 */
+		if (region_size > rs->ti->len) {
+			rs->ti->error = "Supplied region size is too large";
+			return -EINVAL;
+		}
+
+		if (region_size < min_region_size) {
+			DMERR("Supplied region_size (%lu sectors) below minimum (%lu)",
+			      region_size, min_region_size);
+			rs->ti->error = "Supplied region size is too small";
+			return -EINVAL;
+		}
+
+		if (!is_power_of_2(region_size)) {
+			rs->ti->error = "Region size is not a power of 2";
+			return -EINVAL;
+		}
+
+		if (region_size < rs->md.chunk_sectors) {
+			rs->ti->error = "Region size is smaller than the chunk size";
+			return -EINVAL;
+		}
+	}
+
+	/*
+	 * Convert sectors to bytes.
+	 */
+	rs->md.bitmap_info.chunksize = (region_size << 9);
+
+	return 0;
+}
+
 /*
  * Possible arguments are...
  * RAID456:
@@ -249,12 +310,13 @@ static int dev_parms(struct raid_set *rs, char **argv)
  *    [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
  *    [max_write_behind <sectors>]	See '-write-behind=' (man mdadm)
  *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
+ *    [region_size <sectors>]           Defines granularity of bitmap
  */
 static int parse_raid_params(struct raid_set *rs, char **argv,
 			     unsigned num_raid_params)
 {
 	unsigned i, rebuild_cnt = 0;
-	unsigned long value;
+	unsigned long value, region_size = 0;
 	char *key;
 
 	/*
@@ -365,6 +427,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				return -EINVAL;
 			}
 			rs->md.sync_speed_max = (int)value;
+		} else if (!strcasecmp(key, "region_size")) {
+			rs->print_flags |= DMPF_REGION_SIZE;
+			region_size = value;
 		} else {
 			DMERR("Unable to parse RAID parameter: %s", key);
 			rs->ti->error = "Unable to parse RAID parameters";
@@ -372,6 +437,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 		}
 	}
 
+	if (validate_region_size(rs, region_size))
+		return -EINVAL;
+
+	if (rs->md.chunk_sectors)
+		rs->ti->split_io = rs->md.chunk_sectors;
+	else
+		rs->ti->split_io = region_size;
+
 	/* Assume there are no metadata devices until the drives are parsed */
 	rs->md.persistent = 0;
 	rs->md.external = 1;
@@ -469,7 +542,6 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		goto bad;
 
 	INIT_WORK(&rs->md.event_work, do_table_event);
-	ti->split_io = rs->md.chunk_sectors;
 	ti->private = rs;
 
 	mutex_lock(&rs->md.reconfig_mutex);
@@ -596,6 +668,10 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 			       conf ? conf->max_nr_stripes * 2 : 0);
 		}
 
+		if (rs->print_flags & DMPF_REGION_SIZE)
+			DMEMIT(" region_size %lu",
+			       rs->md.bitmap_info.chunksize >> 9);
+
 		DMEMIT(" %d", rs->md.raid_disks);
 		for (i = 0; i < rs->md.raid_disks; i++) {
 			DMEMIT(" -"); /* metadata device */
-- 
cgit v1.2.3


From 46bed2b5c16bb7c82e1088d7ae75fb958c8a8c4e Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Tue, 2 Aug 2011 12:32:07 +0100
Subject: dm raid: add write_mostly parameter

Add the write_mostly parameter to RAID1 dm-raid tables.

This allows the user to set the WriteMostly flag on a RAID1 device that
should normally be avoided for read I/O.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 Documentation/device-mapper/dm-raid.txt |  4 +++-
 drivers/md/dm-raid.c                    | 26 +++++++++++++++++++++++++-
 2 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index be4419a30781..a7d1c4abc927 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -50,6 +50,7 @@ The target is named "raid" and it accepts the following parameters:
 
 	[min_recovery_rate <kB/sec/disk>]  Throttle RAID initialization
 	[max_recovery_rate <kB/sec/disk>]  Throttle RAID initialization
+	[write_mostly <idx>]		   Drive index is write-mostly
 	[max_write_behind <sectors>]       See '-write-behind=' (man mdadm)
 	[stripe_cache <sectors>]           Stripe cache size (higher RAIDs only)
 	[region_size <sectors>]
@@ -87,9 +88,10 @@ Example tables
         5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
 
 'dmsetup table' displays the table used to construct the mapping.
-The optional parameters will always be printed in the order listed
+The optional parameters are always printed in the order listed
 above with "sync" or "nosync" always output ahead of the other
 arguments, regardless of the order used when originally loading the table.
+Arguments that can be repeated are ordered by value.
 
 'dmsetup status' yields information on the state and health of the
 array.
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index a8a1915a450d..88143a0303d2 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -308,6 +308,7 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
  *    [daemon_sleep <ms>]		Time between bitmap daemon work to clear bits
  *    [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
  *    [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
+ *    [write_mostly <idx>]		Indicate a write mostly drive via index
  *    [max_write_behind <sectors>]	See '-write-behind=' (man mdadm)
  *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
  *    [region_size <sectors>]           Defines granularity of bitmap
@@ -376,7 +377,21 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 			clear_bit(In_sync, &rs->dev[value].rdev.flags);
 			rs->dev[value].rdev.recovery_offset = 0;
 			rs->print_flags |= DMPF_REBUILD;
+		} else if (!strcasecmp(key, "write_mostly")) {
+			if (rs->raid_type->level != 1) {
+				rs->ti->error = "write_mostly option is only valid for RAID1";
+				return -EINVAL;
+			}
+			if (value > rs->md.raid_disks) {
+				rs->ti->error = "Invalid write_mostly drive index given";
+				return -EINVAL;
+			}
+			set_bit(WriteMostly, &rs->dev[value].rdev.flags);
 		} else if (!strcasecmp(key, "max_write_behind")) {
+			if (rs->raid_type->level != 1) {
+				rs->ti->error = "max_write_behind option is only valid for RAID1";
+				return -EINVAL;
+			}
 			rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
 
 			/*
@@ -621,11 +636,15 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 		break;
 	case STATUSTYPE_TABLE:
 		/* The string you would use to construct this array */
-		for (i = 0; i < rs->md.raid_disks; i++)
+		for (i = 0; i < rs->md.raid_disks; i++) {
 			if ((rs->print_flags & DMPF_REBUILD) &&
 			    rs->dev[i].data_dev &&
 			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
 				raid_param_cnt += 2; /* for rebuilds */
+			if (rs->dev[i].data_dev &&
+			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
+				raid_param_cnt += 2;
+		}
 
 		raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2);
 		if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
@@ -656,6 +675,11 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 		if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
 			DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
 
+		for (i = 0; i < rs->md.raid_disks; i++)
+			if (rs->dev[i].data_dev &&
+			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
+				DMEMIT(" write_mostly %u", i);
+
 		if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
 			DMEMIT(" max_write_behind %lu",
 			       rs->md.bitmap_info.max_write_behind);
-- 
cgit v1.2.3


From b12d437b73d32203a41fde0d407e91812c866844 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Tue, 2 Aug 2011 12:32:07 +0100
Subject: dm raid: support metadata devices

Add the ability to parse and use metadata devices to dm-raid.  Although
not strictly required, without the metadata devices, many features of
RAID are unavailable.  They are used to store a superblock and bitmap.

The role, or position in the array, of each device must be recorded in
its superblock.  This is to help with fault handling, array reshaping,
and sanity checks.  RAID 4/5/6 devices must be loaded in a specific order:
in this way, the 'array_position' field helps validate the correctness
of the mapping when it is loaded.  It can be used during reshaping to
identify which devices are added/removed.  Fault handling is impossible
without this field.  For example, when a device fails it is recorded in
the superblock.  If this is a RAID1 device and the offending device is
removed from the array, there must be a way during subsequent array
assembly to determine that the failed device was the one removed.  This
is done by correlating the 'array_position' field and the bit-field
variable 'failed_devices'.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 Documentation/device-mapper/dm-raid.txt |  12 +-
 drivers/md/Kconfig                      |   5 +-
 drivers/md/dm-raid.c                    | 419 ++++++++++++++++++++++++++++++--
 3 files changed, 412 insertions(+), 24 deletions(-)

(limited to 'drivers')

diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index a7d1c4abc927..2a8c11331d2d 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -11,6 +11,7 @@ The target is named "raid" and it accepts the following parameters:
     <#raid_devs> <metadata_dev0> <dev0> [.. <metadata_devN> <devN>]
 
 <raid_type>:
+  raid1		RAID1 mirroring
   raid4		RAID4 dedicated parity disk
   raid5_la	RAID5 left asymmetric
 		- rotating parity 0 with data continuation
@@ -61,8 +62,7 @@ The target is named "raid" and it accepts the following parameters:
 <#raid_devs>: The number of devices composing the array.
 	Each device consists of two entries.  The first is the device
 	containing the metadata (if any); the second is the one containing the
-	data.  Currently, separate metadata devices are not supported and '-'
-	is required in place of the metadata device.
+	data.
 
 	If a drive has failed or is missing at creation time, a '-' can be
 	given for both the metadata and data drives for a given position.
@@ -70,7 +70,7 @@ The target is named "raid" and it accepts the following parameters:
 
 Example tables
 --------------
-# RAID4 - 4 data drives, 1 parity
+# RAID4 - 4 data drives, 1 parity (no metadata devices)
 # No metadata devices specified to hold superblock/bitmap info
 # Chunk size of 1MiB
 # (Lines separated for easy reading)
@@ -79,13 +79,13 @@ Example tables
         raid4 1 2048 \
         5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
 
-# RAID4 - 4 data drives, 1 parity (no metadata devices)
+# RAID4 - 4 data drives, 1 parity (with metadata devices)
 # Chunk size of 1MiB, force RAID initialization,
 #       min recovery rate at 20 kiB/sec/disk
 
 0 1960893648 raid \
-        raid4 4 2048 min_recovery_rate 20 sync\
-        5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
+        raid4 4 2048 sync min_recovery_rate 20 \
+        5 8:17 8:18 8:33 8:34 8:49 8:50 8:65 8:66 8:81 8:82
 
 'dmsetup table' displays the table used to construct the mapping.
 The optional parameters are always printed in the order listed
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 8420129fc5ee..f75a66e7d312 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -241,12 +241,13 @@ config DM_MIRROR
          needed for live data migration tools such as 'pvmove'.
 
 config DM_RAID
-       tristate "RAID 4/5/6 target (EXPERIMENTAL)"
+       tristate "RAID 1/4/5/6 target (EXPERIMENTAL)"
        depends on BLK_DEV_DM && EXPERIMENTAL
+       select MD_RAID1
        select MD_RAID456
        select BLK_DEV_MD
        ---help---
-	 A dm target that supports RAID4, RAID5 and RAID6 mappings
+	 A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings
 
 	 A RAID-5 set of N drives with a capacity of C MB per drive provides
 	 the capacity of C * (N - 1) MB, and protects against a failure
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 88143a0303d2..69873806c50c 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -16,12 +16,10 @@
 #define DM_MSG_PREFIX "raid"
 
 /*
- * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then
- * make it so the flag doesn't set anything.
+ * The following flags are used by dm-raid.c to set up the array state.
+ * They must be cleared before md_run is called.
  */
-#ifndef MD_SYNC_STATE_FORCED
-#define MD_SYNC_STATE_FORCED 0
-#endif
+#define FirstUse 10             /* rdev flag */
 
 struct raid_dev {
 	/*
@@ -149,9 +147,16 @@ static void context_free(struct raid_set *rs)
 {
 	int i;
 
-	for (i = 0; i < rs->md.raid_disks; i++)
+	for (i = 0; i < rs->md.raid_disks; i++) {
+		if (rs->dev[i].meta_dev)
+			dm_put_device(rs->ti, rs->dev[i].meta_dev);
+		if (rs->dev[i].rdev.sb_page)
+			put_page(rs->dev[i].rdev.sb_page);
+		rs->dev[i].rdev.sb_page = NULL;
+		rs->dev[i].rdev.sb_loaded = 0;
 		if (rs->dev[i].data_dev)
 			dm_put_device(rs->ti, rs->dev[i].data_dev);
+	}
 
 	kfree(rs);
 }
@@ -161,7 +166,16 @@ static void context_free(struct raid_set *rs)
  *  <meta_dev>: meta device name or '-' if missing
  *  <data_dev>: data device name or '-' if missing
  *
- * This code parses those words.
+ * The following are permitted:
+ *    - -
+ *    - <data_dev>
+ *    <meta_dev> <data_dev>
+ *
+ * The following is not allowed:
+ *    <meta_dev> -
+ *
+ * This code parses those words.  If there is a failure,
+ * the caller must use context_free to unwind the operations.
  */
 static int dev_parms(struct raid_set *rs, char **argv)
 {
@@ -184,8 +198,16 @@ static int dev_parms(struct raid_set *rs, char **argv)
 		rs->dev[i].rdev.mddev = &rs->md;
 
 		if (strcmp(argv[0], "-")) {
-			rs->ti->error = "Metadata devices not supported";
-			return -EINVAL;
+			ret = dm_get_device(rs->ti, argv[0],
+					    dm_table_get_mode(rs->ti->table),
+					    &rs->dev[i].meta_dev);
+			rs->ti->error = "RAID metadata device lookup failure";
+			if (ret)
+				return ret;
+
+			rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
+			if (!rs->dev[i].rdev.sb_page)
+				return -ENOMEM;
 		}
 
 		if (!strcmp(argv[1], "-")) {
@@ -195,6 +217,10 @@ static int dev_parms(struct raid_set *rs, char **argv)
 				return -EINVAL;
 			}
 
+			rs->ti->error = "No data device supplied with metadata device";
+			if (rs->dev[i].meta_dev)
+				return -EINVAL;
+
 			continue;
 		}
 
@@ -206,6 +232,10 @@ static int dev_parms(struct raid_set *rs, char **argv)
 			return ret;
 		}
 
+		if (rs->dev[i].meta_dev) {
+			metadata_available = 1;
+			rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
+		}
 		rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
 		list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
 		if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
@@ -334,22 +364,39 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 	num_raid_params--;
 
 	/*
-	 * Second, parse the unordered optional arguments
+	 * We set each individual device as In_sync with a completed
+	 * 'recovery_offset'.  If there has been a device failure or
+	 * replacement then one of the following cases applies:
+	 *
+	 *   1) User specifies 'rebuild'.
+	 *      - Device is reset when param is read.
+	 *   2) A new device is supplied.
+	 *      - No matching superblock found, resets device.
+	 *   3) Device failure was transient and returns on reload.
+	 *      - Failure noticed, resets device for bitmap replay.
+	 *   4) Device hadn't completed recovery after previous failure.
+	 *      - Superblock is read and overrides recovery_offset.
+	 *
+	 * What is found in the superblocks of the devices is always
+	 * authoritative, unless 'rebuild' or '[no]sync' was specified.
 	 */
-	for (i = 0; i < rs->md.raid_disks; i++)
+	for (i = 0; i < rs->md.raid_disks; i++) {
 		set_bit(In_sync, &rs->dev[i].rdev.flags);
+		rs->dev[i].rdev.recovery_offset = MaxSector;
+	}
 
+	/*
+	 * Second, parse the unordered optional arguments
+	 */
 	for (i = 0; i < num_raid_params; i++) {
 		if (!strcasecmp(argv[i], "nosync")) {
 			rs->md.recovery_cp = MaxSector;
 			rs->print_flags |= DMPF_NOSYNC;
-			rs->md.flags |= MD_SYNC_STATE_FORCED;
 			continue;
 		}
 		if (!strcasecmp(argv[i], "sync")) {
 			rs->md.recovery_cp = 0;
 			rs->print_flags |= DMPF_SYNC;
-			rs->md.flags |= MD_SYNC_STATE_FORCED;
 			continue;
 		}
 
@@ -481,14 +528,345 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
 	return md_raid5_congested(&rs->md, bits);
 }
 
+/*
+ * This structure is never routinely used by userspace, unlike md superblocks.
+ * Devices with this superblock should only ever be accessed via device-mapper.
+ */
+#define DM_RAID_MAGIC 0x64526D44
+struct dm_raid_superblock {
+	__le32 magic;		/* "DmRd" */
+	__le32 features;	/* Used to indicate possible future changes */
+
+	__le32 num_devices;	/* Number of devices in this array. (Max 64) */
+	__le32 array_position;	/* The position of this drive in the array */
+
+	__le64 events;		/* Incremented by md when superblock updated */
+	__le64 failed_devices;	/* Bit field of devices to indicate failures */
+
+	/*
+	 * This offset tracks the progress of the repair or replacement of
+	 * an individual drive.
+	 */
+	__le64 disk_recovery_offset;
+
+	/*
+	 * This offset tracks the progress of the initial array
+	 * synchronisation/parity calculation.
+	 */
+	__le64 array_resync_offset;
+
+	/*
+	 * RAID characteristics
+	 */
+	__le32 level;
+	__le32 layout;
+	__le32 stripe_sectors;
+
+	__u8 pad[452];		/* Round struct to 512 bytes. */
+				/* Always set to 0 when writing. */
+} __packed;
+
+static int read_disk_sb(mdk_rdev_t *rdev, int size)
+{
+	BUG_ON(!rdev->sb_page);
+
+	if (rdev->sb_loaded)
+		return 0;
+
+	if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
+		DMERR("Failed to read device superblock");
+		return -EINVAL;
+	}
+
+	rdev->sb_loaded = 1;
+
+	return 0;
+}
+
+static void super_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	mdk_rdev_t *r, *t;
+	uint64_t failed_devices;
+	struct dm_raid_superblock *sb;
+
+	sb = page_address(rdev->sb_page);
+	failed_devices = le64_to_cpu(sb->failed_devices);
+
+	rdev_for_each(r, t, mddev)
+		if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
+			failed_devices |= (1ULL << r->raid_disk);
+
+	memset(sb, 0, sizeof(*sb));
+
+	sb->magic = cpu_to_le32(DM_RAID_MAGIC);
+	sb->features = cpu_to_le32(0);	/* No features yet */
+
+	sb->num_devices = cpu_to_le32(mddev->raid_disks);
+	sb->array_position = cpu_to_le32(rdev->raid_disk);
+
+	sb->events = cpu_to_le64(mddev->events);
+	sb->failed_devices = cpu_to_le64(failed_devices);
+
+	sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
+	sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
+
+	sb->level = cpu_to_le32(mddev->level);
+	sb->layout = cpu_to_le32(mddev->layout);
+	sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
+}
+
+/*
+ * super_load
+ *
+ * This function creates a superblock if one is not found on the device
+ * and will decide which superblock to use if there's a choice.
+ *
+ * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
+ */
+static int super_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev)
+{
+	int ret;
+	struct dm_raid_superblock *sb;
+	struct dm_raid_superblock *refsb;
+	uint64_t events_sb, events_refsb;
+
+	rdev->sb_start = 0;
+	rdev->sb_size = sizeof(*sb);
+
+	ret = read_disk_sb(rdev, rdev->sb_size);
+	if (ret)
+		return ret;
+
+	sb = page_address(rdev->sb_page);
+	if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) {
+		super_sync(rdev->mddev, rdev);
+
+		set_bit(FirstUse, &rdev->flags);
+
+		/* Force writing of superblocks to disk */
+		set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
+
+		/* Any superblock is better than none, choose that if given */
+		return refdev ? 0 : 1;
+	}
+
+	if (!refdev)
+		return 1;
+
+	events_sb = le64_to_cpu(sb->events);
+
+	refsb = page_address(refdev->sb_page);
+	events_refsb = le64_to_cpu(refsb->events);
+
+	return (events_sb > events_refsb) ? 1 : 0;
+}
+
+static int super_init_validation(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	int role;
+	struct raid_set *rs = container_of(mddev, struct raid_set, md);
+	uint64_t events_sb;
+	uint64_t failed_devices;
+	struct dm_raid_superblock *sb;
+	uint32_t new_devs = 0;
+	uint32_t rebuilds = 0;
+	mdk_rdev_t *r, *t;
+	struct dm_raid_superblock *sb2;
+
+	sb = page_address(rdev->sb_page);
+	events_sb = le64_to_cpu(sb->events);
+	failed_devices = le64_to_cpu(sb->failed_devices);
+
+	/*
+	 * Initialise to 1 if this is a new superblock.
+	 */
+	mddev->events = events_sb ? : 1;
+
+	/*
+	 * Reshaping is not currently allowed
+	 */
+	if ((le32_to_cpu(sb->level) != mddev->level) ||
+	    (le32_to_cpu(sb->layout) != mddev->layout) ||
+	    (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
+		DMERR("Reshaping arrays not yet supported.");
+		return -EINVAL;
+	}
+
+	/* We can only change the number of devices in RAID1 right now */
+	if ((rs->raid_type->level != 1) &&
+	    (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
+		DMERR("Reshaping arrays not yet supported.");
+		return -EINVAL;
+	}
+
+	if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
+		mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
+
+	/*
+	 * During load, we set FirstUse if a new superblock was written.
+	 * There are two reasons we might not have a superblock:
+	 * 1) The array is brand new - in which case, all of the
+	 *    devices must have their In_sync bit set.  Also,
+	 *    recovery_cp must be 0, unless forced.
+	 * 2) This is a new device being added to an old array
+	 *    and the new device needs to be rebuilt - in which
+	 *    case the In_sync bit will /not/ be set and
+	 *    recovery_cp must be MaxSector.
+	 */
+	rdev_for_each(r, t, mddev) {
+		if (!test_bit(In_sync, &r->flags)) {
+			if (!test_bit(FirstUse, &r->flags))
+				DMERR("Superblock area of "
+				      "rebuild device %d should have been "
+				      "cleared.", r->raid_disk);
+			set_bit(FirstUse, &r->flags);
+			rebuilds++;
+		} else if (test_bit(FirstUse, &r->flags))
+			new_devs++;
+	}
+
+	if (!rebuilds) {
+		if (new_devs == mddev->raid_disks) {
+			DMINFO("Superblocks created for new array");
+			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
+		} else if (new_devs) {
+			DMERR("New device injected "
+			      "into existing array without 'rebuild' "
+			      "parameter specified");
+			return -EINVAL;
+		}
+	} else if (new_devs) {
+		DMERR("'rebuild' devices cannot be "
+		      "injected into an array with other first-time devices");
+		return -EINVAL;
+	} else if (mddev->recovery_cp != MaxSector) {
+		DMERR("'rebuild' specified while array is not in-sync");
+		return -EINVAL;
+	}
+
+	/*
+	 * Now we set the Faulty bit for those devices that are
+	 * recorded in the superblock as failed.
+	 */
+	rdev_for_each(r, t, mddev) {
+		if (!r->sb_page)
+			continue;
+		sb2 = page_address(r->sb_page);
+		sb2->failed_devices = 0;
+
+		/*
+		 * Check for any device re-ordering.
+		 */
+		if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
+			role = le32_to_cpu(sb2->array_position);
+			if (role != r->raid_disk) {
+				if (rs->raid_type->level != 1) {
+					rs->ti->error = "Cannot change device "
+						"positions in RAID array";
+					return -EINVAL;
+				}
+				DMINFO("RAID1 device #%d now at position #%d",
+				       role, r->raid_disk);
+			}
+
+			/*
+			 * Partial recovery is performed on
+			 * returning failed devices.
+			 */
+			if (failed_devices & (1 << role))
+				set_bit(Faulty, &r->flags);
+		}
+	}
+
+	return 0;
+}
+
+static int super_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	struct dm_raid_superblock *sb = page_address(rdev->sb_page);
+
+	/*
+	 * If mddev->events is not set, we know we have not yet initialized
+	 * the array.
+	 */
+	if (!mddev->events && super_init_validation(mddev, rdev))
+		return -EINVAL;
+
+	mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
+	rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
+	if (!test_bit(FirstUse, &rdev->flags)) {
+		rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
+		if (rdev->recovery_offset != MaxSector)
+			clear_bit(In_sync, &rdev->flags);
+	}
+
+	/*
+	 * If a device comes back, set it as not In_sync and no longer faulty.
+	 */
+	if (test_bit(Faulty, &rdev->flags)) {
+		clear_bit(Faulty, &rdev->flags);
+		clear_bit(In_sync, &rdev->flags);
+		rdev->saved_raid_disk = rdev->raid_disk;
+		rdev->recovery_offset = 0;
+	}
+
+	clear_bit(FirstUse, &rdev->flags);
+
+	return 0;
+}
+
+/*
+ * Analyse superblocks and select the freshest.
+ */
+static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
+{
+	int ret;
+	mdk_rdev_t *rdev, *freshest, *tmp;
+	mddev_t *mddev = &rs->md;
+
+	freshest = NULL;
+	rdev_for_each(rdev, tmp, mddev) {
+		if (!rdev->meta_bdev)
+			continue;
+
+		ret = super_load(rdev, freshest);
+
+		switch (ret) {
+		case 1:
+			freshest = rdev;
+			break;
+		case 0:
+			break;
+		default:
+			ti->error = "Failed to load superblock";
+			return ret;
+		}
+	}
+
+	if (!freshest)
+		return 0;
+
+	/*
+	 * Validation of the freshest device provides the source of
+	 * validation for the remaining devices.
+	 */
+	ti->error = "Unable to assemble array: Invalid superblocks";
+	if (super_validate(mddev, freshest))
+		return -EINVAL;
+
+	rdev_for_each(rdev, tmp, mddev)
+		if ((rdev != freshest) && super_validate(mddev, rdev))
+			return -EINVAL;
+
+	return 0;
+}
+
 /*
  * Construct a RAID4/5/6 mapping:
  * Args:
  *	<raid_type> <#raid_params> <raid_params>		\
  *	<#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
  *
- * ** metadata devices are not supported yet, use '-' instead **
- *
  * <raid_params> varies by <raid_type>.  See 'parse_raid_params' for
  * details on possible <raid_params>.
  */
@@ -556,6 +934,11 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (ret)
 		goto bad;
 
+	rs->md.sync_super = super_sync;
+	ret = analyse_superblocks(ti, rs);
+	if (ret)
+		goto bad;
+
 	INIT_WORK(&rs->md.event_work, do_table_event);
 	ti->private = rs;
 
@@ -698,7 +1081,10 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 
 		DMEMIT(" %d", rs->md.raid_disks);
 		for (i = 0; i < rs->md.raid_disks; i++) {
-			DMEMIT(" -"); /* metadata device */
+			if (rs->dev[i].meta_dev)
+				DMEMIT(" %s", rs->dev[i].meta_dev->name);
+			else
+				DMEMIT(" -");
 
 			if (rs->dev[i].data_dev)
 				DMEMIT(" %s", rs->dev[i].data_dev->name);
@@ -755,6 +1141,7 @@ static void raid_resume(struct dm_target *ti)
 {
 	struct raid_set *rs = ti->private;
 
+	bitmap_load(&rs->md);
 	mddev_resume(&rs->md);
 }
 
-- 
cgit v1.2.3


From 327372797c88b24953f454cd51a3734c02697bdd Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Tue, 2 Aug 2011 12:32:07 +0100
Subject: dm raid: add md raid1 support

Support the MD RAID1 personality through dm-raid.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid.c | 49 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 39 insertions(+), 10 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 69873806c50c..a002dd85db1e 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -8,6 +8,7 @@
 #include <linux/slab.h>
 
 #include "md.h"
+#include "raid1.h"
 #include "raid5.h"
 #include "bitmap.h"
 
@@ -72,6 +73,7 @@ static struct raid_type {
 	const unsigned level;		/* RAID level. */
 	const unsigned algorithm;	/* RAID algorithm. */
 } raid_types[] = {
+	{"raid1",    "RAID1 (mirroring)",               0, 2, 1, 0 /* NONE */},
 	{"raid4",    "RAID4 (dedicated parity disk)",	1, 2, 5, ALGORITHM_PARITY_0},
 	{"raid5_la", "RAID5 (left asymmetric)",		1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
 	{"raid5_ra", "RAID5 (right asymmetric)",	1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
@@ -105,7 +107,8 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
 	}
 
 	sectors_per_dev = ti->len;
-	if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
+	if ((raid_type->level > 1) &&
+	    sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
 		ti->error = "Target length not divisible by number of data devices";
 		return ERR_PTR(-EINVAL);
 	}
@@ -329,13 +332,16 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
 
 /*
  * Possible arguments are...
- * RAID456:
  *	<chunk_size> [optional_args]
  *
- * Optional args:
- *    [[no]sync]			Force or prevent recovery of the entire array
+ * Argument definitions
+ *    <chunk_size>			The number of sectors per disk that
+ *                                      will form the "stripe"
+ *    [[no]sync]			Force or prevent recovery of the
+ *                                      entire array
  *    [rebuild <idx>]			Rebuild the drive indicated by the index
- *    [daemon_sleep <ms>]		Time between bitmap daemon work to clear bits
+ *    [daemon_sleep <ms>]		Time between bitmap daemon work to
+ *                                      clear bits
  *    [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
  *    [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
  *    [write_mostly <idx>]		Indicate a write mostly drive via index
@@ -352,11 +358,21 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 
 	/*
 	 * First, parse the in-order required arguments
+	 * "chunk_size" is the only argument of this type.
 	 */
-	if ((strict_strtoul(argv[0], 10, &value) < 0) ||
-	    !is_power_of_2(value) || (value < 8)) {
+	if ((strict_strtoul(argv[0], 10, &value) < 0)) {
 		rs->ti->error = "Bad chunk size";
 		return -EINVAL;
+	} else if (rs->raid_type->level == 1) {
+		if (value)
+			DMERR("Ignoring chunk size parameter for RAID 1");
+		value = 0;
+	} else if (!is_power_of_2(value)) {
+		rs->ti->error = "Chunk size must be a power of 2";
+		return -EINVAL;
+	} else if (value < 8) {
+		rs->ti->error = "Chunk size value is too small";
+		return -EINVAL;
 	}
 
 	rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
@@ -413,8 +429,12 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 		}
 
 		if (!strcasecmp(key, "rebuild")) {
-			if (++rebuild_cnt > rs->raid_type->parity_devs) {
-				rs->ti->error = "Too many rebuild drives given";
+			rebuild_cnt++;
+			if (((rs->raid_type->level != 1) &&
+			     (rebuild_cnt > rs->raid_type->parity_devs)) ||
+			    ((rs->raid_type->level == 1) &&
+			     (rebuild_cnt > (rs->md.raid_disks - 1)))) {
+				rs->ti->error = "Too many rebuild devices specified for given RAID type";
 				return -EINVAL;
 			}
 			if (value > rs->md.raid_disks) {
@@ -507,6 +527,11 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 	else
 		rs->ti->split_io = region_size;
 
+	if (rs->md.chunk_sectors)
+		rs->ti->split_io = rs->md.chunk_sectors;
+	else
+		rs->ti->split_io = region_size;
+
 	/* Assume there are no metadata devices until the drives are parsed */
 	rs->md.persistent = 0;
 	rs->md.external = 1;
@@ -525,6 +550,9 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
 {
 	struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
 
+	if (rs->raid_type->level == 1)
+		return md_raid1_congested(&rs->md, bits);
+
 	return md_raid5_congested(&rs->md, bits);
 }
 
@@ -955,6 +983,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	rs->callbacks.congested_fn = raid_is_congested;
 	dm_table_add_target_callbacks(ti->table, &rs->callbacks);
 
+	mddev_suspend(&rs->md);
 	return 0;
 
 bad:
@@ -1147,7 +1176,7 @@ static void raid_resume(struct dm_target *ti)
 
 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 0, 0},
+	.version = {1, 1, 0},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
-- 
cgit v1.2.3


From 772ae5f54d69c38a5e3c4352c5fdbdaff141af21 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Tue, 2 Aug 2011 12:32:08 +0100
Subject: dm crypt: optionally support discard requests

Add optional parameter field to dmcrypt table and support
"allow_discards" option.

Discard requests bypass crypt queue processing. Bio is simple remapped
to underlying device.

Note that discard will be never enabled by default because of security
consequences.  It is up to the administrator to enable it for encrypted
devices.

(Note that userspace cryptsetup does not understand new optional
parameters yet.  Support for this will come later.  Until then, you
should use 'dmsetup' to enable and disable this.)

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 Documentation/device-mapper/dm-crypt.txt | 21 +++++++++++++-
 drivers/md/dm-crypt.c                    | 49 +++++++++++++++++++++++++++++---
 2 files changed, 65 insertions(+), 5 deletions(-)

(limited to 'drivers')

diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt
index 6b5c42dbbe84..2c656ae43ba7 100644
--- a/Documentation/device-mapper/dm-crypt.txt
+++ b/Documentation/device-mapper/dm-crypt.txt
@@ -4,7 +4,8 @@ dm-crypt
 Device-Mapper's "crypt" target provides transparent encryption of block devices
 using the kernel crypto API.
 
-Parameters: <cipher> <key> <iv_offset> <device path> <offset>
+Parameters: <cipher> <key> <iv_offset> <device path> \
+	      <offset> [<#opt_params> <opt_params>]
 
 <cipher>
     Encryption cipher and an optional IV generation mode.
@@ -37,6 +38,24 @@ Parameters: <cipher> <key> <iv_offset> <device path> <offset>
 <offset>
     Starting sector within the device where the encrypted data begins.
 
+<#opt_params>
+    Number of optional parameters. If there are no optional parameters,
+    the optional paramaters section can be skipped or #opt_params can be zero.
+    Otherwise #opt_params is the number of following arguments.
+
+    Example of optional parameters section:
+        1 allow_discards
+
+allow_discards
+    Block discard requests (a.k.a. TRIM) are passed through the crypt device.
+    The default is to ignore discard requests.
+
+    WARNING: Assess the specific security risks carefully before enabling this
+    option.  For example, allowing discards on encrypted devices may lead to
+    the leak of information about the ciphertext device (filesystem type,
+    used space etc.) if the discarded blocks can be located easily on the
+    device later.
+
 Example scripts
 ===============
 LUKS (Linux Unified Key Setup) is now the preferred way to set up disk
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index b79e7472a9b3..49da55c1528a 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1574,11 +1574,17 @@ bad_mem:
 static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	struct crypt_config *cc;
-	unsigned int key_size;
+	unsigned int key_size, opt_params;
 	unsigned long long tmpll;
 	int ret;
+	struct dm_arg_set as;
+	const char *opt_string;
+
+	static struct dm_arg _args[] = {
+		{0, 1, "Invalid number of feature args"},
+	};
 
-	if (argc != 5) {
+	if (argc < 5) {
 		ti->error = "Not enough arguments";
 		return -EINVAL;
 	}
@@ -1647,6 +1653,30 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 	cc->start = tmpll;
 
+	argv += 5;
+	argc -= 5;
+
+	/* Optional parameters */
+	if (argc) {
+		as.argc = argc;
+		as.argv = argv;
+
+		ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+		if (ret)
+			goto bad;
+
+		opt_string = dm_shift_arg(&as);
+
+		if (opt_params == 1 && opt_string &&
+		    !strcasecmp(opt_string, "allow_discards"))
+			ti->num_discard_requests = 1;
+		else if (opt_params) {
+			ret = -EINVAL;
+			ti->error = "Invalid feature arguments";
+			goto bad;
+		}
+	}
+
 	ret = -ENOMEM;
 	cc->io_queue = alloc_workqueue("kcryptd_io",
 				       WQ_NON_REENTRANT|
@@ -1681,9 +1711,16 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 	struct dm_crypt_io *io;
 	struct crypt_config *cc;
 
-	if (bio->bi_rw & REQ_FLUSH) {
+	/*
+	 * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues.
+	 * - for REQ_FLUSH device-mapper core ensures that no IO is in-flight
+	 * - for REQ_DISCARD caller must use flush if IO ordering matters
+	 */
+	if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) {
 		cc = ti->private;
 		bio->bi_bdev = cc->dev->bdev;
+		if (bio_sectors(bio))
+			bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector);
 		return DM_MAPIO_REMAPPED;
 	}
 
@@ -1726,6 +1763,10 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
 
 		DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset,
 				cc->dev->name, (unsigned long long)cc->start);
+
+		if (ti->num_discard_requests)
+			DMEMIT(" 1 allow_discards");
+
 		break;
 	}
 	return 0;
@@ -1822,7 +1863,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
 
 static struct target_type crypt_target = {
 	.name   = "crypt",
-	.version = {1, 10, 0},
+	.version = {1, 11, 0},
 	.module = THIS_MODULE,
 	.ctr    = crypt_ctr,
 	.dtr    = crypt_dtr,
-- 
cgit v1.2.3


From ed8b752bccf2560e305e25125721d2f0ac759e88 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 2 Aug 2011 12:32:08 +0100
Subject: dm table: set flush capability based on underlying devices

DM has always advertised both REQ_FLUSH and REQ_FUA flush capabilities
regardless of whether or not a given DM device's underlying devices
also advertised a need for them.

Block's flush-merge changes from 2.6.39 have proven to be more costly
for DM devices.  Performance regressions have been reported even when
DM's underlying devices do not advertise that they have a write cache.

Fix the performance regressions by configuring a DM device's flushing
capabilities based on those of the underlying devices' capabilities.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 drivers/md/dm.c       |  1 -
 2 files changed, 43 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 259ce99302fc..986b8754bb08 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1248,9 +1248,45 @@ static void dm_table_set_integrity(struct dm_table *t)
 			       blk_get_integrity(template_disk));
 }
 
+static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
+				sector_t start, sector_t len, void *data)
+{
+	unsigned flush = (*(unsigned *)data);
+	struct request_queue *q = bdev_get_queue(dev->bdev);
+
+	return q && (q->flush_flags & flush);
+}
+
+static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
+{
+	struct dm_target *ti;
+	unsigned i = 0;
+
+	/*
+	 * Require at least one underlying device to support flushes.
+	 * t->devices includes internal dm devices such as mirror logs
+	 * so we need to use iterate_devices here, which targets
+	 * supporting flushes must provide.
+	 */
+	while (i < dm_table_get_num_targets(t)) {
+		ti = dm_table_get_target(t, i++);
+
+		if (!ti->num_flush_requests)
+			continue;
+
+		if (ti->type->iterate_devices &&
+		    ti->type->iterate_devices(ti, device_flush_capable, &flush))
+			return 1;
+	}
+
+	return 0;
+}
+
 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 			       struct queue_limits *limits)
 {
+	unsigned flush = 0;
+
 	/*
 	 * Copy table's limits to the DM device's request_queue
 	 */
@@ -1261,6 +1297,13 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	else
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
 
+	if (dm_table_supports_flush(t, REQ_FLUSH)) {
+		flush |= REQ_FLUSH;
+		if (dm_table_supports_flush(t, REQ_FUA))
+			flush |= REQ_FUA;
+	}
+	blk_queue_flush(q, flush);
+
 	dm_table_set_integrity(t);
 
 	/*
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 1000eaf984ef..52b39f335bb3 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1808,7 +1808,6 @@ static void dm_init_md_queue(struct mapped_device *md)
 	blk_queue_make_request(md->queue, dm_request);
 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
 	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
-	blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
 }
 
 /*
-- 
cgit v1.2.3