From b26f5e3d7127487e934758c1fbe05d683b082cb0 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 2 Aug 2011 12:32:05 +0100
Subject: dm flakey: add drop_writes

Add 'drop_writes' option to drop writes silently while the
device is 'down'.  Reads are not touched.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 Documentation/device-mapper/dm-flakey.txt | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

(limited to 'Documentation')
diff --git a/Documentation/device-mapper/dm-flakey.txt b/Documentation/device-mapper/dm-flakey.txt
index c8efdfd19a65..1b66c868ee7e 100644
--- a/Documentation/device-mapper/dm-flakey.txt
+++ b/Documentation/device-mapper/dm-flakey.txt
@@ -1,17 +1,34 @@
 dm-flakey
 =========
 
-This target is the same as the linear target except that it returns I/O
-errors periodically.  It's been found useful in simulating failing
-devices for testing purposes.
+This target is the same as the linear target except that it exhibits
+unreliable behaviour periodically.  It's been found useful in simulating
+failing devices for testing purposes.
 
 Starting from the time the table is loaded, the device is available for
-<up interval> seconds, then returns errors for <down interval> seconds,
-and then this cycle repeats.
+<up interval> seconds, then exhibits unreliable behaviour for <down
+interval> seconds, and then this cycle repeats.
 
-Parameters: <dev path> <offset> <up interval> <down interval>
+Also, consider using this in combination with the dm-delay target too,
+which can delay reads and writes and/or send them to different
+underlying devices.
+
+Table parameters
+----------------
+  <dev path> <offset> <up interval> <down interval> \
+    [<num_features> [<feature arguments>]]
+
+Mandatory parameters:
     <dev path>: Full pathname to the underlying block-device, or a
                 "major:minor" device-number.
     <offset>: Starting sector within the device.
     <up interval>: Number of seconds device is available.
     <down interval>: Number of seconds device returns errors.
+
+Optional feature parameters:
+  If no feature parameters are present, during the periods of
+  unreliability, all I/O returns errors.
+
+  drop_writes:
+	All write I/O is silently ignored.
+	Read I/O is handled correctly.
-- 
cgit v1.2.3


From a3998799fb4df0b0af8271a7d50c4269032397aa Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 2 Aug 2011 12:32:06 +0100
Subject: dm flakey: add corrupt_bio_byte feature

Add corrupt_bio_byte feature to simulate corruption by overwriting a byte at a
specified position with a specified value during intervals when the device is
"down".

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 Documentation/device-mapper/dm-flakey.txt |  19 ++++
 drivers/md/dm-flakey.c                    | 155 +++++++++++++++++++++++++++---
 2 files changed, 159 insertions(+), 15 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/device-mapper/dm-flakey.txt b/Documentation/device-mapper/dm-flakey.txt
index 1b66c868ee7e..6ff5c2327227 100644
--- a/Documentation/device-mapper/dm-flakey.txt
+++ b/Documentation/device-mapper/dm-flakey.txt
@@ -32,3 +32,22 @@ Optional feature parameters:
   drop_writes:
 	All write I/O is silently ignored.
 	Read I/O is handled correctly.
+
+  corrupt_bio_byte <Nth_byte> <direction> <value> <flags>:
+	During <down interval>, replace <Nth_byte> of the data of
+	each matching bio with <value>.
+
+    <Nth_byte>: The offset of the byte to replace.
+		Counting starts at 1, to replace the first byte.
+    <direction>: Either 'r' to corrupt reads or 'w' to corrupt writes.
+		 'w' is incompatible with drop_writes.
+    <value>: The value (from 0-255) to write.
+    <flags>: Perform the replacement only if bio->bi_rw has all the
+	     selected flags set.
+
+Examples:
+  corrupt_bio_byte 32 r 1 0
+	- replaces the 32nd byte of READ bios with the value 1
+
+  corrupt_bio_byte 224 w 0 32
+	- replaces the 224th byte of REQ_META (=32) bios with the value 0
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index e7c4c2a64f4b..89f73ca22cfa 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2003 Sistina Software (UK) Limited.
- * Copyright (C) 2004, 2010 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004, 2010-2011 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
@@ -15,6 +15,9 @@
 
 #define DM_MSG_PREFIX "flakey"
 
+#define all_corrupt_bio_flags_match(bio, fc)	\
+	(((bio)->bi_rw & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags)
+
 /*
  * Flakey: Used for testing only, simulates intermittent,
  * catastrophic device failure.
@@ -26,6 +29,10 @@ struct flakey_c {
 	unsigned up_interval;
 	unsigned down_interval;
 	unsigned long flags;
+	unsigned corrupt_bio_byte;
+	unsigned corrupt_bio_rw;
+	unsigned corrupt_bio_value;
+	unsigned corrupt_bio_flags;
 };
 
 enum feature_flag_bits {
@@ -40,7 +47,10 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
 	const char *arg_name;
 
 	static struct dm_arg _args[] = {
-		{0, 1, "Invalid number of feature args"},
+		{0, 6, "Invalid number of feature args"},
+		{1, UINT_MAX, "Invalid corrupt bio byte"},
+		{0, 255, "Invalid corrupt value to write into bio byte (0-255)"},
+		{0, UINT_MAX, "Invalid corrupt bio flags mask"},
 	};
 
 	/* No feature arguments supplied. */
@@ -49,9 +59,9 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
 
 	r = dm_read_arg_group(_args, as, &argc, &ti->error);
 	if (r)
-		return -EINVAL;
+		return r;
 
-	while (argc && !r) {
+	while (argc) {
 		arg_name = dm_shift_arg(as);
 		argc--;
 
@@ -67,11 +77,61 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
 			continue;
 		}
 
+		/*
+		 * corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>
+		 */
+		if (!strcasecmp(arg_name, "corrupt_bio_byte")) {
+			if (!argc)
+				ti->error = "Feature corrupt_bio_byte requires parameters";
+
+			r = dm_read_arg(_args + 1, as, &fc->corrupt_bio_byte, &ti->error);
+			if (r)
+				return r;
+			argc--;
+
+			/*
+			 * Direction r or w?
+			 */
+			arg_name = dm_shift_arg(as);
+			if (!strcasecmp(arg_name, "w"))
+				fc->corrupt_bio_rw = WRITE;
+			else if (!strcasecmp(arg_name, "r"))
+				fc->corrupt_bio_rw = READ;
+			else {
+				ti->error = "Invalid corrupt bio direction (r or w)";
+				return -EINVAL;
+			}
+			argc--;
+
+			/*
+			 * Value of byte (0-255) to write in place of correct one.
+			 */
+			r = dm_read_arg(_args + 2, as, &fc->corrupt_bio_value, &ti->error);
+			if (r)
+				return r;
+			argc--;
+
+			/*
+			 * Only corrupt bios with these flags set.
+			 */
+			r = dm_read_arg(_args + 3, as, &fc->corrupt_bio_flags, &ti->error);
+			if (r)
+				return r;
+			argc--;
+
+			continue;
+		}
+
 		ti->error = "Unrecognised flakey feature requested";
-		r = -EINVAL;
+		return -EINVAL;
 	}
 
-	return r;
+	if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) {
+		ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set";
+		return -EINVAL;
+	}
+
+	return 0;
 }
 
 /*
@@ -80,6 +140,11 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
  *
  *   Feature args:
  *     [drop_writes]
+ *     [corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>]
+ *
+ *   Nth_byte starts from 1 for the first byte.
+ *   Direction is r for READ or w for WRITE.
+ *   bio_flags is ignored if 0.
  */
 static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
@@ -178,31 +243,64 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
 		bio->bi_sector = flakey_map_sector(ti, bio->bi_sector);
 }
 
+static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
+{
+	unsigned bio_bytes = bio_cur_bytes(bio);
+	char *data = bio_data(bio);
+
+	/*
+	 * Overwrite the Nth byte of the data returned.
+	 */
+	if (data && bio_bytes >= fc->corrupt_bio_byte) {
+		data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value;
+
+		DMDEBUG("Corrupting data bio=%p by writing %u to byte %u "
+			"(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n",
+			bio, fc->corrupt_bio_value, fc->corrupt_bio_byte,
+			(bio_data_dir(bio) == WRITE) ? 'w' : 'r',
+			bio->bi_rw, (unsigned long long)bio->bi_sector, bio_bytes);
+	}
+}
+
 static int flakey_map(struct dm_target *ti, struct bio *bio,
 		      union map_info *map_context)
 {
 	struct flakey_c *fc = ti->private;
 	unsigned elapsed;
-	unsigned rw;
 
 	/* Are we alive ? */
 	elapsed = (jiffies - fc->start_time) / HZ;
 	if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) {
-		rw = bio_data_dir(bio);
+		/*
+		 * Flag this bio as submitted while down.
+		 */
+		map_context->ll = 1;
+
+		/*
+		 * Map reads as normal.
+		 */
+		if (bio_data_dir(bio) == READ)
+			goto map_bio;
 
 		/*
-		 * Drop writes.  Map reads as normal.
+		 * Drop writes?
 		 */
 		if (test_bit(DROP_WRITES, &fc->flags)) {
-			if (rw == WRITE) {
-				bio_endio(bio, 0);
-				return DM_MAPIO_SUBMITTED;
-			}
+			bio_endio(bio, 0);
+			return DM_MAPIO_SUBMITTED;
+		}
+
+		/*
+		 * Corrupt matching writes.
+		 */
+		if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == WRITE)) {
+			if (all_corrupt_bio_flags_match(bio, fc))
+				corrupt_bio_data(bio, fc);
 			goto map_bio;
 		}
 
 		/*
-		 * Default setting errors all I/O.
+		 * By default, error all I/O.
 		 */
 		return -EIO;
 	}
@@ -213,6 +311,24 @@ map_bio:
 	return DM_MAPIO_REMAPPED;
 }
 
+static int flakey_end_io(struct dm_target *ti, struct bio *bio,
+			 int error, union map_info *map_context)
+{
+	struct flakey_c *fc = ti->private;
+	unsigned bio_submitted_while_down = map_context->ll;
+
+	/*
+	 * Corrupt successful READs while in down state.
+	 * If flags were specified, only corrupt those that match.
+	 */
+	if (!error && bio_submitted_while_down &&
+	    (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) &&
+	    all_corrupt_bio_flags_match(bio, fc))
+		corrupt_bio_data(bio, fc);
+
+	return error;
+}
+
 static int flakey_status(struct dm_target *ti, status_type_t type,
 			 char *result, unsigned int maxlen)
 {
@@ -231,9 +347,17 @@ static int flakey_status(struct dm_target *ti, status_type_t type,
 		       fc->down_interval);
 
 		drop_writes = test_bit(DROP_WRITES, &fc->flags);
-		DMEMIT("%u ", drop_writes);
+		DMEMIT("%u ", drop_writes + (fc->corrupt_bio_byte > 0) * 5);
+
 		if (drop_writes)
 			DMEMIT("drop_writes ");
+
+		if (fc->corrupt_bio_byte)
+			DMEMIT("corrupt_bio_byte %u %c %u %u ",
+			       fc->corrupt_bio_byte,
+			       (fc->corrupt_bio_rw == WRITE) ? 'w' : 'r',
+			       fc->corrupt_bio_value, fc->corrupt_bio_flags);
+
 		break;
 	}
 	return 0;
@@ -275,6 +399,7 @@ static struct target_type flakey_target = {
 	.ctr    = flakey_ctr,
 	.dtr    = flakey_dtr,
 	.map    = flakey_map,
+	.end_io = flakey_end_io,
 	.status = flakey_status,
 	.ioctl	= flakey_ioctl,
 	.merge	= flakey_merge,
-- 
cgit v1.2.3


From c0a2fa1ef1057a1e9450d6f055f1cde2ad4f85a2 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Tue, 2 Aug 2011 12:32:06 +0100
Subject: dm raid: improve table parameters documentation

Add more information about some dm-raid table parameters and clarify how
parameters are printed when 'dmsetup table' is issued.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 Documentation/device-mapper/dm-raid.txt | 124 ++++++++++++++++++++------------
 1 file changed, 78 insertions(+), 46 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index 33b6b7071ac8..4f9dd3cecc11 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -1,49 +1,75 @@
-Device-mapper RAID (dm-raid) is a bridge from DM to MD.  It
-provides a way to use device-mapper interfaces to access the MD RAID
-drivers.
+dm-raid
+-------
 
-As with all device-mapper targets, the nominal public interfaces are the
-constructor (CTR) tables and the status outputs (both STATUSTYPE_INFO
-and STATUSTYPE_TABLE).  The CTR table looks like the following:
+The device-mapper RAID (dm-raid) target provides a bridge from DM to MD.
+It allows the MD RAID drivers to be accessed using a device-mapper
+interface.
 
-1: <s> <l> raid \
-2:      <raid_type> <#raid_params> <raid_params> \
-3:      <#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN>
-
-Line 1 contains the standard first three arguments to any device-mapper
-target - the start, length, and target type fields.  The target type in
-this case is "raid".
-
-Line 2 contains the arguments that define the particular raid
-type/personality/level, the required arguments for that raid type, and
-any optional arguments.  Possible raid types include: raid4, raid5_la,
-raid5_ls, raid5_rs, raid6_zr, raid6_nr, and raid6_nc.  (raid1 is
-planned for the future.)  The list of required and optional parameters
-is the same for all the current raid types.  The required parameters are
-positional, while the optional parameters are given as key/value pairs.
-The possible parameters are as follows:
- <chunk_size>           Chunk size in sectors.
- [[no]sync]             Force/Prevent RAID initialization
- [rebuild <idx>]        Rebuild the drive indicated by the index
- [daemon_sleep <ms>]    Time between bitmap daemon work to clear bits
- [min_recovery_rate <kB/sec/disk>]      Throttle RAID initialization
- [max_recovery_rate <kB/sec/disk>]      Throttle RAID initialization
- [max_write_behind <sectors>]           See '-write-behind=' (man mdadm)
- [stripe_cache <sectors>]               Stripe cache size for higher RAIDs
-
-Line 3 contains the list of devices that compose the array in
-metadata/data device pairs.  If the metadata is stored separately, a '-'
-is given for the metadata device position.  If a drive has failed or is
-missing at creation time, a '-' can be given for both the metadata and
-data drives for a given position.
-
-NB. Currently all metadata devices must be specified as '-'.
-
-Examples:
+The target is named "raid" and it accepts the following parameters:
+
+  <raid_type> <#raid_params> <raid_params> \
+    <#raid_devs> <metadata_dev0> <dev0> [.. <metadata_devN> <devN>]
+
+<raid_type>:
+  raid4		RAID4 dedicated parity disk
+  raid5_la	RAID5 left asymmetric
+		- rotating parity 0 with data continuation
+  raid5_ra	RAID5 right asymmetric
+		- rotating parity N with data continuation
+  raid5_ls	RAID5 left symmetric
+		- rotating parity 0 with data restart
+  raid5_rs 	RAID5 right symmetric
+		- rotating parity N with data restart
+  raid6_zr	RAID6 zero restart
+		- rotating parity zero (left-to-right) with data restart
+  raid6_nr	RAID6 N restart
+		- rotating parity N (right-to-left) with data restart
+  raid6_nc	RAID6 N continue
+		- rotating parity N (right-to-left) with data continuation
+
+  Refererence: Chapter 4 of
+  http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf
+
+<#raid_params>: The number of parameters that follow.
+
+<raid_params> consists of
+    Mandatory parameters:
+        <chunk_size>: Chunk size in sectors.  This parameter is often known as
+		      "stripe size".  It is the only mandatory parameter and
+		      is placed first.
+
+    followed by optional parameters (in any order):
+	[sync|nosync]   Force or prevent RAID initialization.
+
+	[rebuild <idx>]	Rebuild drive number idx (first drive is 0).
+
+	[daemon_sleep <ms>]
+		Interval between runs of the bitmap daemon that
+		clear bits.  A longer interval means less bitmap I/O but
+		resyncing after a failure is likely to take longer.
+
+	[min_recovery_rate <kB/sec/disk>]  Throttle RAID initialization
+	[max_recovery_rate <kB/sec/disk>]  Throttle RAID initialization
+	[max_write_behind <sectors>]       See '-write-behind=' (man mdadm)
+	[stripe_cache <sectors>]           Stripe cache size (higher RAIDs only)
+
+<#raid_devs>: The number of devices composing the array.
+	Each device consists of two entries.  The first is the device
+	containing the metadata (if any); the second is the one containing the
+	data.  Currently, separate metadata devices are not supported and '-'
+	is required in place of the metadata device.
+
+	If a drive has failed or is missing at creation time, a '-' can be
+	given for both the metadata and data drives for a given position.
+
+
+Example tables
+--------------
 # RAID4 - 4 data drives, 1 parity
 # No metadata devices specified to hold superblock/bitmap info
 # Chunk size of 1MiB
 # (Lines separated for easy reading)
+
 0 1960893648 raid \
         raid4 1 2048 \
         5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
@@ -51,20 +77,26 @@ Examples:
 # RAID4 - 4 data drives, 1 parity (no metadata devices)
 # Chunk size of 1MiB, force RAID initialization,
 #       min recovery rate at 20 kiB/sec/disk
+
 0 1960893648 raid \
         raid4 4 2048 min_recovery_rate 20 sync\
         5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
 
-Performing a 'dmsetup table' should display the CTR table used to
-construct the mapping (with possible reordering of optional
-parameters).
+'dmsetup table' displays the table used to construct the mapping.
+The optional parameters will always be printed in the order listed
+above with "sync" or "nosync" always output ahead of the other
+arguments, regardless of the order used when originally loading the table.
 
-Performing a 'dmsetup status' will yield information on the state and
-health of the array.  The output is as follows:
+'dmsetup status' yields information on the state and health of the
+array.
+The output is as follows:
 1: <s> <l> raid \
 2:      <raid_type> <#devices> <1 health char for each dev> <resync_ratio>
 
-Line 1 is standard DM output.  Line 2 is best shown by example:
+Line 1 is the standard output produced by device-mapper.
+Line 2 is produced by the raid target, and best explained by example:
         0 1960893648 raid raid4 5 AAAAA 2/490221568
 Here we can see the RAID type is raid4, there are 5 devices - all of
 which are 'A'live, and the array is 2/490221568 complete with recovery.
+Faulty or missing devices are marked 'D'.  Devices that are out-of-sync
+are marked 'a'.
-- 
cgit v1.2.3


From c1084561bb85da3630540ebe951749a8cd8fc714 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Tue, 2 Aug 2011 12:32:07 +0100
Subject: dm raid: add region_size parameter

Allow the user to specify the region_size.

Ensures that the supplied value meets md's constraints, viz. the number of
regions does not exceed 2^21.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 Documentation/device-mapper/dm-raid.txt |  4 ++
 drivers/md/dm-raid.c                    | 82 +++++++++++++++++++++++++++++++--
 2 files changed, 83 insertions(+), 3 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index 4f9dd3cecc11..be4419a30781 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -52,6 +52,10 @@ The target is named "raid" and it accepts the following parameters:
 	[max_recovery_rate <kB/sec/disk>]  Throttle RAID initialization
 	[max_write_behind <sectors>]       See '-write-behind=' (man mdadm)
 	[stripe_cache <sectors>]           Stripe cache size (higher RAIDs only)
+	[region_size <sectors>]
+		The region_size multiplied by the number of regions is the
+		logical size of the array.  The bitmap records the device
+		synchronisation state for each region.
 
 <#raid_devs>: The number of devices composing the array.
 	Each device consists of two entries.  The first is the device
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index d4e95b2e39f6..a8a1915a450d 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -52,7 +52,7 @@ struct raid_dev {
 #define DMPF_MAX_RECOVERY_RATE 0x20
 #define DMPF_MAX_WRITE_BEHIND  0x40
 #define DMPF_STRIPE_CACHE      0x80
-
+#define DMPF_REGION_SIZE       0X100
 struct raid_set {
 	struct dm_target *ti;
 
@@ -236,6 +236,67 @@ static int dev_parms(struct raid_set *rs, char **argv)
 	return 0;
 }
 
+/*
+ * validate_region_size
+ * @rs
+ * @region_size:  region size in sectors.  If 0, pick a size (4MiB default).
+ *
+ * Set rs->md.bitmap_info.chunksize (which really refers to 'region size').
+ * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap.
+ *
+ * Returns: 0 on success, -EINVAL on failure.
+ */
+static int validate_region_size(struct raid_set *rs, unsigned long region_size)
+{
+	unsigned long min_region_size = rs->ti->len / (1 << 21);
+
+	if (!region_size) {
+		/*
+		 * Choose a reasonable default.  All figures in sectors.
+		 */
+		if (min_region_size > (1 << 13)) {
+			DMINFO("Choosing default region size of %lu sectors",
+			       region_size);
+			region_size = min_region_size;
+		} else {
+			DMINFO("Choosing default region size of 4MiB");
+			region_size = 1 << 13; /* sectors */
+		}
+	} else {
+		/*
+		 * Validate user-supplied value.
+		 */
+		if (region_size > rs->ti->len) {
+			rs->ti->error = "Supplied region size is too large";
+			return -EINVAL;
+		}
+
+		if (region_size < min_region_size) {
+			DMERR("Supplied region_size (%lu sectors) below minimum (%lu)",
+			      region_size, min_region_size);
+			rs->ti->error = "Supplied region size is too small";
+			return -EINVAL;
+		}
+
+		if (!is_power_of_2(region_size)) {
+			rs->ti->error = "Region size is not a power of 2";
+			return -EINVAL;
+		}
+
+		if (region_size < rs->md.chunk_sectors) {
+			rs->ti->error = "Region size is smaller than the chunk size";
+			return -EINVAL;
+		}
+	}
+
+	/*
+	 * Convert sectors to bytes.
+	 */
+	rs->md.bitmap_info.chunksize = (region_size << 9);
+
+	return 0;
+}
+
 /*
  * Possible arguments are...
  * RAID456:
@@ -249,12 +310,13 @@ static int dev_parms(struct raid_set *rs, char **argv)
  *    [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
  *    [max_write_behind <sectors>]	See '-write-behind=' (man mdadm)
  *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
+ *    [region_size <sectors>]           Defines granularity of bitmap
  */
 static int parse_raid_params(struct raid_set *rs, char **argv,
 			     unsigned num_raid_params)
 {
 	unsigned i, rebuild_cnt = 0;
-	unsigned long value;
+	unsigned long value, region_size = 0;
 	char *key;
 
 	/*
@@ -365,6 +427,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				return -EINVAL;
 			}
 			rs->md.sync_speed_max = (int)value;
+		} else if (!strcasecmp(key, "region_size")) {
+			rs->print_flags |= DMPF_REGION_SIZE;
+			region_size = value;
 		} else {
 			DMERR("Unable to parse RAID parameter: %s", key);
 			rs->ti->error = "Unable to parse RAID parameters";
@@ -372,6 +437,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 		}
 	}
 
+	if (validate_region_size(rs, region_size))
+		return -EINVAL;
+
+	if (rs->md.chunk_sectors)
+		rs->ti->split_io = rs->md.chunk_sectors;
+	else
+		rs->ti->split_io = region_size;
+
 	/* Assume there are no metadata devices until the drives are parsed */
 	rs->md.persistent = 0;
 	rs->md.external = 1;
@@ -469,7 +542,6 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		goto bad;
 
 	INIT_WORK(&rs->md.event_work, do_table_event);
-	ti->split_io = rs->md.chunk_sectors;
 	ti->private = rs;
 
 	mutex_lock(&rs->md.reconfig_mutex);
@@ -596,6 +668,10 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 			       conf ? conf->max_nr_stripes * 2 : 0);
 		}
 
+		if (rs->print_flags & DMPF_REGION_SIZE)
+			DMEMIT(" region_size %lu",
+			       rs->md.bitmap_info.chunksize >> 9);
+
 		DMEMIT(" %d", rs->md.raid_disks);
 		for (i = 0; i < rs->md.raid_disks; i++) {
 			DMEMIT(" -"); /* metadata device */
-- 
cgit v1.2.3


From 46bed2b5c16bb7c82e1088d7ae75fb958c8a8c4e Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Tue, 2 Aug 2011 12:32:07 +0100
Subject: dm raid: add write_mostly parameter

Add the write_mostly parameter to RAID1 dm-raid tables.

This allows the user to set the WriteMostly flag on a RAID1 device that
should normally be avoided for read I/O.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 Documentation/device-mapper/dm-raid.txt |  4 +++-
 drivers/md/dm-raid.c                    | 26 +++++++++++++++++++++++++-
 2 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index be4419a30781..a7d1c4abc927 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -50,6 +50,7 @@ The target is named "raid" and it accepts the following parameters:
 
 	[min_recovery_rate <kB/sec/disk>]  Throttle RAID initialization
 	[max_recovery_rate <kB/sec/disk>]  Throttle RAID initialization
+	[write_mostly <idx>]		   Drive index is write-mostly
 	[max_write_behind <sectors>]       See '-write-behind=' (man mdadm)
 	[stripe_cache <sectors>]           Stripe cache size (higher RAIDs only)
 	[region_size <sectors>]
@@ -87,9 +88,10 @@ Example tables
         5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
 
 'dmsetup table' displays the table used to construct the mapping.
-The optional parameters will always be printed in the order listed
+The optional parameters are always printed in the order listed
 above with "sync" or "nosync" always output ahead of the other
 arguments, regardless of the order used when originally loading the table.
+Arguments that can be repeated are ordered by value.
 
 'dmsetup status' yields information on the state and health of the
 array.
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index a8a1915a450d..88143a0303d2 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -308,6 +308,7 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
  *    [daemon_sleep <ms>]		Time between bitmap daemon work to clear bits
  *    [min_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
  *    [max_recovery_rate <kB/sec/disk>]	Throttle RAID initialization
+ *    [write_mostly <idx>]		Indicate a write mostly drive via index
  *    [max_write_behind <sectors>]	See '-write-behind=' (man mdadm)
  *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
  *    [region_size <sectors>]           Defines granularity of bitmap
@@ -376,7 +377,21 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 			clear_bit(In_sync, &rs->dev[value].rdev.flags);
 			rs->dev[value].rdev.recovery_offset = 0;
 			rs->print_flags |= DMPF_REBUILD;
+		} else if (!strcasecmp(key, "write_mostly")) {
+			if (rs->raid_type->level != 1) {
+				rs->ti->error = "write_mostly option is only valid for RAID1";
+				return -EINVAL;
+			}
+			if (value > rs->md.raid_disks) {
+				rs->ti->error = "Invalid write_mostly drive index given";
+				return -EINVAL;
+			}
+			set_bit(WriteMostly, &rs->dev[value].rdev.flags);
 		} else if (!strcasecmp(key, "max_write_behind")) {
+			if (rs->raid_type->level != 1) {
+				rs->ti->error = "max_write_behind option is only valid for RAID1";
+				return -EINVAL;
+			}
 			rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
 
 			/*
@@ -621,11 +636,15 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 		break;
 	case STATUSTYPE_TABLE:
 		/* The string you would use to construct this array */
-		for (i = 0; i < rs->md.raid_disks; i++)
+		for (i = 0; i < rs->md.raid_disks; i++) {
 			if ((rs->print_flags & DMPF_REBUILD) &&
 			    rs->dev[i].data_dev &&
 			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
 				raid_param_cnt += 2; /* for rebuilds */
+			if (rs->dev[i].data_dev &&
+			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
+				raid_param_cnt += 2;
+		}
 
 		raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2);
 		if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
@@ -656,6 +675,11 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 		if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
 			DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
 
+		for (i = 0; i < rs->md.raid_disks; i++)
+			if (rs->dev[i].data_dev &&
+			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
+				DMEMIT(" write_mostly %u", i);
+
 		if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
 			DMEMIT(" max_write_behind %lu",
 			       rs->md.bitmap_info.max_write_behind);
-- 
cgit v1.2.3


From b12d437b73d32203a41fde0d407e91812c866844 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Tue, 2 Aug 2011 12:32:07 +0100
Subject: dm raid: support metadata devices

Add the ability to parse and use metadata devices to dm-raid.  Although
not strictly required, without the metadata devices, many features of
RAID are unavailable.  They are used to store a superblock and bitmap.

The role, or position in the array, of each device must be recorded in
its superblock.  This is to help with fault handling, array reshaping,
and sanity checks.  RAID 4/5/6 devices must be loaded in a specific order:
in this way, the 'array_position' field helps validate the correctness
of the mapping when it is loaded.  It can be used during reshaping to
identify which devices are added/removed.  Fault handling is impossible
without this field.  For example, when a device fails it is recorded in
the superblock.  If this is a RAID1 device and the offending device is
removed from the array, there must be a way during subsequent array
assembly to determine that the failed device was the one removed.  This
is done by correlating the 'array_position' field and the bit-field
variable 'failed_devices'.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 Documentation/device-mapper/dm-raid.txt |  12 +-
 drivers/md/Kconfig                      |   5 +-
 drivers/md/dm-raid.c                    | 419 ++++++++++++++++++++++++++++++--
 3 files changed, 412 insertions(+), 24 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index a7d1c4abc927..2a8c11331d2d 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -11,6 +11,7 @@ The target is named "raid" and it accepts the following parameters:
     <#raid_devs> <metadata_dev0> <dev0> [.. <metadata_devN> <devN>]
 
 <raid_type>:
+  raid1		RAID1 mirroring
   raid4		RAID4 dedicated parity disk
   raid5_la	RAID5 left asymmetric
 		- rotating parity 0 with data continuation
@@ -61,8 +62,7 @@ The target is named "raid" and it accepts the following parameters:
 <#raid_devs>: The number of devices composing the array.
 	Each device consists of two entries.  The first is the device
 	containing the metadata (if any); the second is the one containing the
-	data.  Currently, separate metadata devices are not supported and '-'
-	is required in place of the metadata device.
+	data.
 
 	If a drive has failed or is missing at creation time, a '-' can be
 	given for both the metadata and data drives for a given position.
@@ -70,7 +70,7 @@ The target is named "raid" and it accepts the following parameters:
 
 Example tables
 --------------
-# RAID4 - 4 data drives, 1 parity
+# RAID4 - 4 data drives, 1 parity (no metadata devices)
 # No metadata devices specified to hold superblock/bitmap info
 # Chunk size of 1MiB
 # (Lines separated for easy reading)
@@ -79,13 +79,13 @@ Example tables
         raid4 1 2048 \
         5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
 
-# RAID4 - 4 data drives, 1 parity (no metadata devices)
+# RAID4 - 4 data drives, 1 parity (with metadata devices)
 # Chunk size of 1MiB, force RAID initialization,
 #       min recovery rate at 20 kiB/sec/disk
 
 0 1960893648 raid \
-        raid4 4 2048 min_recovery_rate 20 sync\
-        5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
+        raid4 4 2048 sync min_recovery_rate 20 \
+        5 8:17 8:18 8:33 8:34 8:49 8:50 8:65 8:66 8:81 8:82
 
 'dmsetup table' displays the table used to construct the mapping.
 The optional parameters are always printed in the order listed
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 8420129fc5ee..f75a66e7d312 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -241,12 +241,13 @@ config DM_MIRROR
          needed for live data migration tools such as 'pvmove'.
 
 config DM_RAID
-       tristate "RAID 4/5/6 target (EXPERIMENTAL)"
+       tristate "RAID 1/4/5/6 target (EXPERIMENTAL)"
        depends on BLK_DEV_DM && EXPERIMENTAL
+       select MD_RAID1
        select MD_RAID456
        select BLK_DEV_MD
        ---help---
-	 A dm target that supports RAID4, RAID5 and RAID6 mappings
+	 A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings
 
 	 A RAID-5 set of N drives with a capacity of C MB per drive provides
 	 the capacity of C * (N - 1) MB, and protects against a failure
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 88143a0303d2..69873806c50c 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -16,12 +16,10 @@
 #define DM_MSG_PREFIX "raid"
 
 /*
- * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then
- * make it so the flag doesn't set anything.
+ * The following flags are used by dm-raid.c to set up the array state.
+ * They must be cleared before md_run is called.
  */
-#ifndef MD_SYNC_STATE_FORCED
-#define MD_SYNC_STATE_FORCED 0
-#endif
+#define FirstUse 10             /* rdev flag */
 
 struct raid_dev {
 	/*
@@ -149,9 +147,16 @@ static void context_free(struct raid_set *rs)
 {
 	int i;
 
-	for (i = 0; i < rs->md.raid_disks; i++)
+	for (i = 0; i < rs->md.raid_disks; i++) {
+		if (rs->dev[i].meta_dev)
+			dm_put_device(rs->ti, rs->dev[i].meta_dev);
+		if (rs->dev[i].rdev.sb_page)
+			put_page(rs->dev[i].rdev.sb_page);
+		rs->dev[i].rdev.sb_page = NULL;
+		rs->dev[i].rdev.sb_loaded = 0;
 		if (rs->dev[i].data_dev)
 			dm_put_device(rs->ti, rs->dev[i].data_dev);
+	}
 
 	kfree(rs);
 }
@@ -161,7 +166,16 @@ static void context_free(struct raid_set *rs)
  *  <meta_dev>: meta device name or '-' if missing
  *  <data_dev>: data device name or '-' if missing
  *
- * This code parses those words.
+ * The following are permitted:
+ *    - -
+ *    - <data_dev>
+ *    <meta_dev> <data_dev>
+ *
+ * The following is not allowed:
+ *    <meta_dev> -
+ *
+ * This code parses those words.  If there is a failure,
+ * the caller must use context_free to unwind the operations.
  */
 static int dev_parms(struct raid_set *rs, char **argv)
 {
@@ -184,8 +198,16 @@ static int dev_parms(struct raid_set *rs, char **argv)
 		rs->dev[i].rdev.mddev = &rs->md;
 
 		if (strcmp(argv[0], "-")) {
-			rs->ti->error = "Metadata devices not supported";
-			return -EINVAL;
+			ret = dm_get_device(rs->ti, argv[0],
+					    dm_table_get_mode(rs->ti->table),
+					    &rs->dev[i].meta_dev);
+			rs->ti->error = "RAID metadata device lookup failure";
+			if (ret)
+				return ret;
+
+			rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
+			if (!rs->dev[i].rdev.sb_page)
+				return -ENOMEM;
 		}
 
 		if (!strcmp(argv[1], "-")) {
@@ -195,6 +217,10 @@ static int dev_parms(struct raid_set *rs, char **argv)
 				return -EINVAL;
 			}
 
+			rs->ti->error = "No data device supplied with metadata device";
+			if (rs->dev[i].meta_dev)
+				return -EINVAL;
+
 			continue;
 		}
 
@@ -206,6 +232,10 @@ static int dev_parms(struct raid_set *rs, char **argv)
 			return ret;
 		}
 
+		if (rs->dev[i].meta_dev) {
+			metadata_available = 1;
+			rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
+		}
 		rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
 		list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
 		if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
@@ -334,22 +364,39 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 	num_raid_params--;
 
 	/*
-	 * Second, parse the unordered optional arguments
+	 * We set each individual device as In_sync with a completed
+	 * 'recovery_offset'.  If there has been a device failure or
+	 * replacement then one of the following cases applies:
+	 *
+	 *   1) User specifies 'rebuild'.
+	 *      - Device is reset when param is read.
+	 *   2) A new device is supplied.
+	 *      - No matching superblock found, resets device.
+	 *   3) Device failure was transient and returns on reload.
+	 *      - Failure noticed, resets device for bitmap replay.
+	 *   4) Device hadn't completed recovery after previous failure.
+	 *      - Superblock is read and overrides recovery_offset.
+	 *
+	 * What is found in the superblocks of the devices is always
+	 * authoritative, unless 'rebuild' or '[no]sync' was specified.
 	 */
-	for (i = 0; i < rs->md.raid_disks; i++)
+	for (i = 0; i < rs->md.raid_disks; i++) {
 		set_bit(In_sync, &rs->dev[i].rdev.flags);
+		rs->dev[i].rdev.recovery_offset = MaxSector;
+	}
 
+	/*
+	 * Second, parse the unordered optional arguments
+	 */
 	for (i = 0; i < num_raid_params; i++) {
 		if (!strcasecmp(argv[i], "nosync")) {
 			rs->md.recovery_cp = MaxSector;
 			rs->print_flags |= DMPF_NOSYNC;
-			rs->md.flags |= MD_SYNC_STATE_FORCED;
 			continue;
 		}
 		if (!strcasecmp(argv[i], "sync")) {
 			rs->md.recovery_cp = 0;
 			rs->print_flags |= DMPF_SYNC;
-			rs->md.flags |= MD_SYNC_STATE_FORCED;
 			continue;
 		}
 
@@ -481,14 +528,345 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
 	return md_raid5_congested(&rs->md, bits);
 }
 
+/*
+ * This structure is never routinely used by userspace, unlike md superblocks.
+ * Devices with this superblock should only ever be accessed via device-mapper.
+ */
+#define DM_RAID_MAGIC 0x64526D44
+struct dm_raid_superblock {
+	__le32 magic;		/* "DmRd" */
+	__le32 features;	/* Used to indicate possible future changes */
+
+	__le32 num_devices;	/* Number of devices in this array. (Max 64) */
+	__le32 array_position;	/* The position of this drive in the array */
+
+	__le64 events;		/* Incremented by md when superblock updated */
+	__le64 failed_devices;	/* Bit field of devices to indicate failures */
+
+	/*
+	 * This offset tracks the progress of the repair or replacement of
+	 * an individual drive.
+	 */
+	__le64 disk_recovery_offset;
+
+	/*
+	 * This offset tracks the progress of the initial array
+	 * synchronisation/parity calculation.
+	 */
+	__le64 array_resync_offset;
+
+	/*
+	 * RAID characteristics
+	 */
+	__le32 level;
+	__le32 layout;
+	__le32 stripe_sectors;
+
+	__u8 pad[452];		/* Round struct to 512 bytes. */
+				/* Always set to 0 when writing. */
+} __packed;
+
+static int read_disk_sb(mdk_rdev_t *rdev, int size)
+{
+	BUG_ON(!rdev->sb_page);
+
+	if (rdev->sb_loaded)
+		return 0;
+
+	if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
+		DMERR("Failed to read device superblock");
+		return -EINVAL;
+	}
+
+	rdev->sb_loaded = 1;
+
+	return 0;
+}
+
+static void super_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	mdk_rdev_t *r, *t;
+	uint64_t failed_devices;
+	struct dm_raid_superblock *sb;
+
+	sb = page_address(rdev->sb_page);
+	failed_devices = le64_to_cpu(sb->failed_devices);
+
+	rdev_for_each(r, t, mddev)
+		if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
+			failed_devices |= (1ULL << r->raid_disk);
+
+	memset(sb, 0, sizeof(*sb));
+
+	sb->magic = cpu_to_le32(DM_RAID_MAGIC);
+	sb->features = cpu_to_le32(0);	/* No features yet */
+
+	sb->num_devices = cpu_to_le32(mddev->raid_disks);
+	sb->array_position = cpu_to_le32(rdev->raid_disk);
+
+	sb->events = cpu_to_le64(mddev->events);
+	sb->failed_devices = cpu_to_le64(failed_devices);
+
+	sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
+	sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
+
+	sb->level = cpu_to_le32(mddev->level);
+	sb->layout = cpu_to_le32(mddev->layout);
+	sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
+}
+
+/*
+ * super_load
+ *
+ * This function creates a superblock if one is not found on the device
+ * and will decide which superblock to use if there's a choice.
+ *
+ * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
+ */
+static int super_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev)
+{
+	int ret;
+	struct dm_raid_superblock *sb;
+	struct dm_raid_superblock *refsb;
+	uint64_t events_sb, events_refsb;
+
+	rdev->sb_start = 0;
+	rdev->sb_size = sizeof(*sb);
+
+	ret = read_disk_sb(rdev, rdev->sb_size);
+	if (ret)
+		return ret;
+
+	sb = page_address(rdev->sb_page);
+	if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) {
+		super_sync(rdev->mddev, rdev);
+
+		set_bit(FirstUse, &rdev->flags);
+
+		/* Force writing of superblocks to disk */
+		set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
+
+		/* Any superblock is better than none, choose that if given */
+		return refdev ? 0 : 1;
+	}
+
+	if (!refdev)
+		return 1;
+
+	events_sb = le64_to_cpu(sb->events);
+
+	refsb = page_address(refdev->sb_page);
+	events_refsb = le64_to_cpu(refsb->events);
+
+	return (events_sb > events_refsb) ? 1 : 0;
+}
+
+static int super_init_validation(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	int role;
+	struct raid_set *rs = container_of(mddev, struct raid_set, md);
+	uint64_t events_sb;
+	uint64_t failed_devices;
+	struct dm_raid_superblock *sb;
+	uint32_t new_devs = 0;
+	uint32_t rebuilds = 0;
+	mdk_rdev_t *r, *t;
+	struct dm_raid_superblock *sb2;
+
+	sb = page_address(rdev->sb_page);
+	events_sb = le64_to_cpu(sb->events);
+	failed_devices = le64_to_cpu(sb->failed_devices);
+
+	/*
+	 * Initialise to 1 if this is a new superblock.
+	 */
+	mddev->events = events_sb ? : 1;
+
+	/*
+	 * Reshaping is not currently allowed
+	 */
+	if ((le32_to_cpu(sb->level) != mddev->level) ||
+	    (le32_to_cpu(sb->layout) != mddev->layout) ||
+	    (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
+		DMERR("Reshaping arrays not yet supported.");
+		return -EINVAL;
+	}
+
+	/* We can only change the number of devices in RAID1 right now */
+	if ((rs->raid_type->level != 1) &&
+	    (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
+		DMERR("Reshaping arrays not yet supported.");
+		return -EINVAL;
+	}
+
+	if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
+		mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
+
+	/*
+	 * During load, we set FirstUse if a new superblock was written.
+	 * There are two reasons we might not have a superblock:
+	 * 1) The array is brand new - in which case, all of the
+	 *    devices must have their In_sync bit set.  Also,
+	 *    recovery_cp must be 0, unless forced.
+	 * 2) This is a new device being added to an old array
+	 *    and the new device needs to be rebuilt - in which
+	 *    case the In_sync bit will /not/ be set and
+	 *    recovery_cp must be MaxSector.
+	 */
+	rdev_for_each(r, t, mddev) {
+		if (!test_bit(In_sync, &r->flags)) {
+			if (!test_bit(FirstUse, &r->flags))
+				DMERR("Superblock area of "
+				      "rebuild device %d should have been "
+				      "cleared.", r->raid_disk);
+			set_bit(FirstUse, &r->flags);
+			rebuilds++;
+		} else if (test_bit(FirstUse, &r->flags))
+			new_devs++;
+	}
+
+	if (!rebuilds) {
+		if (new_devs == mddev->raid_disks) {
+			DMINFO("Superblocks created for new array");
+			set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
+		} else if (new_devs) {
+			DMERR("New device injected "
+			      "into existing array without 'rebuild' "
+			      "parameter specified");
+			return -EINVAL;
+		}
+	} else if (new_devs) {
+		DMERR("'rebuild' devices cannot be "
+		      "injected into an array with other first-time devices");
+		return -EINVAL;
+	} else if (mddev->recovery_cp != MaxSector) {
+		DMERR("'rebuild' specified while array is not in-sync");
+		return -EINVAL;
+	}
+
+	/*
+	 * Now we set the Faulty bit for those devices that are
+	 * recorded in the superblock as failed.
+	 */
+	rdev_for_each(r, t, mddev) {
+		if (!r->sb_page)
+			continue;
+		sb2 = page_address(r->sb_page);
+		sb2->failed_devices = 0;
+
+		/*
+		 * Check for any device re-ordering.
+		 */
+		if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
+			role = le32_to_cpu(sb2->array_position);
+			if (role != r->raid_disk) {
+				if (rs->raid_type->level != 1) {
+					rs->ti->error = "Cannot change device "
+						"positions in RAID array";
+					return -EINVAL;
+				}
+				DMINFO("RAID1 device #%d now at position #%d",
+				       role, r->raid_disk);
+			}
+
+			/*
+			 * Partial recovery is performed on
+			 * returning failed devices.
+			 */
+			if (failed_devices & (1 << role))
+				set_bit(Faulty, &r->flags);
+		}
+	}
+
+	return 0;
+}
+
+static int super_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	struct dm_raid_superblock *sb = page_address(rdev->sb_page);
+
+	/*
+	 * If mddev->events is not set, we know we have not yet initialized
+	 * the array.
+	 */
+	if (!mddev->events && super_init_validation(mddev, rdev))
+		return -EINVAL;
+
+	mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
+	rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
+	if (!test_bit(FirstUse, &rdev->flags)) {
+		rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
+		if (rdev->recovery_offset != MaxSector)
+			clear_bit(In_sync, &rdev->flags);
+	}
+
+	/*
+	 * If a device comes back, set it as not In_sync and no longer faulty.
+	 */
+	if (test_bit(Faulty, &rdev->flags)) {
+		clear_bit(Faulty, &rdev->flags);
+		clear_bit(In_sync, &rdev->flags);
+		rdev->saved_raid_disk = rdev->raid_disk;
+		rdev->recovery_offset = 0;
+	}
+
+	clear_bit(FirstUse, &rdev->flags);
+
+	return 0;
+}
+
+/*
+ * Analyse superblocks and select the freshest.
+ */
+static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
+{
+	int ret;
+	mdk_rdev_t *rdev, *freshest, *tmp;
+	mddev_t *mddev = &rs->md;
+
+	freshest = NULL;
+	rdev_for_each(rdev, tmp, mddev) {
+		if (!rdev->meta_bdev)
+			continue;
+
+		ret = super_load(rdev, freshest);
+
+		switch (ret) {
+		case 1:
+			freshest = rdev;
+			break;
+		case 0:
+			break;
+		default:
+			ti->error = "Failed to load superblock";
+			return ret;
+		}
+	}
+
+	if (!freshest)
+		return 0;
+
+	/*
+	 * Validation of the freshest device provides the source of
+	 * validation for the remaining devices.
+	 */
+	ti->error = "Unable to assemble array: Invalid superblocks";
+	if (super_validate(mddev, freshest))
+		return -EINVAL;
+
+	rdev_for_each(rdev, tmp, mddev)
+		if ((rdev != freshest) && super_validate(mddev, rdev))
+			return -EINVAL;
+
+	return 0;
+}
+
 /*
  * Construct a RAID4/5/6 mapping:
  * Args:
  *	<raid_type> <#raid_params> <raid_params>		\
  *	<#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
  *
- * ** metadata devices are not supported yet, use '-' instead **
- *
  * <raid_params> varies by <raid_type>.  See 'parse_raid_params' for
  * details on possible <raid_params>.
  */
@@ -556,6 +934,11 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (ret)
 		goto bad;
 
+	rs->md.sync_super = super_sync;
+	ret = analyse_superblocks(ti, rs);
+	if (ret)
+		goto bad;
+
 	INIT_WORK(&rs->md.event_work, do_table_event);
 	ti->private = rs;
 
@@ -698,7 +1081,10 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 
 		DMEMIT(" %d", rs->md.raid_disks);
 		for (i = 0; i < rs->md.raid_disks; i++) {
-			DMEMIT(" -"); /* metadata device */
+			if (rs->dev[i].meta_dev)
+				DMEMIT(" %s", rs->dev[i].meta_dev->name);
+			else
+				DMEMIT(" -");
 
 			if (rs->dev[i].data_dev)
 				DMEMIT(" %s", rs->dev[i].data_dev->name);
@@ -755,6 +1141,7 @@ static void raid_resume(struct dm_target *ti)
 {
 	struct raid_set *rs = ti->private;
 
+	bitmap_load(&rs->md);
 	mddev_resume(&rs->md);
 }
 
-- 
cgit v1.2.3


From 772ae5f54d69c38a5e3c4352c5fdbdaff141af21 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Tue, 2 Aug 2011 12:32:08 +0100
Subject: dm crypt: optionally support discard requests

Add optional parameter field to dmcrypt table and support
"allow_discards" option.

Discard requests bypass crypt queue processing. Bio is simple remapped
to underlying device.

Note that discard will be never enabled by default because of security
consequences.  It is up to the administrator to enable it for encrypted
devices.

(Note that userspace cryptsetup does not understand new optional
parameters yet.  Support for this will come later.  Until then, you
should use 'dmsetup' to enable and disable this.)

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 Documentation/device-mapper/dm-crypt.txt | 21 +++++++++++++-
 drivers/md/dm-crypt.c                    | 49 +++++++++++++++++++++++++++++---
 2 files changed, 65 insertions(+), 5 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt
index 6b5c42dbbe84..2c656ae43ba7 100644
--- a/Documentation/device-mapper/dm-crypt.txt
+++ b/Documentation/device-mapper/dm-crypt.txt
@@ -4,7 +4,8 @@ dm-crypt
 Device-Mapper's "crypt" target provides transparent encryption of block devices
 using the kernel crypto API.
 
-Parameters: <cipher> <key> <iv_offset> <device path> <offset>
+Parameters: <cipher> <key> <iv_offset> <device path> \
+	      <offset> [<#opt_params> <opt_params>]
 
 <cipher>
     Encryption cipher and an optional IV generation mode.
@@ -37,6 +38,24 @@ Parameters: <cipher> <key> <iv_offset> <device path> <offset>
 <offset>
     Starting sector within the device where the encrypted data begins.
 
+<#opt_params>
+    Number of optional parameters. If there are no optional parameters,
+    the optional paramaters section can be skipped or #opt_params can be zero.
+    Otherwise #opt_params is the number of following arguments.
+
+    Example of optional parameters section:
+        1 allow_discards
+
+allow_discards
+    Block discard requests (a.k.a. TRIM) are passed through the crypt device.
+    The default is to ignore discard requests.
+
+    WARNING: Assess the specific security risks carefully before enabling this
+    option.  For example, allowing discards on encrypted devices may lead to
+    the leak of information about the ciphertext device (filesystem type,
+    used space etc.) if the discarded blocks can be located easily on the
+    device later.
+
 Example scripts
 ===============
 LUKS (Linux Unified Key Setup) is now the preferred way to set up disk
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index b79e7472a9b3..49da55c1528a 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1574,11 +1574,17 @@ bad_mem:
 static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	struct crypt_config *cc;
-	unsigned int key_size;
+	unsigned int key_size, opt_params;
 	unsigned long long tmpll;
 	int ret;
+	struct dm_arg_set as;
+	const char *opt_string;
+
+	static struct dm_arg _args[] = {
+		{0, 1, "Invalid number of feature args"},
+	};
 
-	if (argc != 5) {
+	if (argc < 5) {
 		ti->error = "Not enough arguments";
 		return -EINVAL;
 	}
@@ -1647,6 +1653,30 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 	cc->start = tmpll;
 
+	argv += 5;
+	argc -= 5;
+
+	/* Optional parameters */
+	if (argc) {
+		as.argc = argc;
+		as.argv = argv;
+
+		ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+		if (ret)
+			goto bad;
+
+		opt_string = dm_shift_arg(&as);
+
+		if (opt_params == 1 && opt_string &&
+		    !strcasecmp(opt_string, "allow_discards"))
+			ti->num_discard_requests = 1;
+		else if (opt_params) {
+			ret = -EINVAL;
+			ti->error = "Invalid feature arguments";
+			goto bad;
+		}
+	}
+
 	ret = -ENOMEM;
 	cc->io_queue = alloc_workqueue("kcryptd_io",
 				       WQ_NON_REENTRANT|
@@ -1681,9 +1711,16 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 	struct dm_crypt_io *io;
 	struct crypt_config *cc;
 
-	if (bio->bi_rw & REQ_FLUSH) {
+	/*
+	 * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues.
+	 * - for REQ_FLUSH device-mapper core ensures that no IO is in-flight
+	 * - for REQ_DISCARD caller must use flush if IO ordering matters
+	 */
+	if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) {
 		cc = ti->private;
 		bio->bi_bdev = cc->dev->bdev;
+		if (bio_sectors(bio))
+			bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector);
 		return DM_MAPIO_REMAPPED;
 	}
 
@@ -1726,6 +1763,10 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
 
 		DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset,
 				cc->dev->name, (unsigned long long)cc->start);
+
+		if (ti->num_discard_requests)
+			DMEMIT(" 1 allow_discards");
+
 		break;
 	}
 	return 0;
@@ -1822,7 +1863,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
 
 static struct target_type crypt_target = {
 	.name   = "crypt",
-	.version = {1, 10, 0},
+	.version = {1, 11, 0},
 	.module = THIS_MODULE,
 	.ctr    = crypt_ctr,
 	.dtr    = crypt_dtr,
-- 
cgit v1.2.3