From 36fa30636fb84b209210299684e1be66d9e58217 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:45 -0700 Subject: [PATCH] md: allow hot-add and hot-remove of md intent logging bitmaps Both file-bitmaps and superblock bitmaps are supported. If you add a bitmap file on the array device, you lose. This introduces a 'default_bitmap_offset' field in mddev, as the ioctl used for adding a superblock bitmap doesn't have room for giving an offset. Later, this value will be settable via sysfs. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 8c14ba565a45..817062bf7352 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -278,6 +278,10 @@ struct mddev_s * start of bitmap. May be * negative, but not '0' */ + long default_bitmap_offset; /* this is the offset to use when + * hot-adding a bitmap. It should + * eventually be settable by sysfs. + */ struct list_head all_mddevs; }; @@ -314,6 +318,12 @@ struct mdk_personality_s int (*resize) (mddev_t *mddev, sector_t sectors); int (*reshape) (mddev_t *mddev, int raid_disks); int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); + /* quiesce moves between quiescence states * 0 - fully active * 1 - no new requests allowed * others - reserved */ + void (*quiesce) (mddev_t *mddev, int state); }; -- cgit v1.2.3 From 8ddf9efe6708f3674f0ddfeb6425fd27bea109a2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:45 -0700 Subject: [PATCH] md: support write-mostly device in raid1 This allows a device in a raid1 to be marked as "write mostly". Read requests will only be sent if there is no other option.
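The policy is easy to state: prefer any in-sync mirror that is not marked write-mostly, and fall back to a write-mostly mirror only when nothing else is usable. As a rough illustration only (the struct and function names here are invented, not the kernel's), the selection rule reduces to:

    #include <stdio.h>

    struct mirror {
        int in_sync;       /* device is a full member of the array */
        int write_mostly;  /* avoid reading from it if possible */
    };

    /* Return the index of the preferred read disk, or -1 if none is usable. */
    static int pick_read_disk(const struct mirror *m, int n)
    {
        int fallback = -1;
        for (int i = 0; i < n; i++) {
            if (!m[i].in_sync)
                continue;
            if (!m[i].write_mostly)
                return i;       /* first ordinary in-sync mirror wins */
            if (fallback < 0)
                fallback = i;   /* remember a write-mostly candidate */
        }
        return fallback;
    }

    int main(void)
    {
        struct mirror m[] = { {1, 1}, {1, 0}, {0, 0} };
        printf("read from mirror %d\n", pick_read_disk(m, 3)); /* prints 1 */
        return 0;
    }

The read_balance() changes below implement the same preference, folded into the kernel's existing head-distance and pending-I/O balancing.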
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 18 +++++++++++ drivers/md/raid1.c | 76 ++++++++++++++++++++++++++++++++--------------- include/linux/raid/md_k.h | 3 ++ include/linux/raid/md_p.h | 11 +++++-- 4 files changed, 82 insertions(+), 26 deletions(-) (limited to 'include/linux/raid') diff --git a/drivers/md/md.c b/drivers/md/md.c index ae654466dc23..f1ac356e656d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -670,6 +670,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) if (mddev->level != LEVEL_MULTIPATH) { rdev->faulty = 0; + rdev->flags = 0; desc = sb->disks + rdev->desc_nr; if (desc->state & (1<<MD_DISK_SYNC)) { rdev->in_sync = 1; rdev->raid_disk = desc->raid_disk; } + if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) + set_bit(WriteMostly, &rdev->flags); } else /* MULTIPATH are always insync */ rdev->in_sync = 1; return 0; @@ -777,6 +780,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) spare++; working++; } + if (test_bit(WriteMostly, &rdev2->flags)) + d->state |= (1<<MD_DISK_WRITEMOSTLY); rdev->raid_disk = role; break; } + rdev->flags = 0; + if (sb->devflags & WriteMostly1) + set_bit(WriteMostly, &rdev->flags); } else /* MULTIPATH are always insync */ rdev->in_sync = 1; @@ -2152,6 +2160,8 @@ static int get_disk_info(mddev_t * mddev, void __user * arg) info.state |= (1<<MD_DISK_SYNC); + if (test_bit(WriteMostly, &rdev->flags)) + info.state |= (1<<MD_DISK_WRITEMOSTLY); rdev->saved_raid_disk = rdev->raid_disk; rdev->in_sync = 0; /* just to be sure */ + if (info->state & (1<<MD_DISK_WRITEMOSTLY)) + set_bit(WriteMostly, &rdev->flags); rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (err) @@ -2277,6 +2290,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) else rdev->in_sync = 0; + if (info->state & (1<<MD_DISK_WRITEMOSTLY)) + set_bit(WriteMostly, &rdev->flags); + err = bind_rdev_to_array(rdev, mddev); if (err) { export_rdev(rdev); @@ -3329,6 +3345,8 @@ static int md_seq_show(struct seq_file *seq, void *v) char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", bdevname(rdev->bdev,b), rdev->desc_nr); + if (test_bit(WriteMostly, &rdev->flags)) + seq_printf(seq, "(W)"); if (rdev->faulty) { seq_printf(seq, "(F)"); continue; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ba643e4bfac9..28839a8193f2 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -360,13 +360,14 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) { const unsigned long this_sector = r1_bio->sector; int new_disk = conf->last_used, disk = new_disk; + int wonly_disk = -1; const int sectors = r1_bio->sectors; sector_t new_distance, current_distance; - mdk_rdev_t *new_rdev, *rdev; + mdk_rdev_t *rdev; rcu_read_lock(); /* - * Check if it if we can balance. We can balance on the whole + * Check if we can balance. We can balance on the whole * device if no resync is going on, or below the resync window. * We take the first readable disk when above the resync window.
*/ @@ -376,11 +377,16 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) /* Choose the first operational device, for consistency */ new_disk = 0; - while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || - !new_rdev->in_sync) { - new_disk++; - if (new_disk == conf->raid_disks) { - new_disk = -1; + for (rdev = conf->mirrors[new_disk].rdev; + !rdev || !rdev->in_sync + || test_bit(WriteMostly, &rdev->flags); + rdev = conf->mirrors[++new_disk].rdev) { + + if (rdev && rdev->in_sync) + wonly_disk = new_disk; + + if (new_disk == conf->raid_disks - 1) { + new_disk = wonly_disk; break; } } @@ -389,16 +395,26 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) /* make sure the disk is operational */ - while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || - !new_rdev->in_sync) { + for (rdev = conf->mirrors[new_disk].rdev; + !rdev || !rdev->in_sync || + test_bit(WriteMostly, &rdev->flags); + rdev = conf->mirrors[new_disk].rdev) { + + if (rdev && rdev->in_sync) + wonly_disk = new_disk; + if (new_disk <= 0) new_disk = conf->raid_disks; new_disk--; if (new_disk == disk) { - new_disk = -1; - goto rb_out; + new_disk = wonly_disk; + break; } } + + if (new_disk < 0) + goto rb_out; + disk = new_disk; /* now disk == new_disk == starting point for search */ @@ -419,37 +435,41 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) disk = conf->raid_disks; disk--; - if ((rdev=conf->mirrors[disk].rdev) == NULL || - !rdev->in_sync) + rdev = conf->mirrors[disk].rdev; + + if (!rdev || + !rdev->in_sync || + test_bit(WriteMostly, &rdev->flags)) continue; if (!atomic_read(&rdev->nr_pending)) { new_disk = disk; - new_rdev = rdev; break; } new_distance = abs(this_sector - conf->mirrors[disk].head_position); if (new_distance < current_distance) { current_distance = new_distance; new_disk = disk; - new_rdev = rdev; } } while (disk != conf->last_used); -rb_out: + rb_out: if (new_disk >= 0) { - conf->next_seq_sect = this_sector + sectors; - conf->last_used = new_disk; - atomic_inc(&new_rdev->nr_pending); - if (!new_rdev->in_sync) { + rdev = conf->mirrors[new_disk].rdev; + if (!rdev) + goto retry; + atomic_inc(&rdev->nr_pending); + if (!rdev->in_sync) { /* cannot risk returning a device that failed * before we inc'ed nr_pending */ - atomic_dec(&new_rdev->nr_pending); + atomic_dec(&rdev->nr_pending); goto retry; } + conf->next_seq_sect = this_sector + sectors; + conf->last_used = new_disk; } rcu_read_unlock(); @@ -1109,6 +1129,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i sector_t max_sector, nr_sectors; int disk; int i; + int wonly; int write_targets = 0; int sync_blocks; int still_degraded = 0; @@ -1164,14 +1185,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i */ disk = conf->last_used; /* make sure disk is operational */ - + wonly = disk; while (conf->mirrors[disk].rdev == NULL || - !conf->mirrors[disk].rdev->in_sync) { + !conf->mirrors[disk].rdev->in_sync || + test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags) + ) { + if (conf->mirrors[disk].rdev && + conf->mirrors[disk].rdev->in_sync) + wonly = disk; if (disk <= 0) disk = conf->raid_disks; disk--; - if (disk == conf->last_used) + if (disk == conf->last_used) { + disk = wonly; break; + } } conf->last_used = disk; atomic_inc(&conf->mirrors[disk].rdev->nr_pending); diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 817062bf7352..7ef78e15ce04 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -181,6 +181,9 @@ struct mdk_rdev_s int
faulty; /* if faulty do not issue IO requests */ int in_sync; /* device is a full member of the array */ + unsigned long flags; /* Should include faulty and in_sync here. */ +#define WriteMostly 4 /* Avoid reading if at all possible */ + int desc_nr; /* descriptor index in the superblock */ int raid_disk; /* role of device in array */ int saved_raid_disk; /* role that device used to have in the diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index dc65cd435494..4f047f84fb1f 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -79,6 +79,11 @@ #define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ #define MD_DISK_REMOVED 3 /* disk has been removed from the raid set */ +#define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" in RAID1 config. * read requests will only be sent here in * dire need */ + typedef struct mdp_device_descriptor_s { __u32 number; /* 0 Device number in the entire set */ __u32 major; /* 1 Device major number */ @@ -193,7 +198,7 @@ struct mdp_superblock_1 { __u64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/ __u32 level; /* -4 (multipath), -1 (linear), 0,1,4,5 */ - __u32 layout; /* only for raid5 currently */ + __u32 layout; /* only for raid5 and raid10 currently */ __u64 size; /* used size of component devices, in 512byte sectors */ __u32 chunksize; /* in 512byte sectors */ @@ -212,7 +217,9 @@ struct mdp_superblock_1 { __u32 dev_number; /* permanent identifier of this device - not role in raid */ __u32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ __u8 device_uuid[16]; /* user-space settable, ignored by kernel */ - __u8 pad2[64-56]; /* set to 0 when writing */ + __u8 devflags; /* per-device flags. Only one defined...*/ +#define WriteMostly1 1 /* mask for writemostly flag in above */ + __u8 pad2[64-57]; /* set to 0 when writing */ /* array state information - 64 bytes */ __u64 utime; /* 40 bits seconds, 24 bits microseconds */ -- cgit v1.2.3 From 4b6d287f627b5fb6a49f78f9e81649ff98c62bb7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:47 -0700 Subject: [PATCH] md: add write-behind support for md/raid1 If a device is flagged 'WriteMostly' and the array has a bitmap, and the bitmap superblock indicates that write_behind is allowed, then write_behind is enabled for WriteMostly devices. Write requests will be acknowledged as complete to the caller (via bi_end_io) when all non-WriteMostly devices have completed the write, but will not be cleared from the bitmap until all devices complete. This requires memory allocation to make a local copy of the data being written. If there is insufficient memory, then we fall back on normal write semantics.
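The completion rule deserves a concrete restatement. The patch keeps two per-request counters: 'remaining' (writes still outstanding on any device) and 'behind_remaining' (writes still outstanding on WriteMostly devices), and the master bio may be acknowledged as soon as every write still in flight is a write-behind write. A simplified single-threaded model of that bookkeeping, using invented names and plain ints where the kernel uses atomics, and ignoring the uptodate check:

    #include <stdio.h>

    struct r1_request {
        int remaining;        /* writes outstanding on all devices */
        int behind_remaining; /* writes outstanding on WriteMostly devices */
        int returned;         /* master bio already acknowledged? */
    };

    /* Called as each per-device write completes. */
    static void end_write(struct r1_request *r, int write_mostly)
    {
        if (write_mostly)
            r->behind_remaining--;
        /* 'remaining' still counts the write being ended here, hence
         * the -1: if every other outstanding write is a behind write,
         * the caller can be acknowledged now. */
        if (!r->returned && r->behind_remaining >= r->remaining - 1) {
            r->returned = 1;
            printf("master bio acknowledged\n");
        }
        r->remaining--;
    }

    int main(void)
    {
        /* two ordinary mirrors plus one write-mostly mirror */
        struct r1_request r = { 3, 1, 0 };
        end_write(&r, 0); /* one ordinary write still out: no ack yet */
        end_write(&r, 0); /* last ordinary write done: ack fires here */
        end_write(&r, 1); /* behind write trickles in afterwards */
        return 0;
    }

The bitmap bits for the request are still only cleared when 'remaining' reaches zero, which is what keeps write-behind data recoverable after a crash.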
Signed-Off-By: Paul Clements Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/bitmap.c | 26 ++++++++-- drivers/md/raid1.c | 124 +++++++++++++++++++++++++++++++++++++++++--- include/linux/raid/bitmap.h | 15 ++++-- include/linux/raid/md_k.h | 3 ++ include/linux/raid/raid1.h | 13 +++++ 5 files changed, 165 insertions(+), 16 deletions(-) (limited to 'include/linux/raid') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 2925219f0881..2c84de2b4ad5 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -437,6 +437,7 @@ void bitmap_print_sb(struct bitmap *bitmap) printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); printk(KERN_DEBUG " sync size: %llu KB\n", (unsigned long long)le64_to_cpu(sb->sync_size)/2); + printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); kunmap(bitmap->sb_page); } @@ -445,7 +446,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) { char *reason = NULL; bitmap_super_t *sb; - unsigned long chunksize, daemon_sleep; + unsigned long chunksize, daemon_sleep, write_behind; unsigned long bytes_read; unsigned long long events; int err = -EINVAL; @@ -474,6 +475,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) chunksize = le32_to_cpu(sb->chunksize); daemon_sleep = le32_to_cpu(sb->daemon_sleep); + write_behind = le32_to_cpu(sb->write_behind); /* verify that the bitmap-specific fields are valid */ if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) @@ -485,7 +487,9 @@ static int bitmap_read_sb(struct bitmap *bitmap) else if ((1 << ffz(~chunksize)) != chunksize) reason = "bitmap chunksize not a power of 2"; else if (daemon_sleep < 1 || daemon_sleep > 15) - reason = "daemon sleep period out of range"; + reason = "daemon sleep period out of range (1-15s)"; + else if (write_behind > COUNTER_MAX) + reason = "write-behind limit out of range (0 - 16383)"; if (reason) { printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n", bmname(bitmap), reason); @@ -518,6 +522,7 @@ success: /* assign fields using values from superblock */ bitmap->chunksize = chunksize; bitmap->daemon_sleep = daemon_sleep; + bitmap->max_write_behind = write_behind; bitmap->flags |= sb->state; bitmap->events_cleared = le64_to_cpu(sb->events_cleared); if (sb->state & BITMAP_STALE) @@ -1282,9 +1287,16 @@ static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, } } -int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors) +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) { if (!bitmap) return 0; + + if (behind) { + atomic_inc(&bitmap->behind_writes); + PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n", + atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); + } + while (sectors) { int blocks; bitmap_counter_t *bmc; @@ -1319,9 +1331,15 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect } void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, - int success) + int success, int behind) { if (!bitmap) return; + if (behind) { + atomic_dec(&bitmap->behind_writes); + PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", + atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); + } + while (sectors) { int blocks; unsigned long flags; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 28839a8193f2..ba7f5f256161 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -222,8 +222,17 @@ static void raid_end_bio_io(r1bio_t *r1_bio) { struct bio *bio = 
r1_bio->master_bio; - bio_endio(bio, bio->bi_size, - test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); + /* if nobody has done the final endio yet, do it now */ + if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { + PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n", + (bio_data_dir(bio) == WRITE) ? "write" : "read", + (unsigned long long) bio->bi_sector, + (unsigned long long) bio->bi_sector + + (bio->bi_size >> 9) - 1); + + bio_endio(bio, bio->bi_size, + test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); + } free_r1bio(r1_bio); } @@ -292,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); - int mirror; + int mirror, behind; conf_t *conf = mddev_to_conf(r1_bio->mddev); if (bio->bi_size) @@ -323,16 +332,46 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int update_head_pos(mirror, r1_bio); + behind = test_bit(R1BIO_BehindIO, &r1_bio->state); + if (behind) { + if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) + atomic_dec(&r1_bio->behind_remaining); + + /* In behind mode, we ACK the master bio once the I/O has safely + * reached all non-writemostly disks. Setting the Returned bit + * ensures that this gets done only once -- we don't ever want to + * return -EIO here, instead we'll wait */ + + if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && + test_bit(R1BIO_Uptodate, &r1_bio->state)) { + /* Maybe we can return now */ + if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { + struct bio *mbio = r1_bio->master_bio; + PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", + (unsigned long long) mbio->bi_sector, + (unsigned long long) mbio->bi_sector + + (mbio->bi_size >> 9) - 1); + bio_endio(mbio, mbio->bi_size, 0); + } + } + } /* * * Let's see if all mirrored write operations have finished * already. 
*/ if (atomic_dec_and_test(&r1_bio->remaining)) { + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + /* free extra copy of the data pages */ + int i = bio->bi_vcnt; + while (i--) + __free_page(bio->bi_io_vec[i].bv_page); + } /* clear the bitmap if all writes complete successfully */ bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state)); + !test_bit(R1BIO_Degraded, &r1_bio->state), + behind); md_write_end(r1_bio->mddev); raid_end_bio_io(r1_bio); } @@ -562,6 +601,39 @@ static void device_barrier(conf_t *conf, sector_t sect) spin_unlock_irq(&conf->resync_lock); } +/* duplicate the data pages for behind I/O */ +static struct page **alloc_behind_pages(struct bio *bio) +{ + int i; + struct bio_vec *bvec; + struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *), + GFP_NOIO); + if (unlikely(!pages)) + goto do_sync_io; + + memset(pages, 0, bio->bi_vcnt * sizeof(struct page *)); + + bio_for_each_segment(bvec, bio, i) { + pages[i] = alloc_page(GFP_NOIO); + if (unlikely(!pages[i])) + goto do_sync_io; + memcpy(kmap(pages[i]) + bvec->bv_offset, + kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); + kunmap(pages[i]); + kunmap(bvec->bv_page); + } + + return pages; + +do_sync_io: + if (pages) + for (i = 0; i < bio->bi_vcnt && pages[i]; i++) + __free_page(pages[i]); + kfree(pages); + PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); + return NULL; +} + static int make_request(request_queue_t *q, struct bio * bio) { mddev_t *mddev = q->queuedata; @@ -574,6 +646,7 @@ static int make_request(request_queue_t *q, struct bio * bio) struct bitmap *bitmap = mddev->bitmap; unsigned long flags; struct bio_list bl; + struct page **behind_pages = NULL; if (unlikely(bio_barrier(bio))) { bio_endio(bio, bio->bi_size, -EOPNOTSUPP); @@ -613,8 +686,6 @@ static int make_request(request_queue_t *q, struct bio * bio) r1_bio->mddev = mddev; r1_bio->sector = bio->bi_sector; - r1_bio->state = 0; - if (bio_data_dir(bio) == READ) { /* * read balancing logic: @@ -675,13 +746,22 @@ static int make_request(request_queue_t *q, struct bio * bio) } rcu_read_unlock(); + BUG_ON(targets == 0); /* we never fail the last device */ + if (targets < conf->raid_disks) { /* array is degraded, we will not clear the bitmap * on I/O completion (see raid1_end_write_request) */ set_bit(R1BIO_Degraded, &r1_bio->state); } + /* do behind I/O ? */ + if (bitmap && + atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && + (behind_pages = alloc_behind_pages(bio)) != NULL) + set_bit(R1BIO_BehindIO, &r1_bio->state); + atomic_set(&r1_bio->remaining, 0); + atomic_set(&r1_bio->behind_remaining, 0); bio_list_init(&bl); for (i = 0; i < disks; i++) { @@ -698,12 +778,31 @@ static int make_request(request_queue_t *q, struct bio * bio) mbio->bi_rw = WRITE; mbio->bi_private = r1_bio; + if (behind_pages) { + struct bio_vec *bvec; + int j; + + /* Yes, I really want the '__' version so that + * we clear any unused pointer in the io_vec, rather + * than leave them unchanged. 
This is important + * because when we come to free the pages, we won't + * know the original bi_idx, so we just free + * them all + */ + __bio_for_each_segment(bvec, mbio, j, 0) + bvec->bv_page = behind_pages[j]; + if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) + atomic_inc(&r1_bio->behind_remaining); + } + atomic_inc(&r1_bio->remaining); bio_list_add(&bl, mbio); } + kfree(behind_pages); /* the behind pages are attached to the bios now */ - bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors); + bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, + test_bit(R1BIO_BehindIO, &r1_bio->state)); spin_lock_irqsave(&conf->device_lock, flags); bio_list_merge(&conf->pending_bio_list, &bl); bio_list_init(&bl); @@ -1471,6 +1570,17 @@ out: static int stop(mddev_t *mddev) { conf_t *conf = mddev_to_conf(mddev); + struct bitmap *bitmap = mddev->bitmap; + int behind_wait = 0; + + /* wait for behind writes to complete */ + while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { + behind_wait++; + printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ); /* wait a second */ + /* need to kick something here to make sure I/O goes? */ + } md_unregister_thread(mddev->thread); mddev->thread = NULL; diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h index 4bf1659f8aa8..9de99198caf1 100644 --- a/include/linux/raid/bitmap.h +++ b/include/linux/raid/bitmap.h @@ -7,7 +7,7 @@ #define BITMAP_H 1 #define BITMAP_MAJOR 3 -#define BITMAP_MINOR 38 +#define BITMAP_MINOR 39 /* * in-memory bitmap: @@ -147,8 +147,9 @@ typedef struct bitmap_super_s { __u32 state; /* 48 bitmap state information */ __u32 chunksize; /* 52 the bitmap chunk size in bytes */ __u32 daemon_sleep; /* 56 seconds between disk flushes */ + __u32 write_behind; /* 60 number of outstanding write-behind writes */ - __u8 pad[256 - 60]; /* set to zero */ + __u8 pad[256 - 64]; /* set to zero */ } bitmap_super_t; /* notes: @@ -226,6 +227,9 @@ struct bitmap { unsigned long flags; + unsigned long max_write_behind; /* write-behind mode */ + atomic_t behind_writes; + /* * the bitmap daemon - periodically wakes up and sweeps the bitmap * file, cleaning up bits and flushing out pages to disk as necessary @@ -260,9 +264,10 @@ int bitmap_setallbits(struct bitmap *bitmap); void bitmap_write_all(struct bitmap *bitmap); /* these are exported */ -int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors); -void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, - int success); +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int behind); +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int success, int behind); int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded); void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); void bitmap_close_sync(struct bitmap *bitmap); diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 7ef78e15ce04..2514e5fcda7f 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -275,6 +275,9 @@ struct mddev_s atomic_t writes_pending; request_queue_t *queue; /* for plugging ...
*/ + atomic_t write_behind; /* outstanding async IO */ + unsigned int max_write_behind; /* 0 = sync */ + struct bitmap *bitmap; /* the bitmap for the device */ struct file *bitmap_file; /* the bitmap file */ long bitmap_offset; /* offset from superblock of diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h index 9d93cf12e890..60e19b667548 100644 --- a/include/linux/raid/raid1.h +++ b/include/linux/raid/raid1.h @@ -80,6 +80,9 @@ struct r1bio_s { atomic_t remaining; /* 'have we finished' count, * used from IRQ handlers */ + atomic_t behind_remaining; /* number of write-behind ios remaining * in this BehindIO request */ sector_t sector; int sectors; unsigned long state; @@ -107,4 +110,14 @@ struct r1bio_s { #define R1BIO_Uptodate 0 #define R1BIO_IsSync 1 #define R1BIO_Degraded 2 +#define R1BIO_BehindIO 3 +/* For write-behind requests, we call bi_end_io when + * the last non-write-behind device completes, provided + * any write was successful. Otherwise we call when + * any write-behind write succeeds, otherwise we call + * with failure when last write completes (and all failed). + * Record that bi_end_io was called with this flag... + */ +#define R1BIO_Returned 4 + #endif -- cgit v1.2.3 From 15945fee6f09bff1f86b1a735b5888dc59cf38e3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:47 -0700 Subject: [PATCH] md: support md/linear array with components greater than 2 terabytes. linear currently uses division by the size of the smallest component device to find which device a request goes to. If that smallest device is larger than 2 terabytes, then the division will not work on some systems. So we introduce a pre-shift, and take care not to make the hash table too large, much like the code in raid0. Also get rid of conf->nr_zones, which is not needed. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/linear.c | 95 +++++++++++++++++++++++++++++++-------------- include/linux/raid/linear.h | 4 +- 2 files changed, 68 insertions(+), 31 deletions(-) (limited to 'include/linux/raid') diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 4991ba543368..bb279fad2fd2 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -38,7 +38,8 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) /* * sector_div(a,b) returns the remainder and sets a to a/b */ - (void)sector_div(block, conf->smallest->size); + block >>= conf->preshift; + (void)sector_div(block, conf->hash_spacing); hash = conf->hash_table[block]; while ((sector>>1) >= (hash->size + hash->offset)) @@ -47,7 +48,7 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) } /** - * linear_mergeable_bvec -- tell bio layer if a two requests can be merged + * linear_mergeable_bvec -- tell bio layer if two requests can be merged * @q: request queue * @bio: the buffer head that's been built up so far * @biovec: the request that could be merged to it. @@ -116,7 +117,7 @@ static int linear_run (mddev_t *mddev) dev_info_t **table; mdk_rdev_t *rdev; int i, nb_zone, cnt; - sector_t start; + sector_t min_spacing; sector_t curr_offset; struct list_head *tmp; @@ -127,11 +128,6 @@ static int linear_run (mddev_t *mddev) memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t)); mddev->private = conf; - /* - * Find the smallest device.
*/ - - conf->smallest = NULL; cnt = 0; mddev->array_size = 0; @@ -159,8 +155,6 @@ static int linear_run (mddev_t *mddev) disk->size = rdev->size; mddev->array_size += rdev->size; - if (!conf->smallest || (disk->size < conf->smallest->size)) - conf->smallest = disk; cnt++; } if (cnt != mddev->raid_disks) { @@ -168,6 +162,36 @@ static int linear_run (mddev_t *mddev) goto out; } + min_spacing = mddev->array_size; + sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *)); + + /* min_spacing is the minimum spacing that will fit the hash + * table in one PAGE. This may be much smaller than needed. + * We find the smallest non-terminal set of consecutive devices + * that is larger than min_spacing and use the size of that as + * the actual spacing + */ + conf->hash_spacing = mddev->array_size; + for (i=0; i < cnt-1 ; i++) { + sector_t sz = 0; + int j; + for (j=i; j<cnt-1 && sz < min_spacing ; j++) + sz += conf->disks[j].size; + if (sz >= min_spacing && sz < conf->hash_spacing) + conf->hash_spacing = sz; + } + + /* hash_spacing may be too large for sector_div to work with, + * so we might need to pre-shift + */ + conf->preshift = 0; + if (sizeof(sector_t) > sizeof(u32)) { + sector_t space = conf->hash_spacing; + while (space > (sector_t)(~(u32)0)) { + space >>= 1; + conf->preshift++; + } + } /* * This code was restructured to work around a gcc-2.95.3 internal * compiler error. Alter it with care. @@ -177,39 +201,52 @@ static int linear_run (mddev_t *mddev) unsigned round; unsigned long base; - sz = mddev->array_size; - base = conf->smallest->size; + sz = mddev->array_size >> conf->preshift; + sz += 1; /* force round-up */ + base = conf->hash_spacing >> conf->preshift; round = sector_div(sz, base); - nb_zone = conf->nr_zones = sz + (round ? 1 : 0); + nb_zone = sz + (round ? 1 : 0); } - - conf->hash_table = kmalloc (sizeof (dev_info_t*) * nb_zone, + BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *)); + + conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone, GFP_KERNEL); if (!conf->hash_table) goto out; /* * Here we generate the linear hash table + * First calculate the device offsets. */ + conf->disks[0].offset = 0; + for (i=1; i<mddev->raid_disks; i++) + conf->disks[i].offset = + conf->disks[i-1].offset + + conf->disks[i-1].size; + table = conf->hash_table; - start = 0; curr_offset = 0; - for (i = 0; i < cnt; i++) { - dev_info_t *disk = conf->disks + i; + i = 0; + for (curr_offset = 0; + curr_offset < mddev->array_size; + curr_offset += conf->hash_spacing) { - disk->offset = curr_offset; - curr_offset += disk->size; + while (i < mddev->raid_disks-1 && + curr_offset >= conf->disks[i+1].offset) + i++; - /* 'curr_offset' is the end of this disk - * 'start' is the start of table + *table ++ = conf->disks + i; + } + + if (conf->preshift) { + conf->hash_spacing >>= conf->preshift; + /* round hash_spacing up so that when we divide by it, + * we err on the side of "too-low", which is safest.
*/ - while (start < curr_offset) { - *table++ = disk; - start += conf->smallest->size; - } + conf->hash_spacing++; } - if (table-conf->hash_table != nb_zone) - BUG(); + + BUG_ON(table - conf->hash_table > nb_zone); blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->unplug_fn = linear_unplug; @@ -299,7 +336,7 @@ static void linear_status (struct seq_file *seq, mddev_t *mddev) sector_t s = 0; seq_printf(seq, " "); - for (j = 0; j < conf->nr_zones; j++) + for (j = 0; j < mddev->raid_disks; j++) { char b[BDEVNAME_SIZE]; s += conf->smallest_size; diff --git a/include/linux/raid/linear.h b/include/linux/raid/linear.h index e04c4fe45b53..7eaf290e10e7 100644 --- a/include/linux/raid/linear.h +++ b/include/linux/raid/linear.h @@ -14,8 +14,8 @@ typedef struct dev_info dev_info_t; struct linear_private_data { dev_info_t **hash_table; - dev_info_t *smallest; - int nr_zones; + sector_t hash_spacing; + int preshift; /* shift before dividing by hash_spacing */ dev_info_t disks[0]; }; -- cgit v1.2.3 From 71c0805cb48462c99fbe0e5fcc6c12d7b9929c09 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:51 -0700 Subject: [PATCH] md: allow md to load a superblock with feature-bit '1' set As this is used to flag an internal bitmap. Also, introduce symbolic names for feature bits. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 6 +++--- include/linux/raid/md_p.h | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux/raid') diff --git a/drivers/md/md.c b/drivers/md/md.c index 866c704e008a..1be3f2de396b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -875,7 +875,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) sb->major_version != cpu_to_le32(1) || le32_to_cpu(sb->max_dev) > (4096-256)/2 || le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || - sb->feature_map != 0) + (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) return -EINVAL; if (calc_sb_1_csum(sb) != sb->sb_csum) { @@ -954,7 +954,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->max_disks = (4096-256)/2; - if ((le32_to_cpu(sb->feature_map) & 1) && + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && mddev->bitmap_file == NULL ) { if (mddev->level != 1) { printk(KERN_WARNING "md: bitmaps only supported for raid1\n"); @@ -1029,7 +1029,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) if (mddev->bitmap && mddev->bitmap_file == NULL) { sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); - sb->feature_map = cpu_to_le32(1); + sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } max_dev = 0; diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 4f047f84fb1f..c100fa5d4bfa 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -238,5 +238,10 @@ struct mdp_superblock_1 { __u16 dev_roles[0]; /* role in array, or 0xffff for a spare, or 0xfffe for faulty */ }; +/* feature_map bits */ +#define MD_FEATURE_BITMAP_OFFSET 1 + +#define MD_FEATURE_ALL 1 + #endif -- cgit v1.2.3 From 773f7834425e83144c95fbbc553ced3c2b74b828 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:53 -0700 Subject: [PATCH] md: remove old cruft from md_k.h header file These inlines haven't been used for ages, they should go. 
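Each of these inlines was a one-line wrapper around a bit test or bit update on mdp_disk_t.state, so any caller that somehow survived would simply open-code the operation. Purely as an illustration of the equivalence (standalone userspace sketch, not kernel code):

    #include <stdio.h>

    #define MD_DISK_FAULTY  0
    #define MD_DISK_ACTIVE  1
    #define MD_DISK_SYNC    2
    #define MD_DISK_REMOVED 3

    struct mdp_disk { unsigned int state; };

    int main(void)
    {
        struct mdp_disk d = { 0 };

        d.state |= (1 << MD_DISK_ACTIVE);   /* was mark_disk_active(&d)  */
        d.state &= ~(1 << MD_DISK_SYNC);    /* was mark_disk_nonsync(&d) */
        printf("active: %d\n",
               !!(d.state & (1 << MD_DISK_ACTIVE))); /* was disk_active(&d) */
        return 0;
    }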
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 64 ----------------------------------------------- 1 file changed, 64 deletions(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 2514e5fcda7f..8042f55dd323 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -85,70 +85,6 @@ typedef struct mdk_rdev_s mdk_rdev_t; #define MAX_CHUNK_SIZE (4096*1024) -/* - * default readahead - */ - -static inline int disk_faulty(mdp_disk_t * d) -{ - return d->state & (1 << MD_DISK_FAULTY); -} - -static inline int disk_active(mdp_disk_t * d) -{ - return d->state & (1 << MD_DISK_ACTIVE); -} - -static inline int disk_sync(mdp_disk_t * d) -{ - return d->state & (1 << MD_DISK_SYNC); -} - -static inline int disk_spare(mdp_disk_t * d) -{ - return !disk_sync(d) && !disk_active(d) && !disk_faulty(d); -} - -static inline int disk_removed(mdp_disk_t * d) -{ - return d->state & (1 << MD_DISK_REMOVED); -} - -static inline void mark_disk_faulty(mdp_disk_t * d) -{ - d->state |= (1 << MD_DISK_FAULTY); -} - -static inline void mark_disk_active(mdp_disk_t * d) -{ - d->state |= (1 << MD_DISK_ACTIVE); -} - -static inline void mark_disk_sync(mdp_disk_t * d) -{ - d->state |= (1 << MD_DISK_SYNC); -} - -static inline void mark_disk_spare(mdp_disk_t * d) -{ - d->state = 0; -} - -static inline void mark_disk_removed(mdp_disk_t * d) -{ - d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED); -} - -static inline void mark_disk_inactive(mdp_disk_t * d) -{ - d->state &= ~(1 << MD_DISK_ACTIVE); -} - -static inline void mark_disk_nonsync(mdp_disk_t * d) -{ - d->state &= ~(1 << MD_DISK_SYNC); -} - /* * MD's 'extended' device */ -- cgit v1.2.3 From 0002b2718dd04da67c21f8a7830de8d95a9b0345 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:53 -0700 Subject: [PATCH] md: limit size of sb read/written to appropriate amount version-1 superblocks are not (normally) 4K long, and can be of variable size. Writing the full 4K can cause corruption (but only in non-default configurations). With this patch the super-block-flavour can choose a size to read, and set a size to write based on what it finds. 
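One detail worth spelling out: a version-1 superblock occupies 256 bytes plus two bytes per device role (max_dev entries), and the write size must then be padded out to the device's block size. For a power-of-two block size B with mask bmask = B-1, the expression (n | bmask) + 1 rounds n up to the next multiple of B, which is exactly what super_1_load does below. A quick check of the arithmetic (standalone sketch, not kernel code):

    #include <stdio.h>

    /* Round n up to the next multiple of a power-of-two blocksize. */
    static unsigned int round_up_block(unsigned int n, unsigned int blocksize)
    {
        unsigned int bmask = blocksize - 1;
        if (n & bmask)
            n = (n | bmask) + 1;
        return n;
    }

    int main(void)
    {
        /* 128 roles: 256 + 128*2 = 512 bytes, already block-aligned */
        printf("%u\n", round_up_block(256 + 128 * 2, 512));  /* 512  */
        /* 384 roles on a 4096-byte-block device: pad 1024 up to 4096 */
        printf("%u\n", round_up_block(256 + 384 * 2, 4096)); /* 4096 */
        return 0;
    }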
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 20 +++++++++++++++----- include/linux/raid/md_k.h | 1 + 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include/linux/raid') diff --git a/drivers/md/md.c b/drivers/md/md.c index 1be3f2de396b..be7873c61b3c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -393,7 +393,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size, return ret; } -static int read_disk_sb(mdk_rdev_t * rdev) +static int read_disk_sb(mdk_rdev_t * rdev, int size) { char b[BDEVNAME_SIZE]; if (!rdev->sb_page) { @@ -404,7 +404,7 @@ static int read_disk_sb(mdk_rdev_t * rdev) return 0; - if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) + if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) goto fail; rdev->sb_loaded = 1; return 0; @@ -531,7 +531,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version sb_offset = calc_dev_sboffset(rdev->bdev); rdev->sb_offset = sb_offset; - ret = read_disk_sb(rdev); + ret = read_disk_sb(rdev, MD_SB_BYTES); if (ret) return ret; ret = -EINVAL; @@ -564,6 +564,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version rdev->preferred_minor = sb->md_minor; rdev->data_offset = 0; + rdev->sb_size = MD_SB_BYTES; if (sb->level == LEVEL_MULTIPATH) rdev->desc_nr = -1; @@ -837,6 +838,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) int ret; sector_t sb_offset; char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; + int bmask; /* * Calculate the position of the superblock. @@ -865,7 +867,10 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) } rdev->sb_offset = sb_offset; - ret = read_disk_sb(rdev); + /* superblock is rarely larger than 1K, but it can be larger, + * and it is safe to read 4k, so we do that + */ + ret = read_disk_sb(rdev, 4096); if (ret) return ret; @@ -891,6 +896,11 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); + rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; + bmask = block_size(rdev->bdev)-1; + if (rdev->sb_size & bmask) + rdev-> sb_size = (rdev->sb_size | bmask)+1; + if (refdev == 0) return 1; else { @@ -1375,7 +1385,7 @@ repeat: dprintk("%s ", bdevname(rdev->bdev,b)); if (!rdev->faulty) { md_super_write(mddev,rdev, - rdev->sb_offset<<1, MD_SB_BYTES, + rdev->sb_offset<<1, rdev->sb_size, rdev->sb_page); dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", bdevname(rdev->bdev,b), diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 8042f55dd323..ebce949b1443 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -102,6 +102,7 @@ struct mdk_rdev_s int sb_loaded; sector_t data_offset; /* start of data in array */ sector_t sb_offset; + int sb_size; /* bytes in the superblock */ int preferred_minor; /* autorun support */ /* A device can be in one of three states based on two flags: -- cgit v1.2.3 From 72626685dc66d455742a7f215a0535c551628b9e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:54 -0700 Subject: [PATCH] md: add write-intent-bitmap support to raid5 Most awkward part of this is delaying write requests until bitmap updates have been flushed. To achieve this, we have a sequence number (seq_flush) which is incremented each time the raid5 is unplugged. 
If the raid thread notices that this has changed, it flushes bitmap changes, and assigns the value of seq_flush to seq_write. When a write request arrives, it is given the number from seq_write, and that write request may not complete until seq_flush is larger than the saved seq number. We have a new queue for storing stripes which are waiting for a bitmap flush and an extra flag for stripes to record if the write was 'degraded' and so should not clear the bit in the bitmap. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 3 +- drivers/md/raid5.c | 133 +++++++++++++++++++++++++++++++++++++++++---- include/linux/raid/raid5.h | 14 ++++- 3 files changed, 137 insertions(+), 13 deletions(-) (limited to 'include/linux/raid') diff --git a/drivers/md/md.c b/drivers/md/md.c index be7873c61b3c..dbf540a7fccc 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -645,7 +645,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && mddev->bitmap_file == NULL) { - if (mddev->level != 1) { + if (mddev->level != 1 && mddev->level != 5) { /* FIXME use a better test */ printk(KERN_WARNING "md: bitmaps only supported for raid1\n"); return -EINVAL; @@ -3517,7 +3517,6 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) */ void md_write_start(mddev_t *mddev, struct bio *bi) { - DEFINE_WAIT(w); if (bio_data_dir(bi) != WRITE) return; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ed859e08d600..4683ca24c046 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -24,6 +24,8 @@ #include <linux/bitops.h> #include <asm/atomic.h> +#include <linux/raid/bitmap.h> + /* * Stripe cache */ @@ -79,8 +81,13 @@ static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) if (test_bit(STRIPE_HANDLE, &sh->state)) { if (test_bit(STRIPE_DELAYED, &sh->state)) list_add_tail(&sh->lru, &conf->delayed_list); - else + else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + conf->seq_write == sh->bm_seq) + list_add_tail(&sh->lru, &conf->bitmap_list); + else { + clear_bit(STRIPE_BIT_DELAY, &sh->state); list_add_tail(&sh->lru, &conf->handle_list); + } md_wakeup_thread(conf->mddev->thread); } else { if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { @@ -244,6 +251,9 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector spin_lock_irq(&conf->device_lock); do { + wait_event_lock_irq(conf->wait_for_stripe, + conf->quiesce == 0, + conf->device_lock, /* nothing */); sh = __find_stripe(conf, sector); if (!sh) { if (!conf->inactive_blocked) @@ -803,6 +813,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in { struct bio **bip; raid5_conf_t *conf = sh->raid_conf; + int firstwrite=0; PRINTK("adding bh b#%llu to stripe s#%llu\n", (unsigned long long)bi->bi_sector, (unsigned long long)sh->sector); spin_lock(&sh->lock); spin_lock_irq(&conf->device_lock); - if (forwrite) + if (forwrite) { bip = &sh->dev[dd_idx].towrite; + if (*bip == NULL && sh->dev[dd_idx].written == NULL) + firstwrite = 1; + } else bip = &sh->dev[dd_idx].toread; while (*bip && (*bip)->bi_sector < bi->bi_sector) { if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) @@ -836,6 +849,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in (unsigned long long)bi->bi_sector, (unsigned long long)sh->sector, dd_idx); + if (conf->mddev->bitmap && firstwrite) { + sh->bm_seq = conf->seq_write; + bitmap_startwrite(conf->mddev->bitmap, sh->sector,
+ STRIPE_SECTORS, 0); + set_bit(STRIPE_BIT_DELAY, &sh->state); + } + if (forwrite) { /* check if page is covered */ sector_t sector = sh->dev[dd_idx].sector; @@ -958,12 +978,13 @@ static void handle_stripe(struct stripe_head *sh) * need to be failed */ if (failed > 1 && to_read+to_write+written) { - spin_lock_irq(&conf->device_lock); for (i=disks; i--; ) { + int bitmap_end = 0; + spin_lock_irq(&conf->device_lock); /* fail all writes first */ bi = sh->dev[i].towrite; sh->dev[i].towrite = NULL; - if (bi) to_write--; + if (bi) { to_write--; bitmap_end = 1; } if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); @@ -981,6 +1002,7 @@ static void handle_stripe(struct stripe_head *sh) /* and fail all 'written' */ bi = sh->dev[i].written; sh->dev[i].written = NULL; + if (bi) bitmap_end = 1; while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); @@ -1009,8 +1031,11 @@ static void handle_stripe(struct stripe_head *sh) bi = nextbi; } } + spin_unlock_irq(&conf->device_lock); + if (bitmap_end) + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0, 0); } - spin_unlock_irq(&conf->device_lock); } if (failed > 1 && syncing) { md_done_sync(conf->mddev, STRIPE_SECTORS,0); @@ -1038,6 +1063,7 @@ static void handle_stripe(struct stripe_head *sh) test_bit(R5_UPTODATE, &dev->flags) ) { /* We can return any write requests */ struct bio *wbi, *wbi2; + int bitmap_end = 0; PRINTK("Return write for disc %d\n", i); spin_lock_irq(&conf->device_lock); wbi = dev->written; @@ -1051,7 +1077,13 @@ static void handle_stripe(struct stripe_head *sh) } wbi = wbi2; } + if (dev->towrite == NULL) + bitmap_end = 1; spin_unlock_irq(&conf->device_lock); + if (bitmap_end) + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, + !test_bit(STRIPE_DEGRADED, &sh->state), 0); } } } @@ -1175,7 +1207,8 @@ static void handle_stripe(struct stripe_head *sh) } } /* now if nothing is locked, and if we have enough data, we can start a write request */ - if (locked == 0 && (rcw == 0 ||rmw == 0)) { + if (locked == 0 && (rcw == 0 ||rmw == 0) && + !test_bit(STRIPE_BIT_DELAY, &sh->state)) { PRINTK("Computing parity...\n"); compute_parity(sh, rcw==0 ? 
RECONSTRUCT_WRITE : READ_MODIFY_WRITE); /* now every locked buffer is ready to be written */ @@ -1231,6 +1264,7 @@ static void handle_stripe(struct stripe_head *sh) dev = &sh->dev[failed_num]; set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); + clear_bit(STRIPE_DEGRADED, &sh->state); locked++; set_bit(STRIPE_INSYNC, &sh->state); set_bit(R5_Syncio, &dev->flags); @@ -1298,6 +1332,8 @@ static void handle_stripe(struct stripe_head *sh) bi->bi_next = NULL; generic_make_request(bi); } else { + if (rw == 1) + set_bit(STRIPE_DEGRADED, &sh->state); PRINTK("skip op %ld on disc %d for sector %llu\n", bi->bi_rw, i, (unsigned long long)sh->sector); clear_bit(R5_LOCKED, &sh->dev[i].flags); @@ -1322,6 +1358,20 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf) } } +static inline void activate_bit_delay(raid5_conf_t *conf) +{ + /* device_lock is held */ + struct list_head head; + list_add(&head, &conf->bitmap_list); + list_del_init(&conf->bitmap_list); + while (!list_empty(&head)) { + struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); + list_del_init(&sh->lru); + atomic_inc(&sh->count); + __release_stripe(conf, sh); + } +} + static void unplug_slaves(mddev_t *mddev) { raid5_conf_t *conf = mddev_to_conf(mddev); @@ -1354,8 +1404,10 @@ static void raid5_unplug_device(request_queue_t *q) spin_lock_irqsave(&conf->device_lock, flags); - if (blk_remove_plug(q)) + if (blk_remove_plug(q)) { + conf->seq_flush++; raid5_activate_delayed(conf); + } md_wakeup_thread(mddev->thread); spin_unlock_irqrestore(&conf->device_lock, flags); @@ -1493,10 +1545,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i sector_t first_sector; int raid_disks = conf->raid_disks; int data_disks = raid_disks-1; + sector_t max_sector = mddev->size << 1; + int sync_blocks; - if (sector_nr >= mddev->size <<1) { + if (sector_nr >= max_sector) { /* just being told to finish up .. 
nothing much to do */ unplug_slaves(mddev); + + if (mddev->curr_resync < max_sector) /* aborted */ + bitmap_end_sync(mddev->bitmap, mddev->curr_resync, + &sync_blocks, 1); + else /* completed sync */ + conf->fullsync = 0; + bitmap_close_sync(mddev->bitmap); + return 0; } /* if there is 1 or more failed drives and we are trying @@ -1508,6 +1570,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i *skipped = 1; return rv; } + if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { + /* we can skip this block, and probably more */ + sync_blocks /= STRIPE_SECTORS; + *skipped = 1; + return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ + } x = sector_nr; chunk_offset = sector_div(x, sectors_per_chunk); @@ -1525,6 +1594,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(1); } + bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0); spin_lock(&sh->lock); set_bit(STRIPE_SYNCING, &sh->state); clear_bit(STRIPE_INSYNC, &sh->state); @@ -1558,6 +1628,13 @@ static void raid5d (mddev_t *mddev) while (1) { struct list_head *first; + if (conf->seq_flush - conf->seq_write > 0) { + int seq = conf->seq_flush; + bitmap_unplug(mddev->bitmap); + conf->seq_write = seq; + activate_bit_delay(conf); + } + if (list_empty(&conf->handle_list) && atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && !blk_queue_plugged(mddev->queue) && @@ -1591,7 +1668,7 @@ static void raid5d (mddev_t *mddev) PRINTK("--- raid5d inactive\n"); } -static int run (mddev_t *mddev) +static int run(mddev_t *mddev) { raid5_conf_t *conf; int raid_disk, memory; @@ -1621,6 +1698,7 @@ static int run(mddev_t *mddev) init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); INIT_LIST_HEAD(&conf->delayed_list); + INIT_LIST_HEAD(&conf->bitmap_list); INIT_LIST_HEAD(&conf->inactive_list); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); @@ -1732,6 +1810,9 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + /* Ok, everything is just fine now */ + if (mddev->bitmap) + mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; + mddev->queue->unplug_fn = raid5_unplug_device; mddev->queue->issue_flush_fn = raid5_issue_flush; @@ -1912,6 +1993,8 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) rdev->in_sync = 0; rdev->raid_disk = disk; found = 1; + if (rdev->saved_raid_disk != disk) + conf->fullsync = 1; p->rdev = rdev; break; } @@ -1941,6 +2024,35 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) return 0; } +static void raid5_quiesce(mddev_t *mddev, int state) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + + switch(state) { + case 1: /* stop all writes */ + spin_lock_irq(&conf->device_lock); + conf->quiesce = 1; + wait_event_lock_irq(conf->wait_for_stripe, + atomic_read(&conf->active_stripes) == 0, + conf->device_lock, /* nothing */); + spin_unlock_irq(&conf->device_lock); + break; + + case 0: /* re-enable writes */ + spin_lock_irq(&conf->device_lock); + conf->quiesce = 0; + wake_up(&conf->wait_for_stripe); + spin_unlock_irq(&conf->device_lock); + break; + } + if (mddev->thread) { + if (mddev->bitmap) + mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; + else + mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + md_wakeup_thread(mddev->thread); + } +} static mdk_personality_t raid5_personality= { .name = "raid5", @@ -1955,6
+2067,7 @@ static mdk_personality_t raid5_personality= .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, + .quiesce = raid5_quiesce, }; static int __init raid5_init (void) diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index d63ddcb4afad..176fc653c284 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -134,6 +134,7 @@ struct stripe_head { unsigned long state; /* state flags */ atomic_t count; /* nr of active thread/requests */ spinlock_t lock; + int bm_seq; /* sequence number for bitmap flushes */ struct r5dev { struct bio req; struct bio_vec vec; @@ -165,12 +166,13 @@ struct stripe_head { /* * Stripe state */ -#define STRIPE_ERROR 1 #define STRIPE_HANDLE 2 #define STRIPE_SYNCING 3 #define STRIPE_INSYNC 4 #define STRIPE_PREREAD_ACTIVE 5 #define STRIPE_DELAYED 6 +#define STRIPE_DEGRADED 7 +#define STRIPE_BIT_DELAY 8 /* * Plugging: @@ -210,10 +212,20 @@ struct raid5_private_data { struct list_head handle_list; /* stripes needing handling */ struct list_head delayed_list; /* stripes that have plugged requests */ + struct list_head bitmap_list; /* stripes delayed awaiting bitmap update */ atomic_t preread_active_stripes; /* stripes with scheduled io */ char cache_name[20]; kmem_cache_t *slab_cache; /* for allocating stripes */ + + int seq_flush, seq_write; + int quiesce; + + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + /* * Free stripes pool */ -- cgit v1.2.3
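Stepping back, the seq_flush/seq_write pair implements a small ordering protocol: a stripe whose write dirtied the bitmap records the current seq_write in bm_seq; the unplug path increments seq_flush; and raid5d, on seeing seq_flush ahead of seq_write, calls bitmap_unplug() to push the bits to disk, advances seq_write, and only then releases stripes whose recorded generation has been flushed. The data write can therefore never reach the disk before the bitmap bit that covers it. A compressed, single-threaded model of the invariant (the field names match the patch; the helper functions are invented for the demo, and locking and lists are omitted):

    #include <stdio.h>

    struct conf { int seq_flush, seq_write; };

    /* add_stripe_bio: record the bitmap generation this write depends on. */
    static int start_write(struct conf *c) { return c->seq_write; }

    /* raid5_unplug_device: ask for the current bitmap updates to be flushed. */
    static void unplug(struct conf *c) { c->seq_flush++; }

    /* raid5d: flush the bitmap, then dependent stripes may proceed. */
    static void raid5d_step(struct conf *c)
    {
        if (c->seq_flush - c->seq_write > 0) {
            int seq = c->seq_flush;
            /* bitmap_unplug() would run here: the bits reach disk */
            c->seq_write = seq;
        }
    }

    /* __release_stripe keeps a stripe on bitmap_list while this is false. */
    static int may_handle(const struct conf *c, int bm_seq)
    {
        return c->seq_write != bm_seq;
    }

    int main(void)
    {
        struct conf c = { 0, 0 };
        int bm_seq = start_write(&c);            /* depends on gen 0  */
        printf("%d\n", may_handle(&c, bm_seq));  /* 0: must wait      */
        unplug(&c);                              /* seq_flush -> 1    */
        raid5d_step(&c);                         /* flush; write -> 1 */
        printf("%d\n", may_handle(&c, bm_seq));  /* 1: safe to go     */
        return 0;
    }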