From 36fa30636fb84b209210299684e1be66d9e58217 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:45 -0700 Subject: [PATCH] md: allow hot-add and hot-remove of md intent logging bitmaps Both file-bitmaps and superblock bitmaps are supported. If you add a bitmap file on the array device, you lose. This introduces a 'default_bitmap_offset' field in mddev, as the ioctl used for adding a superblock bitmap doesn't have room for giving an offset. Later, this value will be settable via sysfs. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 8c14ba565a45..817062bf7352 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -278,6 +278,10 @@ struct mddev_s * start of bitmap. May be * negative, but not '0' */ + long default_bitmap_offset; /* this is the offset to use when + * hot-adding a bitmap. It should + * eventually be settable by sysfs. + */ struct list_head all_mddevs; }; @@ -314,6 +318,12 @@ struct mdk_personality_s int (*resize) (mddev_t *mddev, sector_t sectors); int (*reshape) (mddev_t *mddev, int raid_disks); int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); + /* quiesce moves between quiescence states * 0 - fully active * 1 - no new requests allowed * others - reserved */ + void (*quiesce) (mddev_t *mddev, int state); }; -- cgit v1.2.3 From 8ddf9efe6708f3674f0ddfeb6425fd27bea109a2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:45 -0700 Subject: [PATCH] md: support write-mostly device in raid1 This allows a device in a raid1 to be marked as "write mostly". Read requests will only be sent if there is no other option.
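The policy is easy to state: prefer any in-sync mirror that is not marked write-mostly, and fall back to a write-mostly mirror only when nothing else is usable. As a rough illustration only (the struct and function names here are invented, not the kernel's), the selection rule reduces to:

    #include <stdio.h>

    struct mirror {
        int in_sync;       /* device is a full member of the array */
        int write_mostly;  /* avoid reading from it if possible */
    };

    /* Return the index of the preferred read disk, or -1 if none is usable. */
    static int pick_read_disk(const struct mirror *m, int n)
    {
        int fallback = -1;
        for (int i = 0; i < n; i++) {
            if (!m[i].in_sync)
                continue;
            if (!m[i].write_mostly)
                return i;       /* first ordinary in-sync mirror wins */
            if (fallback < 0)
                fallback = i;   /* remember a write-mostly candidate */
        }
        return fallback;
    }

    int main(void)
    {
        struct mirror m[] = { {1, 1}, {1, 0}, {0, 0} };
        printf("read from mirror %d\n", pick_read_disk(m, 3)); /* prints 1 */
        return 0;
    }

The read_balance() changes below implement the same preference, folded into the kernel's existing head-distance and pending-I/O balancing.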
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 18 +++++++++++ drivers/md/raid1.c | 76 ++++++++++++++++++++++++++++++++--------------- include/linux/raid/md_k.h | 3 ++ include/linux/raid/md_p.h | 11 +++++-- 4 files changed, 82 insertions(+), 26 deletions(-) (limited to 'include/linux/raid') diff --git a/drivers/md/md.c b/drivers/md/md.c index ae654466dc23..f1ac356e656d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -670,6 +670,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) if (mddev->level != LEVEL_MULTIPATH) { rdev->faulty = 0; + rdev->flags = 0; desc = sb->disks + rdev->desc_nr; if (desc->state & (1<<MD_DISK_SYNC)) { rdev->in_sync = 1; rdev->raid_disk = desc->raid_disk; } + if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) + set_bit(WriteMostly, &rdev->flags); } else /* MULTIPATH are always insync */ rdev->in_sync = 1; return 0; @@ -777,6 +780,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) spare++; working++; } + if (test_bit(WriteMostly, &rdev2->flags)) + d->state |= (1<<MD_DISK_WRITEMOSTLY); rdev->raid_disk = role; break; } + rdev->flags = 0; + if (sb->devflags & WriteMostly1) + set_bit(WriteMostly, &rdev->flags); } else /* MULTIPATH are always insync */ rdev->in_sync = 1; @@ -2152,6 +2160,8 @@ static int get_disk_info(mddev_t * mddev, void __user * arg) info.state |= (1<<MD_DISK_SYNC); + if (test_bit(WriteMostly, &rdev->flags)) + info.state |= (1<<MD_DISK_WRITEMOSTLY); rdev->saved_raid_disk = rdev->raid_disk; rdev->in_sync = 0; /* just to be sure */ + if (info->state & (1<<MD_DISK_WRITEMOSTLY)) + set_bit(WriteMostly, &rdev->flags); rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (err) @@ -2277,6 +2290,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) else rdev->in_sync = 0; + if (info->state & (1<<MD_DISK_WRITEMOSTLY)) + set_bit(WriteMostly, &rdev->flags); + err = bind_rdev_to_array(rdev, mddev); if (err) { export_rdev(rdev); @@ -3329,6 +3345,8 @@ static int md_seq_show(struct seq_file *seq, void *v) char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", bdevname(rdev->bdev,b), rdev->desc_nr); + if (test_bit(WriteMostly, &rdev->flags)) + seq_printf(seq, "(W)"); if (rdev->faulty) { seq_printf(seq, "(F)"); continue; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ba643e4bfac9..28839a8193f2 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -360,13 +360,14 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) { const unsigned long this_sector = r1_bio->sector; int new_disk = conf->last_used, disk = new_disk; + int wonly_disk = -1; const int sectors = r1_bio->sectors; sector_t new_distance, current_distance; - mdk_rdev_t *new_rdev, *rdev; + mdk_rdev_t *rdev; rcu_read_lock(); /* - * Check if it if we can balance. We can balance on the whole + * Check if we can balance. We can balance on the whole * device if no resync is going on, or below the resync window. * We take the first readable disk when above the resync window.
*/ @@ -376,11 +377,16 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) /* Choose the first operational device, for consistency */ new_disk = 0; - while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || - !new_rdev->in_sync) { - new_disk++; - if (new_disk == conf->raid_disks) { - new_disk = -1; + for (rdev = conf->mirrors[new_disk].rdev; + !rdev || !rdev->in_sync + || test_bit(WriteMostly, &rdev->flags); + rdev = conf->mirrors[++new_disk].rdev) { + + if (rdev && rdev->in_sync) + wonly_disk = new_disk; + + if (new_disk == conf->raid_disks - 1) { + new_disk = wonly_disk; break; } } @@ -389,16 +395,26 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) /* make sure the disk is operational */ - while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL || - !new_rdev->in_sync) { + for (rdev = conf->mirrors[new_disk].rdev; + !rdev || !rdev->in_sync || + test_bit(WriteMostly, &rdev->flags); + rdev = conf->mirrors[new_disk].rdev) { + + if (rdev && rdev->in_sync) + wonly_disk = new_disk; + if (new_disk <= 0) new_disk = conf->raid_disks; new_disk--; if (new_disk == disk) { - new_disk = -1; - goto rb_out; + new_disk = wonly_disk; + break; } } + + if (new_disk < 0) + goto rb_out; + disk = new_disk; /* now disk == new_disk == starting point for search */ @@ -419,37 +435,41 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) disk = conf->raid_disks; disk--; - if ((rdev=conf->mirrors[disk].rdev) == NULL || - !rdev->in_sync) + rdev = conf->mirrors[disk].rdev; + + if (!rdev || + !rdev->in_sync || + test_bit(WriteMostly, &rdev->flags)) continue; if (!atomic_read(&rdev->nr_pending)) { new_disk = disk; - new_rdev = rdev; break; } new_distance = abs(this_sector - conf->mirrors[disk].head_position); if (new_distance < current_distance) { current_distance = new_distance; new_disk = disk; - new_rdev = rdev; } } while (disk != conf->last_used); -rb_out: + rb_out: if (new_disk >= 0) { - conf->next_seq_sect = this_sector + sectors; - conf->last_used = new_disk; - atomic_inc(&new_rdev->nr_pending); - if (!new_rdev->in_sync) { + rdev = conf->mirrors[new_disk].rdev; + if (!rdev) + goto retry; + atomic_inc(&rdev->nr_pending); + if (!rdev->in_sync) { /* cannot risk returning a device that failed * before we inc'ed nr_pending */ - atomic_dec(&new_rdev->nr_pending); + atomic_dec(&rdev->nr_pending); goto retry; } + conf->next_seq_sect = this_sector + sectors; + conf->last_used = new_disk; } rcu_read_unlock(); @@ -1109,6 +1129,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i sector_t max_sector, nr_sectors; int disk; int i; + int wonly; int write_targets = 0; int sync_blocks; int still_degraded = 0; @@ -1164,14 +1185,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i */ disk = conf->last_used; /* make sure disk is operational */ - + wonly = disk; while (conf->mirrors[disk].rdev == NULL || - !conf->mirrors[disk].rdev->in_sync) { + !conf->mirrors[disk].rdev->in_sync || + test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags) + ) { + if (conf->mirrors[disk].rdev && + conf->mirrors[disk].rdev->in_sync) + wonly = disk; if (disk <= 0) disk = conf->raid_disks; disk--; - if (disk == conf->last_used) + if (disk == conf->last_used) { + disk = wonly; break; + } } conf->last_used = disk; atomic_inc(&conf->mirrors[disk].rdev->nr_pending); diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 817062bf7352..7ef78e15ce04 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -181,6 +181,9 @@ struct mdk_rdev_s int
faulty; /* if faulty do not issue IO requests */ int in_sync; /* device is a full member of the array */ + unsigned long flags; /* Should include faulty and in_sync here. */ +#define WriteMostly 4 /* Avoid reading if at all possible */ + int desc_nr; /* descriptor index in the superblock */ int raid_disk; /* role of device in array */ int saved_raid_disk; /* role that device used to have in the diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index dc65cd435494..4f047f84fb1f 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -79,6 +79,11 @@ #define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ #define MD_DISK_REMOVED 3 /* disk has been removed from the raid set */ +#define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" in RAID1 config. * read requests will only be sent here in * dire need */ + typedef struct mdp_device_descriptor_s { __u32 number; /* 0 Device number in the entire set */ __u32 major; /* 1 Device major number */ @@ -193,7 +198,7 @@ struct mdp_superblock_1 { __u64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/ __u32 level; /* -4 (multipath), -1 (linear), 0,1,4,5 */ - __u32 layout; /* only for raid5 currently */ + __u32 layout; /* only for raid5 and raid10 currently */ __u64 size; /* used size of component devices, in 512byte sectors */ __u32 chunksize; /* in 512byte sectors */ @@ -212,7 +217,9 @@ struct mdp_superblock_1 { __u32 dev_number; /* permanent identifier of this device - not role in raid */ __u32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ __u8 device_uuid[16]; /* user-space settable, ignored by kernel */ - __u8 pad2[64-56]; /* set to 0 when writing */ + __u8 devflags; /* per-device flags. Only one defined...*/ +#define WriteMostly1 1 /* mask for writemostly flag in above */ + __u8 pad2[64-57]; /* set to 0 when writing */ /* array state information - 64 bytes */ __u64 utime; /* 40 bits seconds, 24 bits microseconds */ -- cgit v1.2.3 From 4b6d287f627b5fb6a49f78f9e81649ff98c62bb7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:47 -0700 Subject: [PATCH] md: add write-behind support for md/raid1 If a device is flagged 'WriteMostly' and the array has a bitmap, and the bitmap superblock indicates that write_behind is allowed, then write_behind is enabled for WriteMostly devices. Write requests will be acknowledged as complete to the caller (via bi_end_io) when all non-WriteMostly devices have completed the write, but will not be cleared from the bitmap until all devices complete. This requires memory allocation to make a local copy of the data being written. If there is insufficient memory, then we fall back on normal write semantics.
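The completion rule deserves a concrete restatement. The patch keeps two per-request counters: 'remaining' (writes still outstanding on any device) and 'behind_remaining' (writes still outstanding on WriteMostly devices), and the master bio may be acknowledged as soon as every write still in flight is a write-behind write. A simplified single-threaded model of that bookkeeping, using invented names and plain ints where the kernel uses atomics, and ignoring the uptodate check:

    #include <stdio.h>

    struct r1_request {
        int remaining;        /* writes outstanding on all devices */
        int behind_remaining; /* writes outstanding on WriteMostly devices */
        int returned;         /* master bio already acknowledged? */
    };

    /* Called as each per-device write completes. */
    static void end_write(struct r1_request *r, int write_mostly)
    {
        if (write_mostly)
            r->behind_remaining--;
        /* 'remaining' still counts the write being ended here, hence
         * the -1: if every other outstanding write is a behind write,
         * the caller can be acknowledged now. */
        if (!r->returned && r->behind_remaining >= r->remaining - 1) {
            r->returned = 1;
            printf("master bio acknowledged\n");
        }
        r->remaining--;
    }

    int main(void)
    {
        /* two ordinary mirrors plus one write-mostly mirror */
        struct r1_request r = { 3, 1, 0 };
        end_write(&r, 0); /* one ordinary write still out: no ack yet */
        end_write(&r, 0); /* last ordinary write done: ack fires here */
        end_write(&r, 1); /* behind write trickles in afterwards */
        return 0;
    }

The bitmap bits for the request are still only cleared when 'remaining' reaches zero, which is what keeps write-behind data recoverable after a crash.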
Signed-Off-By: Paul Clements Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/bitmap.c | 26 ++++++++-- drivers/md/raid1.c | 124 +++++++++++++++++++++++++++++++++++++++++--- include/linux/raid/bitmap.h | 15 ++++-- include/linux/raid/md_k.h | 3 ++ include/linux/raid/raid1.h | 13 +++++ 5 files changed, 165 insertions(+), 16 deletions(-) (limited to 'include/linux/raid') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 2925219f0881..2c84de2b4ad5 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -437,6 +437,7 @@ void bitmap_print_sb(struct bitmap *bitmap) printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); printk(KERN_DEBUG " sync size: %llu KB\n", (unsigned long long)le64_to_cpu(sb->sync_size)/2); + printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); kunmap(bitmap->sb_page); } @@ -445,7 +446,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) { char *reason = NULL; bitmap_super_t *sb; - unsigned long chunksize, daemon_sleep; + unsigned long chunksize, daemon_sleep, write_behind; unsigned long bytes_read; unsigned long long events; int err = -EINVAL; @@ -474,6 +475,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) chunksize = le32_to_cpu(sb->chunksize); daemon_sleep = le32_to_cpu(sb->daemon_sleep); + write_behind = le32_to_cpu(sb->write_behind); /* verify that the bitmap-specific fields are valid */ if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) @@ -485,7 +487,9 @@ static int bitmap_read_sb(struct bitmap *bitmap) else if ((1 << ffz(~chunksize)) != chunksize) reason = "bitmap chunksize not a power of 2"; else if (daemon_sleep < 1 || daemon_sleep > 15) - reason = "daemon sleep period out of range"; + reason = "daemon sleep period out of range (1-15s)"; + else if (write_behind > COUNTER_MAX) + reason = "write-behind limit out of range (0 - 16383)"; if (reason) { printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n", bmname(bitmap), reason); @@ -518,6 +522,7 @@ success: /* assign fields using values from superblock */ bitmap->chunksize = chunksize; bitmap->daemon_sleep = daemon_sleep; + bitmap->max_write_behind = write_behind; bitmap->flags |= sb->state; bitmap->events_cleared = le64_to_cpu(sb->events_cleared); if (sb->state & BITMAP_STALE) @@ -1282,9 +1287,16 @@ static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, } } -int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors) +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) { if (!bitmap) return 0; + + if (behind) { + atomic_inc(&bitmap->behind_writes); + PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n", + atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); + } + while (sectors) { int blocks; bitmap_counter_t *bmc; @@ -1319,9 +1331,15 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect } void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, - int success) + int success, int behind) { if (!bitmap) return; + if (behind) { + atomic_dec(&bitmap->behind_writes); + PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", + atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); + } + while (sectors) { int blocks; unsigned long flags; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 28839a8193f2..ba7f5f256161 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -222,8 +222,17 @@ static void raid_end_bio_io(r1bio_t *r1_bio) { struct bio *bio = 
r1_bio->master_bio; - bio_endio(bio, bio->bi_size, - test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); + /* if nobody has done the final endio yet, do it now */ + if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { + PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n", + (bio_data_dir(bio) == WRITE) ? "write" : "read", + (unsigned long long) bio->bi_sector, + (unsigned long long) bio->bi_sector + + (bio->bi_size >> 9) - 1); + + bio_endio(bio, bio->bi_size, + test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); + } free_r1bio(r1_bio); } @@ -292,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); - int mirror; + int mirror, behind; conf_t *conf = mddev_to_conf(r1_bio->mddev); if (bio->bi_size) @@ -323,16 +332,46 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int update_head_pos(mirror, r1_bio); + behind = test_bit(R1BIO_BehindIO, &r1_bio->state); + if (behind) { + if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) + atomic_dec(&r1_bio->behind_remaining); + + /* In behind mode, we ACK the master bio once the I/O has safely + * reached all non-writemostly disks. Setting the Returned bit + * ensures that this gets done only once -- we don't ever want to + * return -EIO here, instead we'll wait */ + + if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && + test_bit(R1BIO_Uptodate, &r1_bio->state)) { + /* Maybe we can return now */ + if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { + struct bio *mbio = r1_bio->master_bio; + PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", + (unsigned long long) mbio->bi_sector, + (unsigned long long) mbio->bi_sector + + (mbio->bi_size >> 9) - 1); + bio_endio(mbio, mbio->bi_size, 0); + } + } + } /* * * Let's see if all mirrored write operations have finished * already. 
*/ if (atomic_dec_and_test(&r1_bio->remaining)) { + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + /* free extra copy of the data pages */ + int i = bio->bi_vcnt; + while (i--) + __free_page(bio->bi_io_vec[i].bv_page); + } /* clear the bitmap if all writes complete successfully */ bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state)); + !test_bit(R1BIO_Degraded, &r1_bio->state), + behind); md_write_end(r1_bio->mddev); raid_end_bio_io(r1_bio); } @@ -562,6 +601,39 @@ static void device_barrier(conf_t *conf, sector_t sect) spin_unlock_irq(&conf->resync_lock); } +/* duplicate the data pages for behind I/O */ +static struct page **alloc_behind_pages(struct bio *bio) +{ + int i; + struct bio_vec *bvec; + struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *), + GFP_NOIO); + if (unlikely(!pages)) + goto do_sync_io; + + memset(pages, 0, bio->bi_vcnt * sizeof(struct page *)); + + bio_for_each_segment(bvec, bio, i) { + pages[i] = alloc_page(GFP_NOIO); + if (unlikely(!pages[i])) + goto do_sync_io; + memcpy(kmap(pages[i]) + bvec->bv_offset, + kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); + kunmap(pages[i]); + kunmap(bvec->bv_page); + } + + return pages; + +do_sync_io: + if (pages) + for (i = 0; i < bio->bi_vcnt && pages[i]; i++) + __free_page(pages[i]); + kfree(pages); + PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); + return NULL; +} + static int make_request(request_queue_t *q, struct bio * bio) { mddev_t *mddev = q->queuedata; @@ -574,6 +646,7 @@ static int make_request(request_queue_t *q, struct bio * bio) struct bitmap *bitmap = mddev->bitmap; unsigned long flags; struct bio_list bl; + struct page **behind_pages = NULL; if (unlikely(bio_barrier(bio))) { bio_endio(bio, bio->bi_size, -EOPNOTSUPP); @@ -613,8 +686,6 @@ static int make_request(request_queue_t *q, struct bio * bio) r1_bio->mddev = mddev; r1_bio->sector = bio->bi_sector; - r1_bio->state = 0; - if (bio_data_dir(bio) == READ) { /* * read balancing logic: @@ -675,13 +746,22 @@ static int make_request(request_queue_t *q, struct bio * bio) } rcu_read_unlock(); + BUG_ON(targets == 0); /* we never fail the last device */ + if (targets < conf->raid_disks) { /* array is degraded, we will not clear the bitmap * on I/O completion (see raid1_end_write_request) */ set_bit(R1BIO_Degraded, &r1_bio->state); } + /* do behind I/O ? */ + if (bitmap && + atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && + (behind_pages = alloc_behind_pages(bio)) != NULL) + set_bit(R1BIO_BehindIO, &r1_bio->state); + atomic_set(&r1_bio->remaining, 0); + atomic_set(&r1_bio->behind_remaining, 0); bio_list_init(&bl); for (i = 0; i < disks; i++) { @@ -698,12 +778,31 @@ static int make_request(request_queue_t *q, struct bio * bio) mbio->bi_rw = WRITE; mbio->bi_private = r1_bio; + if (behind_pages) { + struct bio_vec *bvec; + int j; + + /* Yes, I really want the '__' version so that + * we clear any unused pointer in the io_vec, rather + * than leave them unchanged. 
This is important + * because when we come to free the pages, we won't + * know the original bi_idx, so we just free + * them all + */ + __bio_for_each_segment(bvec, mbio, j, 0) + bvec->bv_page = behind_pages[j]; + if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) + atomic_inc(&r1_bio->behind_remaining); + } + atomic_inc(&r1_bio->remaining); bio_list_add(&bl, mbio); } + kfree(behind_pages); /* the behind pages are attached to the bios now */ - bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors); + bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, + test_bit(R1BIO_BehindIO, &r1_bio->state)); spin_lock_irqsave(&conf->device_lock, flags); bio_list_merge(&conf->pending_bio_list, &bl); bio_list_init(&bl); @@ -1471,6 +1570,17 @@ out: static int stop(mddev_t *mddev) { conf_t *conf = mddev_to_conf(mddev); + struct bitmap *bitmap = mddev->bitmap; + int behind_wait = 0; + + /* wait for behind writes to complete */ + while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { + behind_wait++; + printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ); /* wait a second */ + /* need to kick something here to make sure I/O goes? */ + } md_unregister_thread(mddev->thread); mddev->thread = NULL; diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h index 4bf1659f8aa8..9de99198caf1 100644 --- a/include/linux/raid/bitmap.h +++ b/include/linux/raid/bitmap.h @@ -7,7 +7,7 @@ #define BITMAP_H 1 #define BITMAP_MAJOR 3 -#define BITMAP_MINOR 38 +#define BITMAP_MINOR 39 /* * in-memory bitmap: @@ -147,8 +147,9 @@ typedef struct bitmap_super_s { __u32 state; /* 48 bitmap state information */ __u32 chunksize; /* 52 the bitmap chunk size in bytes */ __u32 daemon_sleep; /* 56 seconds between disk flushes */ + __u32 write_behind; /* 60 number of outstanding write-behind writes */ - __u8 pad[256 - 60]; /* set to zero */ + __u8 pad[256 - 64]; /* set to zero */ } bitmap_super_t; /* notes: @@ -226,6 +227,9 @@ struct bitmap { unsigned long flags; + unsigned long max_write_behind; /* write-behind mode */ + atomic_t behind_writes; + /* * the bitmap daemon - periodically wakes up and sweeps the bitmap * file, cleaning up bits and flushing out pages to disk as necessary @@ -260,9 +264,10 @@ int bitmap_setallbits(struct bitmap *bitmap); void bitmap_write_all(struct bitmap *bitmap); /* these are exported */ -int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors); -void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, - int success); +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int behind); +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int success, int behind); int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded); void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); void bitmap_close_sync(struct bitmap *bitmap); diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 7ef78e15ce04..2514e5fcda7f 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -275,6 +275,9 @@ struct mddev_s atomic_t writes_pending; request_queue_t *queue; /* for plugging ...
*/ + atomic_t write_behind; /* outstanding async IO */ + unsigned int max_write_behind; /* 0 = sync */ + struct bitmap *bitmap; /* the bitmap for the device */ struct file *bitmap_file; /* the bitmap file */ long bitmap_offset; /* offset from superblock of diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h index 9d93cf12e890..60e19b667548 100644 --- a/include/linux/raid/raid1.h +++ b/include/linux/raid/raid1.h @@ -80,6 +80,9 @@ struct r1bio_s { atomic_t remaining; /* 'have we finished' count, * used from IRQ handlers */ + atomic_t behind_remaining; /* number of write-behind ios remaining * in this BehindIO request */ sector_t sector; int sectors; unsigned long state; @@ -107,4 +110,14 @@ struct r1bio_s { #define R1BIO_Uptodate 0 #define R1BIO_IsSync 1 #define R1BIO_Degraded 2 +#define R1BIO_BehindIO 3 +/* For write-behind requests, we call bi_end_io when + * the last non-write-behind device completes, provided + * any write was successful. Otherwise we call when + * any write-behind write succeeds, otherwise we call + * with failure when last write completes (and all failed). + * Record that bi_end_io was called with this flag... + */ +#define R1BIO_Returned 4 + #endif -- cgit v1.2.3 From 15945fee6f09bff1f86b1a735b5888dc59cf38e3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:47 -0700 Subject: [PATCH] md: support md/linear array with components greater than 2 terabytes. linear currently uses division by the size of the smallest component device to find which device a request goes to. If that smallest device is larger than 2 terabytes, then the division will not work on some systems. So we introduce a pre-shift, and take care not to make the hash table too large, much like the code in raid0. Also get rid of conf->nr_zones, which is not needed. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/linear.c | 95 +++++++++++++++++++++++++++++++-------------- include/linux/raid/linear.h | 4 +- 2 files changed, 68 insertions(+), 31 deletions(-) (limited to 'include/linux/raid') diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 4991ba543368..bb279fad2fd2 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -38,7 +38,8 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) /* * sector_div(a,b) returns the remainder and sets a to a/b */ - (void)sector_div(block, conf->smallest->size); + block >>= conf->preshift; + (void)sector_div(block, conf->hash_spacing); hash = conf->hash_table[block]; while ((sector>>1) >= (hash->size + hash->offset)) @@ -47,7 +48,7 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) } /** - * linear_mergeable_bvec -- tell bio layer if a two requests can be merged + * linear_mergeable_bvec -- tell bio layer if two requests can be merged * @q: request queue * @bio: the buffer head that's been built up so far * @biovec: the request that could be merged to it. @@ -116,7 +117,7 @@ static int linear_run (mddev_t *mddev) dev_info_t **table; mdk_rdev_t *rdev; int i, nb_zone, cnt; - sector_t start; + sector_t min_spacing; sector_t curr_offset; struct list_head *tmp; @@ -127,11 +128,6 @@ static int linear_run (mddev_t *mddev) memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t)); mddev->private = conf; - /* - * Find the smallest device.
*/ - - conf->smallest = NULL; cnt = 0; mddev->array_size = 0; @@ -159,8 +155,6 @@ static int linear_run (mddev_t *mddev) disk->size = rdev->size; mddev->array_size += rdev->size; - if (!conf->smallest || (disk->size < conf->smallest->size)) - conf->smallest = disk; cnt++; } if (cnt != mddev->raid_disks) { @@ -168,6 +162,36 @@ static int linear_run (mddev_t *mddev) goto out; } + min_spacing = mddev->array_size; + sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *)); + + /* min_spacing is the minimum spacing that will fit the hash + * table in one PAGE. This may be much smaller than needed. + * We find the smallest non-terminal set of consecutive devices + * that is larger than min_spacing and use the size of that as + * the actual spacing + */ + conf->hash_spacing = mddev->array_size; + for (i=0; i < cnt-1 ; i++) { + sector_t sz = 0; + int j; + for (j=i; j<cnt-1 && sz < min_spacing ; j++) + sz += conf->disks[j].size; + if (sz >= min_spacing && sz < conf->hash_spacing) + conf->hash_spacing = sz; + } + + /* hash_spacing may be too large for sector_div to work with, + * so we might need to pre-shift + */ + conf->preshift = 0; + if (sizeof(sector_t) > sizeof(u32)) { + sector_t space = conf->hash_spacing; + while (space > (sector_t)(~(u32)0)) { + space >>= 1; + conf->preshift++; + } + } /* * This code was restructured to work around a gcc-2.95.3 internal * compiler error. Alter it with care. @@ -177,39 +201,52 @@ static int linear_run (mddev_t *mddev) unsigned round; unsigned long base; - sz = mddev->array_size; - base = conf->smallest->size; + sz = mddev->array_size >> conf->preshift; + sz += 1; /* force round-up */ + base = conf->hash_spacing >> conf->preshift; round = sector_div(sz, base); - nb_zone = conf->nr_zones = sz + (round ? 1 : 0); + nb_zone = sz + (round ? 1 : 0); } - - conf->hash_table = kmalloc (sizeof (dev_info_t*) * nb_zone, + BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *)); + + conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone, GFP_KERNEL); if (!conf->hash_table) goto out; /* * Here we generate the linear hash table + * First calculate the device offsets. */ + conf->disks[0].offset = 0; + for (i=1; i<mddev->raid_disks; i++) + conf->disks[i].offset = + conf->disks[i-1].offset + + conf->disks[i-1].size; + table = conf->hash_table; - start = 0; curr_offset = 0; - for (i = 0; i < cnt; i++) { - dev_info_t *disk = conf->disks + i; + i = 0; + for (curr_offset = 0; + curr_offset < mddev->array_size; + curr_offset += conf->hash_spacing) { - disk->offset = curr_offset; - curr_offset += disk->size; + while (i < mddev->raid_disks-1 && + curr_offset >= conf->disks[i+1].offset) + i++; - /* 'curr_offset' is the end of this disk - * 'start' is the start of table + *table ++ = conf->disks + i; + } + + if (conf->preshift) { + conf->hash_spacing >>= conf->preshift; + /* round hash_spacing up so that when we divide by it, + * we err on the side of "too-low", which is safest.
*/ - while (start < curr_offset) { - *table++ = disk; - start += conf->smallest->size; - } + conf->hash_spacing++; } - if (table-conf->hash_table != nb_zone) - BUG(); + + BUG_ON(table - conf->hash_table > nb_zone); blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->unplug_fn = linear_unplug; @@ -299,7 +336,7 @@ static void linear_status (struct seq_file *seq, mddev_t *mddev) sector_t s = 0; seq_printf(seq, " "); - for (j = 0; j < conf->nr_zones; j++) + for (j = 0; j < mddev->raid_disks; j++) { char b[BDEVNAME_SIZE]; s += conf->smallest_size; diff --git a/include/linux/raid/linear.h b/include/linux/raid/linear.h index e04c4fe45b53..7eaf290e10e7 100644 --- a/include/linux/raid/linear.h +++ b/include/linux/raid/linear.h @@ -14,8 +14,8 @@ typedef struct dev_info dev_info_t; struct linear_private_data { dev_info_t **hash_table; - dev_info_t *smallest; - int nr_zones; + sector_t hash_spacing; + int preshift; /* shift before dividing by hash_spacing */ dev_info_t disks[0]; }; -- cgit v1.2.3 From 71c0805cb48462c99fbe0e5fcc6c12d7b9929c09 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:51 -0700 Subject: [PATCH] md: allow md to load a superblock with feature-bit '1' set As this is used to flag an internal bitmap. Also, introduce symbolic names for feature bits. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 6 +++--- include/linux/raid/md_p.h | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux/raid') diff --git a/drivers/md/md.c b/drivers/md/md.c index 866c704e008a..1be3f2de396b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -875,7 +875,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) sb->major_version != cpu_to_le32(1) || le32_to_cpu(sb->max_dev) > (4096-256)/2 || le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) || - sb->feature_map != 0) + (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) return -EINVAL; if (calc_sb_1_csum(sb) != sb->sb_csum) { @@ -954,7 +954,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->max_disks = (4096-256)/2; - if ((le32_to_cpu(sb->feature_map) & 1) && + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && mddev->bitmap_file == NULL ) { if (mddev->level != 1) { printk(KERN_WARNING "md: bitmaps only supported for raid1\n"); @@ -1029,7 +1029,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) if (mddev->bitmap && mddev->bitmap_file == NULL) { sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); - sb->feature_map = cpu_to_le32(1); + sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } max_dev = 0; diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index 4f047f84fb1f..c100fa5d4bfa 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -238,5 +238,10 @@ struct mdp_superblock_1 { __u16 dev_roles[0]; /* role in array, or 0xffff for a spare, or 0xfffe for faulty */ }; +/* feature_map bits */ +#define MD_FEATURE_BITMAP_OFFSET 1 + +#define MD_FEATURE_ALL 1 + #endif -- cgit v1.2.3 From 773f7834425e83144c95fbbc553ced3c2b74b828 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:53 -0700 Subject: [PATCH] md: remove old cruft from md_k.h header file These inlines haven't been used for ages, they should go. 
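Each of these inlines was a one-line wrapper around a bit test or bit update on mdp_disk_t.state, so any caller that somehow survived would simply open-code the operation. Purely as an illustration of the equivalence (standalone userspace sketch, not kernel code):

    #include <stdio.h>

    #define MD_DISK_FAULTY  0
    #define MD_DISK_ACTIVE  1
    #define MD_DISK_SYNC    2
    #define MD_DISK_REMOVED 3

    struct mdp_disk { unsigned int state; };

    int main(void)
    {
        struct mdp_disk d = { 0 };

        d.state |= (1 << MD_DISK_ACTIVE);   /* was mark_disk_active(&d)  */
        d.state &= ~(1 << MD_DISK_SYNC);    /* was mark_disk_nonsync(&d) */
        printf("active: %d\n",
               !!(d.state & (1 << MD_DISK_ACTIVE))); /* was disk_active(&d) */
        return 0;
    }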
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 64 ----------------------------------------------- 1 file changed, 64 deletions(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 2514e5fcda7f..8042f55dd323 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -85,70 +85,6 @@ typedef struct mdk_rdev_s mdk_rdev_t; #define MAX_CHUNK_SIZE (4096*1024) -/* - * default readahead - */ - -static inline int disk_faulty(mdp_disk_t * d) -{ - return d->state & (1 << MD_DISK_FAULTY); -} - -static inline int disk_active(mdp_disk_t * d) -{ - return d->state & (1 << MD_DISK_ACTIVE); -} - -static inline int disk_sync(mdp_disk_t * d) -{ - return d->state & (1 << MD_DISK_SYNC); -} - -static inline int disk_spare(mdp_disk_t * d) -{ - return !disk_sync(d) && !disk_active(d) && !disk_faulty(d); -} - -static inline int disk_removed(mdp_disk_t * d) -{ - return d->state & (1 << MD_DISK_REMOVED); -} - -static inline void mark_disk_faulty(mdp_disk_t * d) -{ - d->state |= (1 << MD_DISK_FAULTY); -} - -static inline void mark_disk_active(mdp_disk_t * d) -{ - d->state |= (1 << MD_DISK_ACTIVE); -} - -static inline void mark_disk_sync(mdp_disk_t * d) -{ - d->state |= (1 << MD_DISK_SYNC); -} - -static inline void mark_disk_spare(mdp_disk_t * d) -{ - d->state = 0; -} - -static inline void mark_disk_removed(mdp_disk_t * d) -{ - d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED); -} - -static inline void mark_disk_inactive(mdp_disk_t * d) -{ - d->state &= ~(1 << MD_DISK_ACTIVE); -} - -static inline void mark_disk_nonsync(mdp_disk_t * d) -{ - d->state &= ~(1 << MD_DISK_SYNC); -} - /* * MD's 'extended' device */ -- cgit v1.2.3 From 0002b2718dd04da67c21f8a7830de8d95a9b0345 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:53 -0700 Subject: [PATCH] md: limit size of sb read/written to appropriate amount version-1 superblocks are not (normally) 4K long, and can be of variable size. Writing the full 4K can cause corruption (but only in non-default configurations). With this patch the super-block-flavour can choose a size to read, and set a size to write based on what it finds. 
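One detail worth spelling out: a version-1 superblock occupies 256 bytes plus two bytes per device role (max_dev entries), and the write size must then be padded out to the device's block size. For a power-of-two block size B with mask bmask = B-1, the expression (n | bmask) + 1 rounds n up to the next multiple of B, which is exactly what super_1_load does below. A quick check of the arithmetic (standalone sketch, not kernel code):

    #include <stdio.h>

    /* Round n up to the next multiple of a power-of-two blocksize. */
    static unsigned int round_up_block(unsigned int n, unsigned int blocksize)
    {
        unsigned int bmask = blocksize - 1;
        if (n & bmask)
            n = (n | bmask) + 1;
        return n;
    }

    int main(void)
    {
        /* 128 roles: 256 + 128*2 = 512 bytes, already block-aligned */
        printf("%u\n", round_up_block(256 + 128 * 2, 512));  /* 512  */
        /* 384 roles on a 4096-byte-block device: pad 1024 up to 4096 */
        printf("%u\n", round_up_block(256 + 384 * 2, 4096)); /* 4096 */
        return 0;
    }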
Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 20 +++++++++++++++----- include/linux/raid/md_k.h | 1 + 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include/linux/raid') diff --git a/drivers/md/md.c b/drivers/md/md.c index 1be3f2de396b..be7873c61b3c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -393,7 +393,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size, return ret; } -static int read_disk_sb(mdk_rdev_t * rdev) +static int read_disk_sb(mdk_rdev_t * rdev, int size) { char b[BDEVNAME_SIZE]; if (!rdev->sb_page) { @@ -404,7 +404,7 @@ static int read_disk_sb(mdk_rdev_t * rdev) return 0; - if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) + if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ)) goto fail; rdev->sb_loaded = 1; return 0; @@ -531,7 +531,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version sb_offset = calc_dev_sboffset(rdev->bdev); rdev->sb_offset = sb_offset; - ret = read_disk_sb(rdev); + ret = read_disk_sb(rdev, MD_SB_BYTES); if (ret) return ret; ret = -EINVAL; @@ -564,6 +564,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version rdev->preferred_minor = sb->md_minor; rdev->data_offset = 0; + rdev->sb_size = MD_SB_BYTES; if (sb->level == LEVEL_MULTIPATH) rdev->desc_nr = -1; @@ -837,6 +838,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) int ret; sector_t sb_offset; char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; + int bmask; /* * Calculate the position of the superblock. @@ -865,7 +867,10 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) } rdev->sb_offset = sb_offset; - ret = read_disk_sb(rdev); + /* superblock is rarely larger than 1K, but it can be larger, + * and it is safe to read 4k, so we do that + */ + ret = read_disk_sb(rdev, 4096); if (ret) return ret; @@ -891,6 +896,11 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); + rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; + bmask = block_size(rdev->bdev)-1; + if (rdev->sb_size & bmask) + rdev-> sb_size = (rdev->sb_size | bmask)+1; + if (refdev == 0) return 1; else { @@ -1375,7 +1385,7 @@ repeat: dprintk("%s ", bdevname(rdev->bdev,b)); if (!rdev->faulty) { md_super_write(mddev,rdev, - rdev->sb_offset<<1, MD_SB_BYTES, + rdev->sb_offset<<1, rdev->sb_size, rdev->sb_page); dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", bdevname(rdev->bdev,b), diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 8042f55dd323..ebce949b1443 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -102,6 +102,7 @@ struct mdk_rdev_s int sb_loaded; sector_t data_offset; /* start of data in array */ sector_t sb_offset; + int sb_size; /* bytes in the superblock */ int preferred_minor; /* autorun support */ /* A device can be in one of three states based on two flags: -- cgit v1.2.3 From 72626685dc66d455742a7f215a0535c551628b9e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 9 Sep 2005 16:23:54 -0700 Subject: [PATCH] md: add write-intent-bitmap support to raid5 Most awkward part of this is delaying write requests until bitmap updates have been flushed. To achieve this, we have a sequence number (seq_flush) which is incremented each time the raid5 is unplugged. 
If the raid thread notices that this has changed, it flushes bitmap changes, and assigns the value of seq_flush to seq_write. When a write request arrives, it is given the number from seq_write, and that write request may not complete until seq_flush is larger than the saved seq number. We have a new queue for storing stripes which are waiting for a bitmap flush and an extra flag for stripes to record if the write was 'degraded' and so should not clear the bit in the bitmap. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 3 +- drivers/md/raid5.c | 133 +++++++++++++++++++++++++++++++++++++++++---- include/linux/raid/raid5.h | 14 ++++- 3 files changed, 137 insertions(+), 13 deletions(-) (limited to 'include/linux/raid') diff --git a/drivers/md/md.c b/drivers/md/md.c index be7873c61b3c..dbf540a7fccc 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -645,7 +645,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && mddev->bitmap_file == NULL) { - if (mddev->level != 1) { + if (mddev->level != 1 && mddev->level != 5) { /* FIXME use a better test */ printk(KERN_WARNING "md: bitmaps only supported for raid1\n"); return -EINVAL; @@ -3517,7 +3517,6 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) */ void md_write_start(mddev_t *mddev, struct bio *bi) { - DEFINE_WAIT(w); if (bio_data_dir(bi) != WRITE) return; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ed859e08d600..4683ca24c046 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -24,6 +24,8 @@ #include <linux/bitops.h> #include <asm/atomic.h> +#include <linux/raid/bitmap.h> + /* * Stripe cache */ @@ -79,8 +81,13 @@ static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) if (test_bit(STRIPE_HANDLE, &sh->state)) { if (test_bit(STRIPE_DELAYED, &sh->state)) list_add_tail(&sh->lru, &conf->delayed_list); - else + else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + conf->seq_write == sh->bm_seq) + list_add_tail(&sh->lru, &conf->bitmap_list); + else { + clear_bit(STRIPE_BIT_DELAY, &sh->state); list_add_tail(&sh->lru, &conf->handle_list); + } md_wakeup_thread(conf->mddev->thread); } else { if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { @@ -244,6 +251,9 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector spin_lock_irq(&conf->device_lock); do { + wait_event_lock_irq(conf->wait_for_stripe, + conf->quiesce == 0, + conf->device_lock, /* nothing */); sh = __find_stripe(conf, sector); if (!sh) { if (!conf->inactive_blocked) @@ -803,6 +813,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in { struct bio **bip; raid5_conf_t *conf = sh->raid_conf; + int firstwrite=0; PRINTK("adding bh b#%llu to stripe s#%llu\n", (unsigned long long)bi->bi_sector, (unsigned long long)sh->sector); spin_lock(&sh->lock); spin_lock_irq(&conf->device_lock); - if (forwrite) + if (forwrite) { bip = &sh->dev[dd_idx].towrite; + if (*bip == NULL && sh->dev[dd_idx].written == NULL) + firstwrite = 1; + } else bip = &sh->dev[dd_idx].toread; while (*bip && (*bip)->bi_sector < bi->bi_sector) { if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) @@ -836,6 +849,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in (unsigned long long)bi->bi_sector, (unsigned long long)sh->sector, dd_idx); + if (conf->mddev->bitmap && firstwrite) { + sh->bm_seq = conf->seq_write; + bitmap_startwrite(conf->mddev->bitmap, sh->sector,
+ STRIPE_SECTORS, 0); + set_bit(STRIPE_BIT_DELAY, &sh->state); + } + if (forwrite) { /* check if page is covered */ sector_t sector = sh->dev[dd_idx].sector; @@ -958,12 +978,13 @@ static void handle_stripe(struct stripe_head *sh) * need to be failed */ if (failed > 1 && to_read+to_write+written) { - spin_lock_irq(&conf->device_lock); for (i=disks; i--; ) { + int bitmap_end = 0; + spin_lock_irq(&conf->device_lock); /* fail all writes first */ bi = sh->dev[i].towrite; sh->dev[i].towrite = NULL; - if (bi) to_write--; + if (bi) { to_write--; bitmap_end = 1; } if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); @@ -981,6 +1002,7 @@ static void handle_stripe(struct stripe_head *sh) /* and fail all 'written' */ bi = sh->dev[i].written; sh->dev[i].written = NULL; + if (bi) bitmap_end = 1; while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); clear_bit(BIO_UPTODATE, &bi->bi_flags); @@ -1009,8 +1031,11 @@ static void handle_stripe(struct stripe_head *sh) bi = nextbi; } } + spin_unlock_irq(&conf->device_lock); + if (bitmap_end) + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, 0, 0); } - spin_unlock_irq(&conf->device_lock); } if (failed > 1 && syncing) { md_done_sync(conf->mddev, STRIPE_SECTORS,0); @@ -1038,6 +1063,7 @@ static void handle_stripe(struct stripe_head *sh) test_bit(R5_UPTODATE, &dev->flags) ) { /* We can return any write requests */ struct bio *wbi, *wbi2; + int bitmap_end = 0; PRINTK("Return write for disc %d\n", i); spin_lock_irq(&conf->device_lock); wbi = dev->written; @@ -1051,7 +1077,13 @@ static void handle_stripe(struct stripe_head *sh) } wbi = wbi2; } + if (dev->towrite == NULL) + bitmap_end = 1; spin_unlock_irq(&conf->device_lock); + if (bitmap_end) + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, + !test_bit(STRIPE_DEGRADED, &sh->state), 0); } } } @@ -1175,7 +1207,8 @@ static void handle_stripe(struct stripe_head *sh) } } /* now if nothing is locked, and if we have enough data, we can start a write request */ - if (locked == 0 && (rcw == 0 ||rmw == 0)) { + if (locked == 0 && (rcw == 0 ||rmw == 0) && + !test_bit(STRIPE_BIT_DELAY, &sh->state)) { PRINTK("Computing parity...\n"); compute_parity(sh, rcw==0 ? 
RECONSTRUCT_WRITE : READ_MODIFY_WRITE); /* now every locked buffer is ready to be written */ @@ -1231,6 +1264,7 @@ static void handle_stripe(struct stripe_head *sh) dev = &sh->dev[failed_num]; set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); + clear_bit(STRIPE_DEGRADED, &sh->state); locked++; set_bit(STRIPE_INSYNC, &sh->state); set_bit(R5_Syncio, &dev->flags); @@ -1298,6 +1332,8 @@ static void handle_stripe(struct stripe_head *sh) bi->bi_next = NULL; generic_make_request(bi); } else { + if (rw == 1) + set_bit(STRIPE_DEGRADED, &sh->state); PRINTK("skip op %ld on disc %d for sector %llu\n", bi->bi_rw, i, (unsigned long long)sh->sector); clear_bit(R5_LOCKED, &sh->dev[i].flags); @@ -1322,6 +1358,20 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf) } } +static inline void activate_bit_delay(raid5_conf_t *conf) +{ + /* device_lock is held */ + struct list_head head; + list_add(&head, &conf->bitmap_list); + list_del_init(&conf->bitmap_list); + while (!list_empty(&head)) { + struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); + list_del_init(&sh->lru); + atomic_inc(&sh->count); + __release_stripe(conf, sh); + } +} + static void unplug_slaves(mddev_t *mddev) { raid5_conf_t *conf = mddev_to_conf(mddev); @@ -1354,8 +1404,10 @@ static void raid5_unplug_device(request_queue_t *q) spin_lock_irqsave(&conf->device_lock, flags); - if (blk_remove_plug(q)) + if (blk_remove_plug(q)) { + conf->seq_flush++; raid5_activate_delayed(conf); + } md_wakeup_thread(mddev->thread); spin_unlock_irqrestore(&conf->device_lock, flags); @@ -1493,10 +1545,20 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i sector_t first_sector; int raid_disks = conf->raid_disks; int data_disks = raid_disks-1; + sector_t max_sector = mddev->size << 1; + int sync_blocks; - if (sector_nr >= mddev->size <<1) { + if (sector_nr >= max_sector) { /* just being told to finish up .. 
nothing much to do */ unplug_slaves(mddev); + + if (mddev->curr_resync < max_sector) /* aborted */ + bitmap_end_sync(mddev->bitmap, mddev->curr_resync, + &sync_blocks, 1); + else /* completed sync */ + conf->fullsync = 0; + bitmap_close_sync(mddev->bitmap); + return 0; } /* if there is 1 or more failed drives and we are trying @@ -1508,6 +1570,13 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i *skipped = 1; return rv; } + if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { + /* we can skip this block, and probably more */ + sync_blocks /= STRIPE_SECTORS; + *skipped = 1; + return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ + } x = sector_nr; chunk_offset = sector_div(x, sectors_per_chunk); @@ -1525,6 +1594,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(1); } + bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 0); spin_lock(&sh->lock); set_bit(STRIPE_SYNCING, &sh->state); clear_bit(STRIPE_INSYNC, &sh->state); @@ -1558,6 +1628,13 @@ static void raid5d (mddev_t *mddev) while (1) { struct list_head *first; + if (conf->seq_flush - conf->seq_write > 0) { + int seq = conf->seq_flush; + bitmap_unplug(mddev->bitmap); + conf->seq_write = seq; + activate_bit_delay(conf); + } + if (list_empty(&conf->handle_list) && atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && !blk_queue_plugged(mddev->queue) && @@ -1591,7 +1668,7 @@ static void raid5d (mddev_t *mddev) PRINTK("--- raid5d inactive\n"); } -static int run (mddev_t *mddev) +static int run(mddev_t *mddev) { raid5_conf_t *conf; int raid_disk, memory; @@ -1621,6 +1698,7 @@ static int run(mddev_t *mddev) init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); INIT_LIST_HEAD(&conf->delayed_list); + INIT_LIST_HEAD(&conf->bitmap_list); INIT_LIST_HEAD(&conf->inactive_list); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); @@ -1732,6 +1810,9 @@ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + /* Ok, everything is just fine now */ + if (mddev->bitmap) + mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; + mddev->queue->unplug_fn = raid5_unplug_device; mddev->queue->issue_flush_fn = raid5_issue_flush; @@ -1912,6 +1993,8 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) rdev->in_sync = 0; rdev->raid_disk = disk; found = 1; + if (rdev->saved_raid_disk != disk) + conf->fullsync = 1; p->rdev = rdev; break; } @@ -1941,6 +2024,35 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) return 0; } +static void raid5_quiesce(mddev_t *mddev, int state) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + + switch(state) { + case 1: /* stop all writes */ + spin_lock_irq(&conf->device_lock); + conf->quiesce = 1; + wait_event_lock_irq(conf->wait_for_stripe, + atomic_read(&conf->active_stripes) == 0, + conf->device_lock, /* nothing */); + spin_unlock_irq(&conf->device_lock); + break; + + case 0: /* re-enable writes */ + spin_lock_irq(&conf->device_lock); + conf->quiesce = 0; + wake_up(&conf->wait_for_stripe); + spin_unlock_irq(&conf->device_lock); + break; + } + if (mddev->thread) { + if (mddev->bitmap) + mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; + else + mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + md_wakeup_thread(mddev->thread); + } +} static mdk_personality_t raid5_personality= { .name = "raid5", @@ -1955,6
+2067,7 @@ static mdk_personality_t raid5_personality= .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, + .quiesce = raid5_quiesce, }; static int __init raid5_init (void) diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index d63ddcb4afad..176fc653c284 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -134,6 +134,7 @@ struct stripe_head { unsigned long state; /* state flags */ atomic_t count; /* nr of active thread/requests */ spinlock_t lock; + int bm_seq; /* sequence number for bitmap flushes */ struct r5dev { struct bio req; struct bio_vec vec; @@ -165,12 +166,13 @@ struct stripe_head { /* * Stripe state */ -#define STRIPE_ERROR 1 #define STRIPE_HANDLE 2 #define STRIPE_SYNCING 3 #define STRIPE_INSYNC 4 #define STRIPE_PREREAD_ACTIVE 5 #define STRIPE_DELAYED 6 +#define STRIPE_DEGRADED 7 +#define STRIPE_BIT_DELAY 8 /* * Plugging: @@ -210,10 +212,20 @@ struct raid5_private_data { struct list_head handle_list; /* stripes needing handling */ struct list_head delayed_list; /* stripes that have plugged requests */ + struct list_head bitmap_list; /* stripes delayed awaiting bitmap update */ atomic_t preread_active_stripes; /* stripes with scheduled io */ char cache_name[20]; kmem_cache_t *slab_cache; /* for allocating stripes */ + + int seq_flush, seq_write; + int quiesce; + + int fullsync; /* set to 1 if a full sync is needed, + * (fresh device added). + * Cleared when a sync completes. + */ + /* * Free stripes pool */ -- cgit v1.2.3
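Stepping back, the seq_flush/seq_write pair implements a small ordering protocol: a stripe whose write dirtied the bitmap records the current seq_write in bm_seq; the unplug path increments seq_flush; and raid5d, on seeing seq_flush ahead of seq_write, calls bitmap_unplug() to push the bits to disk, advances seq_write, and only then releases stripes whose recorded generation has been flushed. The data write can therefore never reach the disk before the bitmap bit that covers it. A compressed, single-threaded model of the invariant (the field names match the patch; the helper functions are invented for the demo, and locking and lists are omitted):

    #include <stdio.h>

    struct conf { int seq_flush, seq_write; };

    /* add_stripe_bio: record the bitmap generation this write depends on. */
    static int start_write(struct conf *c) { return c->seq_write; }

    /* raid5_unplug_device: ask for the current bitmap updates to be flushed. */
    static void unplug(struct conf *c) { c->seq_flush++; }

    /* raid5d: flush the bitmap, then dependent stripes may proceed. */
    static void raid5d_step(struct conf *c)
    {
        if (c->seq_flush - c->seq_write > 0) {
            int seq = c->seq_flush;
            /* bitmap_unplug() would run here: the bits reach disk */
            c->seq_write = seq;
        }
    }

    /* __release_stripe keeps a stripe on bitmap_list while this is false. */
    static int may_handle(const struct conf *c, int bm_seq)
    {
        return c->seq_write != bm_seq;
    }

    int main(void)
    {
        struct conf c = { 0, 0 };
        int bm_seq = start_write(&c);            /* depends on gen 0  */
        printf("%d\n", may_handle(&c, bm_seq));  /* 0: must wait      */
        unplug(&c);                              /* seq_flush -> 1    */
        raid5d_step(&c);                         /* flush; write -> 1 */
        printf("%d\n", may_handle(&c, bm_seq));  /* 1: safe to go     */
        return 0;
    }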