Diffstat (limited to 'drivers/md')
-rw-r--r--   drivers/md/bitmap.c           |   4
-rw-r--r--   drivers/md/dm-cache-target.c  |  51
-rw-r--r--   drivers/md/dm-raid.c          | 111
-rw-r--r--   drivers/md/dm.c               |   5
-rw-r--r--   drivers/md/md.c               | 239
-rw-r--r--   drivers/md/md.h               |   1
-rw-r--r--   drivers/md/raid1.c            |   8
-rw-r--r--   drivers/md/raid10.c           |  24
-rw-r--r--   drivers/md/raid5.c            |  27
9 files changed, 344 insertions, 126 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 4fd9d6aeff6a..5a2c75499824 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -846,7 +846,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) set_bit(bit, kaddr); else - test_and_set_bit_le(bit, kaddr); + set_bit_le(bit, kaddr); kunmap_atomic(kaddr); pr_debug("set file bit %lu page %lu\n", bit, page->index); /* record page number so it gets flushed to disk when unplug occurs */ @@ -868,7 +868,7 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) clear_bit(bit, paddr); else - test_and_clear_bit_le(bit, paddr); + clear_bit_le(bit, paddr); kunmap_atomic(paddr); if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) { set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 66120bd46d15..10744091e6ca 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -6,6 +6,7 @@ #include "dm.h" #include "dm-bio-prison.h" +#include "dm-bio-record.h" #include "dm-cache-metadata.h" #include <linux/dm-io.h> @@ -201,10 +202,15 @@ struct per_bio_data { unsigned req_nr:2; struct dm_deferred_entry *all_io_entry; - /* writethrough fields */ + /* + * writethrough fields. These MUST remain at the end of this + * structure and the 'cache' member must be the first as it + * is used to determine the offsetof the writethrough fields. + */ struct cache *cache; dm_cblock_t cblock; bio_end_io_t *saved_bi_end_io; + struct dm_bio_details bio_details; }; struct dm_cache_migration { @@ -513,16 +519,28 @@ static void save_stats(struct cache *cache) /*---------------------------------------------------------------- * Per bio data *--------------------------------------------------------------*/ -static struct per_bio_data *get_per_bio_data(struct bio *bio) + +/* + * If using writeback, leave out struct per_bio_data's writethrough fields. + */ +#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) +#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) + +static size_t get_per_bio_data_size(struct cache *cache) +{ + return cache->features.write_through ? 
PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; +} + +static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) { - struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); + struct per_bio_data *pb = dm_per_bio_data(bio, data_size); BUG_ON(!pb); return pb; } -static struct per_bio_data *init_per_bio_data(struct bio *bio) +static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) { - struct per_bio_data *pb = get_per_bio_data(bio); + struct per_bio_data *pb = get_per_bio_data(bio, data_size); pb->tick = false; pb->req_nr = dm_bio_get_target_bio_nr(bio); @@ -556,7 +574,8 @@ static void remap_to_cache(struct cache *cache, struct bio *bio, static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) { unsigned long flags; - struct per_bio_data *pb = get_per_bio_data(bio); + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); spin_lock_irqsave(&cache->lock, flags); if (cache->need_tick_bio && @@ -635,7 +654,7 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio) static void writethrough_endio(struct bio *bio, int err) { - struct per_bio_data *pb = get_per_bio_data(bio); + struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); bio->bi_end_io = pb->saved_bi_end_io; if (err) { @@ -643,6 +662,7 @@ static void writethrough_endio(struct bio *bio, int err) return; } + dm_bio_restore(&pb->bio_details, bio); remap_to_cache(pb->cache, bio, pb->cblock); /* @@ -662,11 +682,12 @@ static void writethrough_endio(struct bio *bio, int err) static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, dm_oblock_t oblock, dm_cblock_t cblock) { - struct per_bio_data *pb = get_per_bio_data(bio); + struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); pb->cache = cache; pb->cblock = cblock; pb->saved_bi_end_io = bio->bi_end_io; + dm_bio_record(&pb->bio_details, bio); bio->bi_end_io = writethrough_endio; remap_to_origin_clear_discard(pb->cache, bio, oblock); @@ -1035,7 +1056,8 @@ static void defer_bio(struct cache *cache, struct bio *bio) static void process_flush_bio(struct cache *cache, struct bio *bio) { - struct per_bio_data *pb = get_per_bio_data(bio); + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); BUG_ON(bio->bi_size); if (!pb->req_nr) @@ -1107,7 +1129,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs, dm_oblock_t block = get_bio_block(cache, bio); struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell; struct policy_result lookup_result; - struct per_bio_data *pb = get_per_bio_data(bio); + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); bool discarded_block = is_discarded_oblock(cache, block); bool can_migrate = discarded_block || spare_migration_bandwidth(cache); @@ -1881,7 +1904,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) cache->ti = ca->ti; ti->private = cache; - ti->per_bio_data_size = sizeof(struct per_bio_data); ti->num_flush_bios = 2; ti->flush_supported = true; @@ -1890,6 +1912,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) ti->discard_zeroes_data_unsupported = true; memcpy(&cache->features, &ca->features, sizeof(cache->features)); + ti->per_bio_data_size = get_per_bio_data_size(cache); cache->callbacks.congested_fn = cache_is_congested; dm_table_add_target_callbacks(ti->table, 
&cache->callbacks); @@ -2092,6 +2115,7 @@ static int cache_map(struct dm_target *ti, struct bio *bio) int r; dm_oblock_t block = get_bio_block(cache, bio); + size_t pb_data_size = get_per_bio_data_size(cache); bool can_migrate = false; bool discarded_block; struct dm_bio_prison_cell *cell; @@ -2108,7 +2132,7 @@ static int cache_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } - pb = init_per_bio_data(bio); + pb = init_per_bio_data(bio, pb_data_size); if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) { defer_bio(cache, bio); @@ -2193,7 +2217,8 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) { struct cache *cache = ti->private; unsigned long flags; - struct per_bio_data *pb = get_per_bio_data(bio); + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); if (pb->tick) { policy_tick(cache->policy); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 311e3d35b272..1d3fe1a40a9b 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1279,6 +1279,31 @@ static int raid_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } +static const char *decipher_sync_action(struct mddev *mddev) +{ + if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + return "frozen"; + + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + return "reshape"; + + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + return "resync"; + else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + return "check"; + return "repair"; + } + + if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) + return "recover"; + } + + return "idle"; +} + static void raid_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { @@ -1298,8 +1323,18 @@ static void raid_status(struct dm_target *ti, status_type_t type, sync = rs->md.recovery_cp; if (sync >= rs->md.resync_max_sectors) { + /* + * Sync complete. + */ array_in_sync = 1; sync = rs->md.resync_max_sectors; + } else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) { + /* + * If "check" or "repair" is occurring, the array has + * undergone and initial sync and the health characters + * should not be 'a' anymore. + */ + array_in_sync = 1; } else { /* * The array may be doing an initial sync, or it may @@ -1311,6 +1346,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) array_in_sync = 1; } + /* * Status characters: * 'D' = Dead/Failed device @@ -1339,6 +1375,21 @@ static void raid_status(struct dm_target *ti, status_type_t type, (unsigned long long) sync, (unsigned long long) rs->md.resync_max_sectors); + /* + * Sync action: + * See Documentation/device-mapper/dm-raid.c for + * information on each of these states. + */ + DMEMIT(" %s", decipher_sync_action(&rs->md)); + + /* + * resync_mismatches/mismatch_cnt + * This field shows the number of discrepancies found when + * performing a "check" of the array. 
+ */ + DMEMIT(" %llu", + (unsigned long long) + atomic64_read(&rs->md.resync_mismatches)); break; case STATUSTYPE_TABLE: /* The string you would use to construct this array */ @@ -1425,7 +1476,62 @@ static void raid_status(struct dm_target *ti, status_type_t type, } } -static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) +static int raid_message(struct dm_target *ti, unsigned argc, char **argv) +{ + struct raid_set *rs = ti->private; + struct mddev *mddev = &rs->md; + + if (!strcasecmp(argv[0], "reshape")) { + DMERR("Reshape not supported."); + return -EINVAL; + } + + if (!mddev->pers || !mddev->pers->sync_request) + return -EINVAL; + + if (!strcasecmp(argv[0], "frozen")) + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + else + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + + if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_reap_sync_thread(mddev); + } + } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) + return -EBUSY; + else if (!strcasecmp(argv[0], "resync")) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + else if (!strcasecmp(argv[0], "recover")) { + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } else { + if (!strcasecmp(argv[0], "check")) + set_bit(MD_RECOVERY_CHECK, &mddev->recovery); + else if (!!strcasecmp(argv[0], "repair")) + return -EINVAL; + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + } + if (mddev->ro == 2) { + /* A write to sync_action is enough to justify + * canceling read-auto mode + */ + mddev->ro = 0; + if (!mddev->suspended) + md_wakeup_thread(mddev->sync_thread); + } + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + if (!mddev->suspended) + md_wakeup_thread(mddev->thread); + + return 0; +} + +static int raid_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) { struct raid_set *rs = ti->private; unsigned i; @@ -1482,12 +1588,13 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 4, 2}, + .version = {1, 5, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, .map = raid_map, .status = raid_status, + .message = raid_message, .iterate_devices = raid_iterate_devices, .io_hints = raid_io_hints, .presuspend = raid_presuspend, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7e469260fe5e..d5370a94b2c1 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -339,7 +339,7 @@ out: return md ? 
0 : -ENXIO; } -static int dm_blk_close(struct gendisk *disk, fmode_t mode) +static void dm_blk_close(struct gendisk *disk, fmode_t mode) { struct mapped_device *md = disk->private_data; @@ -349,8 +349,6 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode) dm_put(md); spin_unlock(&_minor_lock); - - return 0; } int dm_open_count(struct mapped_device *md) @@ -611,6 +609,7 @@ static void dec_pending(struct dm_io *io, int error) queue_io(md, bio); } else { /* done with normal IO or empty flush */ + trace_block_bio_complete(md->queue, bio, io_error); bio_endio(bio, io_error); } } diff --git a/drivers/md/md.c b/drivers/md/md.c index 1d03ebde40b5..681d1099a2d5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -72,6 +72,9 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; static struct workqueue_struct *md_misc_wq; +static int remove_and_add_spares(struct mddev *mddev, + struct md_rdev *this); + #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } /* @@ -1555,8 +1558,8 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ sector, count, 1) == 0) return -EINVAL; } - } else if (sb->bblog_offset == 0) - rdev->badblocks.shift = -1; + } else if (sb->bblog_offset != 0) + rdev->badblocks.shift = 0; if (!refdev) { ret = 1; @@ -2402,6 +2405,11 @@ static void md_update_sb(struct mddev * mddev, int force_change) int nospares = 0; int any_badblocks_changed = 0; + if (mddev->ro) { + if (force_change) + set_bit(MD_CHANGE_DEVS, &mddev->flags); + return; + } repeat: /* First make sure individual recovery_offsets are correct */ rdev_for_each(rdev, mddev) { @@ -2791,12 +2799,10 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) /* personality does all needed checks */ if (rdev->mddev->pers->hot_remove_disk == NULL) return -EINVAL; - err = rdev->mddev->pers-> - hot_remove_disk(rdev->mddev, rdev); - if (err) - return err; - sysfs_unlink_rdev(rdev->mddev, rdev); - rdev->raid_disk = -1; + clear_bit(Blocked, &rdev->flags); + remove_and_add_spares(rdev->mddev, rdev); + if (rdev->raid_disk >= 0) + return -EBUSY; set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); md_wakeup_thread(rdev->mddev->thread); } else if (rdev->mddev->pers) { @@ -3212,7 +3218,7 @@ int md_rdev_init(struct md_rdev *rdev) * be used - I wonder if that matters */ rdev->badblocks.count = 0; - rdev->badblocks.shift = 0; + rdev->badblocks.shift = -1; /* disabled until explicitly enabled */ rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); seqlock_init(&rdev->badblocks.lock); if (rdev->badblocks.page == NULL) @@ -3284,9 +3290,6 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe goto abort_free; } } - if (super_format == -1) - /* hot-add for 0.90, or non-persistent: so no badblocks */ - rdev->badblocks.shift = -1; return rdev; @@ -4216,8 +4219,6 @@ action_show(struct mddev *mddev, char *page) return sprintf(page, "%s\n", type); } -static void reap_sync_thread(struct mddev *mddev); - static ssize_t action_store(struct mddev *mddev, const char *page, size_t len) { @@ -4232,7 +4233,7 @@ action_store(struct mddev *mddev, const char *page, size_t len) if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - reap_sync_thread(mddev); + md_reap_sync_thread(mddev); } } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) @@ -5270,7 +5271,7 @@ static void 
__md_stop_writes(struct mddev *mddev) if (mddev->sync_thread) { set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery); - reap_sync_thread(mddev); + md_reap_sync_thread(mddev); } del_timer_sync(&mddev->safemode_timer); @@ -5278,7 +5279,8 @@ static void __md_stop_writes(struct mddev *mddev) bitmap_flush(mddev); md_super_wait(mddev); - if (!mddev->in_sync || mddev->flags) { + if (mddev->ro == 0 && + (!mddev->in_sync || mddev->flags)) { /* mark array as shutdown cleanly */ mddev->in_sync = 1; md_update_sb(mddev, 1); @@ -5801,7 +5803,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) else sysfs_notify_dirent_safe(rdev->sysfs_state); - md_update_sb(mddev, 1); + set_bit(MD_CHANGE_DEVS, &mddev->flags); if (mddev->degraded) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -5868,6 +5870,9 @@ static int hot_remove_disk(struct mddev * mddev, dev_t dev) if (!rdev) return -ENXIO; + clear_bit(Blocked, &rdev->flags); + remove_and_add_spares(mddev, rdev); + if (rdev->raid_disk >= 0) goto busy; @@ -6481,6 +6486,28 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = md_set_readonly(mddev, bdev); goto done_unlock; + case HOT_REMOVE_DISK: + err = hot_remove_disk(mddev, new_decode_dev(arg)); + goto done_unlock; + + case ADD_NEW_DISK: + /* We can support ADD_NEW_DISK on read-only arrays + * on if we are re-adding a preexisting device. + * So require mddev->pers and MD_DISK_SYNC. + */ + if (mddev->pers) { + mdu_disk_info_t info; + if (copy_from_user(&info, argp, sizeof(info))) + err = -EFAULT; + else if (!(info.state & (1<<MD_DISK_SYNC))) + /* Need to clear read-only for this */ + break; + else + err = add_new_disk(mddev, &info); + goto done_unlock; + } + break; + case BLKROSET: if (get_user(ro, (int __user *)(arg))) { err = -EFAULT; @@ -6551,10 +6578,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto done_unlock; } - case HOT_REMOVE_DISK: - err = hot_remove_disk(mddev, new_decode_dev(arg)); - goto done_unlock; - case HOT_ADD_DISK: err = hot_add_disk(mddev, new_decode_dev(arg)); goto done_unlock; @@ -6642,15 +6665,13 @@ static int md_open(struct block_device *bdev, fmode_t mode) return err; } -static int md_release(struct gendisk *disk, fmode_t mode) +static void md_release(struct gendisk *disk, fmode_t mode) { struct mddev *mddev = disk->private_data; BUG_ON(!mddev); atomic_dec(&mddev->openers); mddev_put(mddev); - - return 0; } static int md_media_changed(struct gendisk *disk) @@ -7635,14 +7656,16 @@ void md_do_sync(struct md_thread *thread) } EXPORT_SYMBOL_GPL(md_do_sync); -static int remove_and_add_spares(struct mddev *mddev) +static int remove_and_add_spares(struct mddev *mddev, + struct md_rdev *this) { struct md_rdev *rdev; int spares = 0; int removed = 0; rdev_for_each(rdev, mddev) - if (rdev->raid_disk >= 0 && + if ((this == NULL || rdev == this) && + rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && (test_bit(Faulty, &rdev->flags) || ! 
test_bit(In_sync, &rdev->flags)) && @@ -7657,74 +7680,52 @@ static int remove_and_add_spares(struct mddev *mddev) if (removed && mddev->kobj.sd) sysfs_notify(&mddev->kobj, NULL, "degraded"); + if (this) + goto no_add; + rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) spares++; - if (rdev->raid_disk < 0 - && !test_bit(Faulty, &rdev->flags)) { - rdev->recovery_offset = 0; - if (mddev->pers-> - hot_add_disk(mddev, rdev) == 0) { - if (sysfs_link_rdev(mddev, rdev)) - /* failure here is OK */; - spares++; - md_new_event(mddev); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - } + if (rdev->raid_disk >= 0) + continue; + if (test_bit(Faulty, &rdev->flags)) + continue; + if (mddev->ro && + rdev->saved_raid_disk < 0) + continue; + + rdev->recovery_offset = 0; + if (rdev->saved_raid_disk >= 0 && mddev->in_sync) { + spin_lock_irq(&mddev->write_lock); + if (mddev->in_sync) + /* OK, this device, which is in_sync, + * will definitely be noticed before + * the next write, so recovery isn't + * needed. + */ + rdev->recovery_offset = mddev->recovery_cp; + spin_unlock_irq(&mddev->write_lock); + } + if (mddev->ro && rdev->recovery_offset != MaxSector) + /* not safe to add this disk now */ + continue; + if (mddev->pers-> + hot_add_disk(mddev, rdev) == 0) { + if (sysfs_link_rdev(mddev, rdev)) + /* failure here is OK */; + spares++; + md_new_event(mddev); + set_bit(MD_CHANGE_DEVS, &mddev->flags); } } +no_add: if (removed) set_bit(MD_CHANGE_DEVS, &mddev->flags); return spares; } -static void reap_sync_thread(struct mddev *mddev) -{ - struct md_rdev *rdev; - - /* resync has finished, collect result */ - md_unregister_thread(&mddev->sync_thread); - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { - /* success...*/ - /* activate any spares */ - if (mddev->pers->spare_active(mddev)) { - sysfs_notify(&mddev->kobj, NULL, - "degraded"); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - } - } - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - mddev->pers->finish_reshape) - mddev->pers->finish_reshape(mddev); - - /* If array is no-longer degraded, then any saved_raid_disk - * information must be scrapped. Also if any device is now - * In_sync we must scrape the saved_raid_disk for that device - * do the superblock for an incrementally recovered device - * written out. - */ - rdev_for_each(rdev, mddev) - if (!mddev->degraded || - test_bit(In_sync, &rdev->flags)) - rdev->saved_raid_disk = -1; - - md_update_sb(mddev, 1); - clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); - /* flag recovery needed just to double check */ - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - sysfs_notify_dirent_safe(mddev->sysfs_action); - md_new_event(mddev); - if (mddev->event_work.func) - queue_work(md_misc_wq, &mddev->event_work); -} - /* * This routine is regularly called by all per-raid-array threads to * deal with generic issues like resync and super-block update. @@ -7780,22 +7781,16 @@ void md_check_recovery(struct mddev *mddev) int spares = 0; if (mddev->ro) { - /* Only thing we do on a ro array is remove - * failed devices. + /* On a read-only array we can: + * - remove failed devices + * - add already-in_sync devices if the array itself + * is in-sync. 
+ * As we only add devices that are already in-sync, + * we can activate the spares immediately. */ - struct md_rdev *rdev; - rdev_for_each(rdev, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(Blocked, &rdev->flags) && - test_bit(Faulty, &rdev->flags) && - atomic_read(&rdev->nr_pending)==0) { - if (mddev->pers->hot_remove_disk( - mddev, rdev) == 0) { - sysfs_unlink_rdev(mddev, rdev); - rdev->raid_disk = -1; - } - } clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + remove_and_add_spares(mddev, NULL); + mddev->pers->spare_active(mddev); goto unlock; } @@ -7827,7 +7822,7 @@ void md_check_recovery(struct mddev *mddev) goto unlock; } if (mddev->sync_thread) { - reap_sync_thread(mddev); + md_reap_sync_thread(mddev); goto unlock; } /* Set RUNNING before clearing NEEDED to avoid @@ -7858,7 +7853,7 @@ void md_check_recovery(struct mddev *mddev) goto unlock; set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - } else if ((spares = remove_and_add_spares(mddev))) { + } else if ((spares = remove_and_add_spares(mddev, NULL))) { clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); @@ -7908,6 +7903,51 @@ void md_check_recovery(struct mddev *mddev) } } +void md_reap_sync_thread(struct mddev *mddev) +{ + struct md_rdev *rdev; + + /* resync has finished, collect result */ + md_unregister_thread(&mddev->sync_thread); + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + /* success...*/ + /* activate any spares */ + if (mddev->pers->spare_active(mddev)) { + sysfs_notify(&mddev->kobj, NULL, + "degraded"); + set_bit(MD_CHANGE_DEVS, &mddev->flags); + } + } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + mddev->pers->finish_reshape) + mddev->pers->finish_reshape(mddev); + + /* If array is no-longer degraded, then any saved_raid_disk + * information must be scrapped. Also if any device is now + * In_sync we must scrape the saved_raid_disk for that device + * do the superblock for an incrementally recovered device + * written out. 
+ */ + rdev_for_each(rdev, mddev) + if (!mddev->degraded || + test_bit(In_sync, &rdev->flags)) + rdev->saved_raid_disk = -1; + + md_update_sb(mddev, 1); + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + /* flag recovery needed just to double check */ + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_action); + md_new_event(mddev); + if (mddev->event_work.func) + queue_work(md_misc_wq, &mddev->event_work); +} + void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) { sysfs_notify_dirent_safe(rdev->sysfs_state); @@ -8633,6 +8673,7 @@ EXPORT_SYMBOL(md_register_thread); EXPORT_SYMBOL(md_unregister_thread); EXPORT_SYMBOL(md_wakeup_thread); EXPORT_SYMBOL(md_check_recovery); +EXPORT_SYMBOL(md_reap_sync_thread); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MD RAID framework"); MODULE_ALIAS("md"); diff --git a/drivers/md/md.h b/drivers/md/md.h index d90fb1a879e1..653f992b687a 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -567,6 +567,7 @@ extern struct md_thread *md_register_thread( extern void md_unregister_thread(struct md_thread **threadp); extern void md_wakeup_thread(struct md_thread *thread); extern void md_check_recovery(struct mddev *mddev); +extern void md_reap_sync_thread(struct mddev *mddev); extern void md_write_start(struct mddev *mddev, struct bio *bi); extern void md_write_end(struct mddev *mddev); extern void md_done_sync(struct mddev *mddev, int blocks, int ok); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index aeb4e3f74791..55951182af73 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -971,7 +971,12 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + /* Just ignore it */ + bio_endio(bio, 0); + else + generic_make_request(bio); bio = next; } kfree(plug); @@ -2854,6 +2859,7 @@ static int stop(struct mddev *mddev) if (conf->r1bio_pool) mempool_destroy(conf->r1bio_pool); kfree(conf->mirrors); + safe_put_page(conf->tmppage); kfree(conf->poolinfo); kfree(conf); mddev->private = NULL; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e32e8b1042f8..59d4daa5f4c7 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1133,7 +1133,12 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + /* Just ignore it */ + bio_endio(bio, 0); + else + generic_make_request(bio); bio = next; } kfree(plug); @@ -2888,6 +2893,22 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, if (init_resync(conf)) return 0; + /* + * Allow skipping a full rebuild for incremental assembly + * of a clean array, like RAID1 does. 
+ */ + if (mddev->bitmap == NULL && + mddev->recovery_cp == MaxSector && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && + conf->fullsync == 0) { + *skipped = 1; + max_sector = mddev->dev_sectors; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + max_sector = mddev->resync_max_sectors; + return max_sector - sector_nr; + } + skipped: max_sector = mddev->dev_sectors; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || @@ -3778,6 +3799,7 @@ static int stop(struct mddev *mddev) if (conf->r10bio_pool) mempool_destroy(conf->r10bio_pool); + safe_put_page(conf->tmppage); kfree(conf->mirrors); kfree(conf); mddev->private = NULL; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2fefb9f2198e..9359828ffe26 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -184,6 +184,8 @@ static void return_io(struct bio *return_bi) return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; + trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), + bi, 0); bio_endio(bi, 0); bi = return_bi; } @@ -1884,8 +1886,15 @@ static void raid5_end_write_request(struct bio *bi, int error) &rdev->mddev->recovery); } else if (is_badblock(rdev, sh->sector, STRIPE_SECTORS, - &first_bad, &bad_sectors)) + &first_bad, &bad_sectors)) { set_bit(R5_MadeGood, &sh->dev[i].flags); + if (test_bit(R5_ReadError, &sh->dev[i].flags)) + /* That was a successful write so make + * sure it looks like we already did + * a re-write. + */ + set_bit(R5_ReWrite, &sh->dev[i].flags); + } } rdev_dec_pending(rdev, conf->mddev); @@ -3913,6 +3922,8 @@ static void raid5_align_endio(struct bio *bi, int error) rdev_dec_pending(rdev, conf->mddev); if (!error && uptodate) { + trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), + raid_bi, 0); bio_endio(raid_bi, 0); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); @@ -4381,6 +4392,8 @@ static void make_request(struct mddev *mddev, struct bio * bi) if ( rw == WRITE ) md_write_end(mddev); + trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), + bi, 0); bio_endio(bi, 0); } } @@ -4665,9 +4678,10 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped = 1; return rv; } - if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && - !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && - !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { + if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && + !conf->fullsync && + !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && + sync_blocks >= STRIPE_SECTORS) { /* we can skip this block, and probably more */ sync_blocks /= STRIPE_SECTORS; *skipped = 1; @@ -4757,8 +4771,11 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) handled++; } remaining = raid5_dec_bi_active_stripes(raid_bio); - if (remaining == 0) + if (remaining == 0) { + trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), + raid_bio, 0); bio_endio(raid_bio, 0); + } if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); return handled; |
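The dm-cache-target.c hunks above size the per-bio data differently for writeback and writethrough caches by keeping the writethrough-only members at the end of the structure and cutting the allocation off at offsetof(). A minimal stand-alone sketch of that layout trick follows; the structure members and helper name here are simplified stand-ins, not the driver's real fields.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Simplified stand-in for dm-cache's per_bio_data: the writethrough-only
 * members are kept at the end, and 'cache' is the first of them, so
 * offsetof(..., cache) gives the size the writeback path needs.
 */
struct per_bio_data {
	bool tick;
	unsigned req_nr;
	/* writethrough-only fields; must stay last */
	void *cache;
	unsigned cblock;
};

#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))

static size_t per_bio_data_size(bool write_through)
{
	return write_through ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
}

int main(void)
{
	printf("writeback: %zu bytes, writethrough: %zu bytes\n",
	       per_bio_data_size(false), per_bio_data_size(true));
	return 0;
}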
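The new decipher_sync_action() helper in dm-raid.c walks the MD_RECOVERY_* bits to pick the sync-action string reported in the status output. The same decision tree, re-expressed as a stand-alone function over plain flag bits so it can be exercised outside the kernel; the flag encoding below is illustrative only, not the kernel's bit values.

#include <stdio.h>

enum {
	FROZEN    = 1 << 0,
	RUNNING   = 1 << 1,
	NEEDED    = 1 << 2,
	RESHAPE   = 1 << 3,
	SYNC      = 1 << 4,
	REQUESTED = 1 << 5,
	CHECK     = 1 << 6,
	RECOVER   = 1 << 7,
};

/* Mirrors the ordering of checks in decipher_sync_action(): frozen wins,
 * then reshape/resync/check/repair/recover while recovery is active,
 * otherwise the array is idle.
 */
static const char *sync_action(unsigned flags, int ro)
{
	if (flags & FROZEN)
		return "frozen";
	if ((flags & RUNNING) || (!ro && (flags & NEEDED))) {
		if (flags & RESHAPE)
			return "reshape";
		if (flags & SYNC) {
			if (!(flags & REQUESTED))
				return "resync";
			if (flags & CHECK)
				return "check";
			return "repair";
		}
		if (flags & RECOVER)
			return "recover";
	}
	return "idle";
}

int main(void)
{
	/* A user-requested scrub: RUNNING+SYNC+REQUESTED+CHECK -> "check" */
	printf("%s\n", sync_action(RUNNING | SYNC | REQUESTED | CHECK, 0));
	return 0;
}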