diff options
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- | drivers/md/raid5.c | 350 |
1 files changed, 220 insertions, 130 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b98765f6f77f..cd2f96b2c572 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -296,12 +296,9 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, BUG_ON(atomic_read(&conf->active_stripes)==0); if (test_bit(STRIPE_HANDLE, &sh->state)) { if (test_bit(STRIPE_DELAYED, &sh->state) && - !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) list_add_tail(&sh->lru, &conf->delayed_list); - if (atomic_read(&conf->preread_active_stripes) - < IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && sh->bm_seq - conf->seq_write > 0) list_add_tail(&sh->lru, &conf->bitmap_list); else { @@ -2898,31 +2895,102 @@ static int want_replace(struct stripe_head *sh, int disk_idx) * Returns 1 when no more member devices need to be checked, otherwise returns * 0 to tell the loop in handle_stripe_fill to continue */ -static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, - int disk_idx, int disks) + +static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) { struct r5dev *dev = &sh->dev[disk_idx]; struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], &sh->dev[s->failed_num[1]] }; + int i; + + + if (test_bit(R5_LOCKED, &dev->flags) || + test_bit(R5_UPTODATE, &dev->flags)) + /* No point reading this as we already have it or have + * decided to get it. + */ + return 0; + + if (dev->toread || + (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags))) + /* We need this block to directly satisfy a request */ + return 1; + + if (s->syncing || s->expanding || + (s->replacing && want_replace(sh, disk_idx))) + /* When syncing, or expanding we read everything. + * When replacing, we need the replaced block. + */ + return 1; + + if ((s->failed >= 1 && fdev[0]->toread) || + (s->failed >= 2 && fdev[1]->toread)) + /* If we want to read from a failed device, then + * we need to actually read every other device. + */ + return 1; + + /* Sometimes neither read-modify-write nor reconstruct-write + * cycles can work. In those cases we read every block we + * can. Then the parity-update is certain to have enough to + * work with. + * This can only be a problem when we need to write something, + * and some device has failed. If either of those tests + * fail we need look no further. + */ + if (!s->failed || !s->to_write) + return 0; + + if (test_bit(R5_Insync, &dev->flags) && + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + /* Pre-reads at not permitted until after short delay + * to gather multiple requests. However if this + * device is no Insync, the block could only be be computed + * and there is no need to delay that. + */ + return 0; + + for (i = 0; i < s->failed; i++) { + if (fdev[i]->towrite && + !test_bit(R5_UPTODATE, &fdev[i]->flags) && + !test_bit(R5_OVERWRITE, &fdev[i]->flags)) + /* If we have a partial write to a failed + * device, then we will need to reconstruct + * the content of that device, so all other + * devices must be read. + */ + return 1; + } + + /* If we are forced to do a reconstruct-write, either because + * the current RAID6 implementation only supports that, or + * or because parity cannot be trusted and we are currently + * recovering it, there is extra need to be careful. + * If one of the devices that we would need to read, because + * it is not being overwritten (and maybe not written at all) + * is missing/faulty, then we need to read everything we can. + */ + if (sh->raid_conf->level != 6 && + sh->sector < sh->raid_conf->mddev->recovery_cp) + /* reconstruct-write isn't being forced */ + return 0; + for (i = 0; i < s->failed; i++) { + if (!test_bit(R5_UPTODATE, &fdev[i]->flags) && + !test_bit(R5_OVERWRITE, &fdev[i]->flags)) + return 1; + } + + return 0; +} + +static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) +{ + struct r5dev *dev = &sh->dev[disk_idx]; /* is the data in this block needed, and can we get it? */ - if (!test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && - (dev->toread || - (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || - s->syncing || s->expanding || - (s->replacing && want_replace(sh, disk_idx)) || - (s->failed >= 1 && fdev[0]->toread) || - (s->failed >= 2 && fdev[1]->toread) || - (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && - (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) && - !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || - ((sh->raid_conf->level == 6 || - sh->sector >= sh->raid_conf->mddev->recovery_cp) - && s->failed && s->to_write && - (s->to_write - s->non_overwrite < - sh->raid_conf->raid_disks - sh->raid_conf->max_degraded) && - (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) { + if (need_this_block(sh, s, disk_idx, disks)) { /* we would like to get this block, possibly by computing it, * otherwise read it if the backing disk is insync */ @@ -3102,7 +3170,8 @@ static void handle_stripe_dirtying(struct r5conf *conf, * generate correct data from the parity. */ if (conf->max_degraded == 2 || - (recovery_cp < MaxSector && sh->sector >= recovery_cp)) { + (recovery_cp < MaxSector && sh->sector >= recovery_cp && + s->failed == 0)) { /* Calculate the real rcw later - for now make it * look like rcw is cheaper */ @@ -4081,7 +4150,7 @@ static void activate_bit_delay(struct r5conf *conf, } } -int md_raid5_congested(struct mddev *mddev, int bits) +static int raid5_congested(struct mddev *mddev, int bits) { struct r5conf *conf = mddev->private; @@ -4098,24 +4167,14 @@ int md_raid5_congested(struct mddev *mddev, int bits) return 0; } -EXPORT_SYMBOL_GPL(md_raid5_congested); - -static int raid5_congested(void *data, int bits) -{ - struct mddev *mddev = data; - - return mddev_congested(mddev, bits) || - md_raid5_congested(mddev, bits); -} /* We want read requests to align with chunks where possible, * but write requests don't need to. */ -static int raid5_mergeable_bvec(struct request_queue *q, +static int raid5_mergeable_bvec(struct mddev *mddev, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - struct mddev *mddev = q->queuedata; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_sectors; @@ -5062,12 +5121,17 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int schedule_timeout_uninterruptible(1); } /* Need to check if array will still be degraded after recovery/resync - * We don't need to check the 'failed' flag as when that gets set, - * recovery aborts. + * Note in case of > 1 drive failures it's possible we're rebuilding + * one drive while leaving another faulty drive in array. */ - for (i = 0; i < conf->raid_disks; i++) - if (conf->disks[i].rdev == NULL) + rcu_read_lock(); + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev); + + if (rdev == NULL || test_bit(Faulty, &rdev->flags)) still_degraded = 1; + } + rcu_read_unlock(); bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); @@ -5296,11 +5360,14 @@ static void raid5d(struct md_thread *thread) static ssize_t raid5_show_stripe_cache_size(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; if (conf) - return sprintf(page, "%d\n", conf->max_nr_stripes); - else - return 0; + ret = sprintf(page, "%d\n", conf->max_nr_stripes); + spin_unlock(&mddev->lock); + return ret; } int @@ -5339,21 +5406,25 @@ EXPORT_SYMBOL(raid5_set_cache_size); static ssize_t raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; unsigned long new; int err; if (len >= PAGE_SIZE) return -EINVAL; - if (!conf) - return -ENODEV; - if (kstrtoul(page, 10, &new)) return -EINVAL; - err = raid5_set_cache_size(mddev, new); + err = mddev_lock(mddev); if (err) return err; - return len; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else + err = raid5_set_cache_size(mddev, new); + mddev_unlock(mddev); + + return err ?: len; } static struct md_sysfs_entry @@ -5364,29 +5435,40 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, static ssize_t raid5_show_preread_threshold(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; if (conf) - return sprintf(page, "%d\n", conf->bypass_threshold); - else - return 0; + ret = sprintf(page, "%d\n", conf->bypass_threshold); + spin_unlock(&mddev->lock); + return ret; } static ssize_t raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; unsigned long new; + int err; + if (len >= PAGE_SIZE) return -EINVAL; - if (!conf) - return -ENODEV; - if (kstrtoul(page, 10, &new)) return -EINVAL; - if (new > conf->max_nr_stripes) - return -EINVAL; - conf->bypass_threshold = new; - return len; + + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else if (new > conf->max_nr_stripes) + err = -EINVAL; + else + conf->bypass_threshold = new; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry @@ -5398,39 +5480,48 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, static ssize_t raid5_show_skip_copy(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; if (conf) - return sprintf(page, "%d\n", conf->skip_copy); - else - return 0; + ret = sprintf(page, "%d\n", conf->skip_copy); + spin_unlock(&mddev->lock); + return ret; } static ssize_t raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; unsigned long new; + int err; + if (len >= PAGE_SIZE) return -EINVAL; - if (!conf) - return -ENODEV; - if (kstrtoul(page, 10, &new)) return -EINVAL; new = !!new; - if (new == conf->skip_copy) - return len; - mddev_suspend(mddev); - conf->skip_copy = new; - if (new) - mddev->queue->backing_dev_info.capabilities |= - BDI_CAP_STABLE_WRITES; - else - mddev->queue->backing_dev_info.capabilities &= - ~BDI_CAP_STABLE_WRITES; - mddev_resume(mddev); - return len; + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else if (new != conf->skip_copy) { + mddev_suspend(mddev); + conf->skip_copy = new; + if (new) + mddev->queue->backing_dev_info.capabilities |= + BDI_CAP_STABLE_WRITES; + else + mddev->queue->backing_dev_info.capabilities &= + ~BDI_CAP_STABLE_WRITES; + mddev_resume(mddev); + } + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry @@ -5454,11 +5545,14 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active); static ssize_t raid5_show_group_thread_cnt(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; if (conf) - return sprintf(page, "%d\n", conf->worker_cnt_per_group); - else - return 0; + ret = sprintf(page, "%d\n", conf->worker_cnt_per_group); + spin_unlock(&mddev->lock); + return ret; } static int alloc_thread_groups(struct r5conf *conf, int cnt, @@ -5468,7 +5562,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt, static ssize_t raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; unsigned long new; int err; struct r5worker_group *new_groups, *old_groups; @@ -5476,41 +5570,41 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) if (len >= PAGE_SIZE) return -EINVAL; - if (!conf) - return -ENODEV; - if (kstrtoul(page, 10, &new)) return -EINVAL; - if (new == conf->worker_cnt_per_group) - return len; - - mddev_suspend(mddev); + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else if (new != conf->worker_cnt_per_group) { + mddev_suspend(mddev); - old_groups = conf->worker_groups; - if (old_groups) - flush_workqueue(raid5_wq); + old_groups = conf->worker_groups; + if (old_groups) + flush_workqueue(raid5_wq); - err = alloc_thread_groups(conf, new, - &group_cnt, &worker_cnt_per_group, - &new_groups); - if (!err) { - spin_lock_irq(&conf->device_lock); - conf->group_cnt = group_cnt; - conf->worker_cnt_per_group = worker_cnt_per_group; - conf->worker_groups = new_groups; - spin_unlock_irq(&conf->device_lock); + err = alloc_thread_groups(conf, new, + &group_cnt, &worker_cnt_per_group, + &new_groups); + if (!err) { + spin_lock_irq(&conf->device_lock); + conf->group_cnt = group_cnt; + conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_groups = new_groups; + spin_unlock_irq(&conf->device_lock); - if (old_groups) - kfree(old_groups[0].workers); - kfree(old_groups); + if (old_groups) + kfree(old_groups[0].workers); + kfree(old_groups); + } + mddev_resume(mddev); } + mddev_unlock(mddev); - mddev_resume(mddev); - - if (err) - return err; - return len; + return err ?: len; } static struct md_sysfs_entry @@ -6178,11 +6272,6 @@ static int run(struct mddev *mddev) if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) mddev->queue->backing_dev_info.ra_pages = 2 * stripe; - blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); - - mddev->queue->backing_dev_info.congested_data = mddev; - mddev->queue->backing_dev_info.congested_fn = raid5_congested; - chunk_size = mddev->chunk_sectors << 9; blk_queue_io_min(mddev->queue, chunk_size); blk_queue_io_opt(mddev->queue, chunk_size * @@ -6260,17 +6349,12 @@ abort: return -EIO; } -static int stop(struct mddev *mddev) +static void raid5_free(struct mddev *mddev, void *priv) { - struct r5conf *conf = mddev->private; + struct r5conf *conf = priv; - md_unregister_thread(&mddev->thread); - if (mddev->queue) - mddev->queue->backing_dev_info.congested_fn = NULL; free_conf(conf); - mddev->private = NULL; mddev->to_remove = &raid5_attrs_group; - return 0; } static void status(struct seq_file *seq, struct mddev *mddev) @@ -7044,7 +7128,7 @@ static struct md_personality raid6_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid5_free, .status = status, .error_handler = error, .hot_add_disk = raid5_add_disk, @@ -7058,6 +7142,8 @@ static struct md_personality raid6_personality = .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid6_takeover, + .congested = raid5_congested, + .mergeable_bvec = raid5_mergeable_bvec, }; static struct md_personality raid5_personality = { @@ -7066,7 +7152,7 @@ static struct md_personality raid5_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid5_free, .status = status, .error_handler = error, .hot_add_disk = raid5_add_disk, @@ -7080,6 +7166,8 @@ static struct md_personality raid5_personality = .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid5_takeover, + .congested = raid5_congested, + .mergeable_bvec = raid5_mergeable_bvec, }; static struct md_personality raid4_personality = @@ -7089,7 +7177,7 @@ static struct md_personality raid4_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid5_free, .status = status, .error_handler = error, .hot_add_disk = raid5_add_disk, @@ -7103,6 +7191,8 @@ static struct md_personality raid4_personality = .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid4_takeover, + .congested = raid5_congested, + .mergeable_bvec = raid5_mergeable_bvec, }; static int __init raid5_init(void) |