diff options
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r-- | drivers/md/md.c | 542 |
1 files changed, 308 insertions, 234 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index ef859ccb0366..10670c62b09e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -79,16 +79,10 @@ static const char *action_name[NR_SYNC_ACTIONS] = { [ACTION_IDLE] = "idle", }; -/* pers_list is a list of registered personalities protected by pers_lock. */ -static LIST_HEAD(pers_list); -static DEFINE_SPINLOCK(pers_lock); +static DEFINE_XARRAY(md_submodule); static const struct kobj_type md_ktype; -const struct md_cluster_operations *md_cluster_ops; -EXPORT_SYMBOL(md_cluster_ops); -static struct module *md_cluster_mod; - static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; @@ -117,32 +111,48 @@ static void md_wakeup_thread_directly(struct md_thread __rcu *thread); /* Default safemode delay: 200 msec */ #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1) /* - * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' - * is 1000 KB/sec, so the extra system load does not show up that much. - * Increase it if you want to have more _guaranteed_ speed. Note that - * the RAID driver will use the maximum available bandwidth if the IO - * subsystem is idle. There is also an 'absolute maximum' reconstruction - * speed limit - in case reconstruction slows down your system despite - * idle IO detection. + * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit' + * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load + * does not show up that much. Increase it if you want to have more guaranteed + * speed. Note that the RAID driver will use the maximum bandwidth + * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle. + * + * Background sync IO speed control: + * + * - below speed min: + * no limit; + * - above speed min and below speed max: + * a) if mddev is idle, then no limit; + * b) if mddev is busy handling normal IO, then limit inflight sync IO + * to sync_io_depth; + * - above speed max: + * sync IO can't be issued; * - * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. - * or /sys/block/mdX/md/sync_speed_{min,max} + * Following configurations can be changed via /proc/sys/dev/raid/ for system + * or /sys/block/mdX/md/ for one array. */ - static int sysctl_speed_limit_min = 1000; static int sysctl_speed_limit_max = 200000; -static inline int speed_min(struct mddev *mddev) +static int sysctl_sync_io_depth = 32; + +static int speed_min(struct mddev *mddev) { return mddev->sync_speed_min ? mddev->sync_speed_min : sysctl_speed_limit_min; } -static inline int speed_max(struct mddev *mddev) +static int speed_max(struct mddev *mddev) { return mddev->sync_speed_max ? mddev->sync_speed_max : sysctl_speed_limit_max; } +static int sync_io_depth(struct mddev *mddev) +{ + return mddev->sync_io_depth ? + mddev->sync_io_depth : sysctl_sync_io_depth; +} + static void rdev_uninit_serial(struct md_rdev *rdev) { if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) @@ -299,14 +309,21 @@ static const struct ctl_table raid_table[] = { .procname = "speed_limit_min", .data = &sysctl_speed_limit_min, .maxlen = sizeof(int), - .mode = S_IRUGO|S_IWUSR, + .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "speed_limit_max", .data = &sysctl_speed_limit_max, .maxlen = sizeof(int), - .mode = S_IRUGO|S_IWUSR, + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sync_io_depth", + .data = &sysctl_sync_io_depth, + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = proc_dointvec, }, }; @@ -894,16 +911,40 @@ struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) } EXPORT_SYMBOL_GPL(md_find_rdev_rcu); -static struct md_personality *find_pers(int level, char *clevel) +static struct md_personality *get_pers(int level, char *clevel) { - struct md_personality *pers; - list_for_each_entry(pers, &pers_list, list) { - if (level != LEVEL_NONE && pers->level == level) - return pers; - if (strcmp(pers->name, clevel)==0) - return pers; + struct md_personality *ret = NULL; + struct md_submodule_head *head; + unsigned long i; + + xa_lock(&md_submodule); + xa_for_each(&md_submodule, i, head) { + if (head->type != MD_PERSONALITY) + continue; + if ((level != LEVEL_NONE && head->id == level) || + !strcmp(head->name, clevel)) { + if (try_module_get(head->owner)) + ret = (void *)head; + break; + } } - return NULL; + xa_unlock(&md_submodule); + + if (!ret) { + if (level != LEVEL_NONE) + pr_warn("md: personality for level %d is not loaded!\n", + level); + else + pr_warn("md: personality for level %s is not loaded!\n", + clevel); + } + + return ret; +} + +static void put_pers(struct md_personality *pers) +{ + module_put(pers->head.owner); } /* return the offset of the super block in 512byte sectors */ @@ -1186,7 +1227,7 @@ int md_check_no_bitmap(struct mddev *mddev) if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) return 0; pr_warn("%s: bitmaps are not supported for %s\n", - mdname(mddev), mddev->pers->name); + mdname(mddev), mddev->pers->head.name); return 1; } EXPORT_SYMBOL(md_check_no_bitmap); @@ -2365,19 +2406,6 @@ int md_integrity_register(struct mddev *mddev) return 0; /* shouldn't register */ pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); - if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || - (mddev->level != 1 && mddev->level != 10 && - bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { - /* - * No need to handle the failure of bioset_integrity_create, - * because the function is called by md_run() -> pers->run(), - * md_run calls bioset_exit -> bioset_integrity_free in case - * of failure case. - */ - pr_err("md: failed to create integrity pool for %s\n", - mdname(mddev)); - return -EINVAL; - } return 0; } EXPORT_SYMBOL(md_integrity_register); @@ -2645,11 +2673,11 @@ repeat: force_change = 1; if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) nospares = 1; - ret = md_cluster_ops->metadata_update_start(mddev); + ret = mddev->cluster_ops->metadata_update_start(mddev); /* Has someone else has updated the sb */ if (!does_sb_need_changing(mddev)) { if (ret == 0) - md_cluster_ops->metadata_update_cancel(mddev); + mddev->cluster_ops->metadata_update_cancel(mddev); bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)); @@ -2789,7 +2817,7 @@ rewrite: /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ if (mddev_is_clustered(mddev) && ret == 0) - md_cluster_ops->metadata_update_finish(mddev); + mddev->cluster_ops->metadata_update_finish(mddev); if (mddev->in_sync != sync_req || !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), @@ -2948,7 +2976,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) else { err = 0; if (mddev_is_clustered(mddev)) - err = md_cluster_ops->remove_disk(mddev, rdev); + err = mddev->cluster_ops->remove_disk(mddev, rdev); if (err == 0) { md_kick_rdev_from_array(rdev); @@ -3058,7 +3086,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) * by this node eventually */ if (!mddev_is_clustered(rdev->mddev) || - (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { + (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) { clear_bit(Faulty, &rdev->flags); err = add_bound_rdev(rdev); } @@ -3866,7 +3894,7 @@ level_show(struct mddev *mddev, char *page) spin_lock(&mddev->lock); p = mddev->pers; if (p) - ret = sprintf(page, "%s\n", p->name); + ret = sprintf(page, "%s\n", p->head.name); else if (mddev->clevel[0]) ret = sprintf(page, "%s\n", mddev->clevel); else if (mddev->level != LEVEL_NONE) @@ -3923,7 +3951,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) rv = -EINVAL; if (!mddev->pers->quiesce) { pr_warn("md: %s: %s does not support online personality change\n", - mdname(mddev), mddev->pers->name); + mdname(mddev), mddev->pers->head.name); goto out_unlock; } @@ -3937,24 +3965,20 @@ level_store(struct mddev *mddev, const char *buf, size_t len) if (request_module("md-%s", clevel) != 0) request_module("md-level-%s", clevel); - spin_lock(&pers_lock); - pers = find_pers(level, clevel); - if (!pers || !try_module_get(pers->owner)) { - spin_unlock(&pers_lock); - pr_warn("md: personality %s not loaded\n", clevel); + pers = get_pers(level, clevel); + if (!pers) { rv = -EINVAL; goto out_unlock; } - spin_unlock(&pers_lock); if (pers == mddev->pers) { /* Nothing to do! */ - module_put(pers->owner); + put_pers(pers); rv = len; goto out_unlock; } if (!pers->takeover) { - module_put(pers->owner); + put_pers(pers); pr_warn("md: %s: %s does not support personality takeover\n", mdname(mddev), clevel); rv = -EINVAL; @@ -3975,7 +3999,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->raid_disks -= mddev->delta_disks; mddev->delta_disks = 0; mddev->reshape_backwards = 0; - module_put(pers->owner); + put_pers(pers); pr_warn("md: %s: %s would not accept array\n", mdname(mddev), clevel); rv = PTR_ERR(priv); @@ -3990,7 +4014,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) oldpriv = mddev->private; mddev->pers = pers; mddev->private = priv; - strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); mddev->level = mddev->new_level; mddev->layout = mddev->new_layout; mddev->chunk_sectors = mddev->new_chunk_sectors; @@ -4032,7 +4056,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->to_remove = &md_redundancy_group; } - module_put(oldpers->owner); + put_pers(oldpers); rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) @@ -4063,7 +4087,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) * it must always be in_sync */ mddev->in_sync = 1; - del_timer_sync(&mddev->safemode_timer); + timer_delete_sync(&mddev->safemode_timer); } pers->run(mddev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); @@ -5090,7 +5114,7 @@ static ssize_t sync_min_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_min(mddev), - mddev->sync_speed_min ? "local": "system"); + mddev->sync_speed_min ? "local" : "system"); } static ssize_t @@ -5099,7 +5123,7 @@ sync_min_store(struct mddev *mddev, const char *buf, size_t len) unsigned int min; int rv; - if (strncmp(buf, "system", 6)==0) { + if (strncmp(buf, "system", 6) == 0) { min = 0; } else { rv = kstrtouint(buf, 10, &min); @@ -5119,7 +5143,7 @@ static ssize_t sync_max_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_max(mddev), - mddev->sync_speed_max ? "local": "system"); + mddev->sync_speed_max ? "local" : "system"); } static ssize_t @@ -5128,7 +5152,7 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len) unsigned int max; int rv; - if (strncmp(buf, "system", 6)==0) { + if (strncmp(buf, "system", 6) == 0) { max = 0; } else { rv = kstrtouint(buf, 10, &max); @@ -5145,6 +5169,35 @@ static struct md_sysfs_entry md_sync_max = __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); static ssize_t +sync_io_depth_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d (%s)\n", sync_io_depth(mddev), + mddev->sync_io_depth ? "local" : "system"); +} + +static ssize_t +sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned int max; + int rv; + + if (strncmp(buf, "system", 6) == 0) { + max = 0; + } else { + rv = kstrtouint(buf, 10, &max); + if (rv < 0) + return rv; + if (max == 0) + return -EINVAL; + } + mddev->sync_io_depth = max; + return len; +} + +static struct md_sysfs_entry md_sync_io_depth = +__ATTR_RW(sync_io_depth); + +static ssize_t degraded_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->degraded); @@ -5590,7 +5643,7 @@ __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, static ssize_t serialize_policy_show(struct mddev *mddev, char *page) { - if (mddev->pers == NULL || (mddev->pers->level != 1)) + if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) return sprintf(page, "n/a\n"); else return sprintf(page, "%d\n", mddev->serialize_policy); @@ -5616,7 +5669,7 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) err = mddev_suspend_and_lock(mddev); if (err) return err; - if (mddev->pers == NULL || (mddev->pers->level != 1)) { + if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) { pr_err("md: serialize_policy is only effective for raid1\n"); err = -EINVAL; goto unlock; @@ -5670,6 +5723,7 @@ static struct attribute *md_redundancy_attrs[] = { &md_mismatches.attr, &md_sync_min.attr, &md_sync_max.attr, + &md_sync_io_depth.attr, &md_sync_speed.attr, &md_sync_force_parallel.attr, &md_sync_completed.attr, @@ -5994,7 +6048,7 @@ static int add_named_array(const char *val, const struct kernel_param *kp) static void md_safemode_timeout(struct timer_list *t) { - struct mddev *mddev = from_timer(mddev, t, safemode_timer); + struct mddev *mddev = timer_container_of(mddev, t, safemode_timer); mddev->safemode = 1; if (mddev->external) @@ -6102,30 +6156,21 @@ int md_run(struct mddev *mddev) goto exit_sync_set; } - spin_lock(&pers_lock); - pers = find_pers(mddev->level, mddev->clevel); - if (!pers || !try_module_get(pers->owner)) { - spin_unlock(&pers_lock); - if (mddev->level != LEVEL_NONE) - pr_warn("md: personality for level %d is not loaded!\n", - mddev->level); - else - pr_warn("md: personality for level %s is not loaded!\n", - mddev->clevel); + pers = get_pers(mddev->level, mddev->clevel); + if (!pers) { err = -EINVAL; goto abort; } - spin_unlock(&pers_lock); - if (mddev->level != pers->level) { - mddev->level = pers->level; - mddev->new_level = pers->level; + if (mddev->level != pers->head.id) { + mddev->level = pers->head.id; + mddev->new_level = pers->head.id; } - strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ - module_put(pers->owner); + put_pers(pers); err = -EINVAL; goto abort; } @@ -6180,7 +6225,7 @@ int md_run(struct mddev *mddev) } if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { - err = mddev->bitmap_ops->create(mddev, -1); + err = mddev->bitmap_ops->create(mddev); if (err) pr_warn("%s: failed to create bitmap (%d)\n", mdname(mddev), err); @@ -6252,7 +6297,7 @@ bitmap_abort: if (mddev->private) pers->free(mddev, mddev->private); mddev->private = NULL; - module_put(pers->owner); + put_pers(pers); mddev->bitmap_ops->destroy(mddev); abort: bioset_exit(&mddev->io_clone_set); @@ -6413,7 +6458,7 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { - del_timer_sync(&mddev->safemode_timer); + timer_delete_sync(&mddev->safemode_timer); if (mddev->pers && mddev->pers->quiesce) { mddev->pers->quiesce(mddev, 1); @@ -6473,7 +6518,7 @@ static void __md_stop(struct mddev *mddev) mddev->private = NULL; if (pers->sync_request && mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; - module_put(pers->owner); + put_pers(pers); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); bioset_exit(&mddev->bio_set); @@ -6989,7 +7034,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) set_bit(Candidate, &rdev->flags); else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { /* --add initiated by this node */ - err = md_cluster_ops->add_new_disk(mddev, rdev); + err = mddev->cluster_ops->add_new_disk(mddev, rdev); if (err) { export_rdev(rdev, mddev); return err; @@ -7006,14 +7051,14 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) if (mddev_is_clustered(mddev)) { if (info->state & (1 << MD_DISK_CANDIDATE)) { if (!err) { - err = md_cluster_ops->new_disk_ack(mddev, - err == 0); + err = mddev->cluster_ops->new_disk_ack( + mddev, err == 0); if (err) md_kick_rdev_from_array(rdev); } } else { if (err) - md_cluster_ops->add_new_disk_cancel(mddev); + mddev->cluster_ops->add_new_disk_cancel(mddev); else err = add_bound_rdev(rdev); } @@ -7093,10 +7138,9 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) goto busy; kick_rdev: - if (mddev_is_clustered(mddev)) { - if (md_cluster_ops->remove_disk(mddev, rdev)) - goto busy; - } + if (mddev_is_clustered(mddev) && + mddev->cluster_ops->remove_disk(mddev, rdev)) + goto busy; md_kick_rdev_from_array(rdev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); @@ -7241,7 +7285,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd) err = 0; if (mddev->pers) { if (fd >= 0) { - err = mddev->bitmap_ops->create(mddev, -1); + err = mddev->bitmap_ops->create(mddev); if (!err) err = mddev->bitmap_ops->load(mddev); @@ -7399,7 +7443,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) rv = mddev->pers->resize(mddev, num_sectors); if (!rv) { if (mddev_is_clustered(mddev)) - md_cluster_ops->update_size(mddev, old_dev_sectors); + mddev->cluster_ops->update_size(mddev, old_dev_sectors); else if (!mddev_is_dm(mddev)) set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); @@ -7447,6 +7491,28 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) return rv; } +static int get_cluster_ops(struct mddev *mddev) +{ + xa_lock(&md_submodule); + mddev->cluster_ops = xa_load(&md_submodule, ID_CLUSTER); + if (mddev->cluster_ops && + !try_module_get(mddev->cluster_ops->head.owner)) + mddev->cluster_ops = NULL; + xa_unlock(&md_submodule); + + return mddev->cluster_ops == NULL ? -ENOENT : 0; +} + +static void put_cluster_ops(struct mddev *mddev) +{ + if (!mddev->cluster_ops) + return; + + mddev->cluster_ops->leave(mddev); + module_put(mddev->cluster_ops->head.owner); + mddev->cluster_ops = NULL; +} + /* * update_array_info is used to change the configuration of an * on-line array. @@ -7535,7 +7601,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - rv = mddev->bitmap_ops->create(mddev, -1); + rv = mddev->bitmap_ops->create(mddev); if (!rv) rv = mddev->bitmap_ops->load(mddev); @@ -7555,16 +7621,15 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) if (mddev->bitmap_info.nodes) { /* hold PW on all the bitmap lock */ - if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { + if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) { pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); rv = -EPERM; - md_cluster_ops->unlock_all_bitmaps(mddev); + mddev->cluster_ops->unlock_all_bitmaps(mddev); goto err; } mddev->bitmap_info.nodes = 0; - md_cluster_ops->leave(mddev); - module_put(md_cluster_mod); + put_cluster_ops(mddev); mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } mddev->bitmap_ops->destroy(mddev); @@ -7848,7 +7913,7 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, case CLUSTERED_DISK_NACK: if (mddev_is_clustered(mddev)) - md_cluster_ops->new_disk_ack(mddev, false); + mddev->cluster_ops->new_disk_ack(mddev, false); else err = -EINVAL; goto unlock; @@ -8130,7 +8195,8 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev) return; mddev->pers->error_handler(mddev, rdev); - if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR) + if (mddev->pers->head.id == ID_RAID0 || + mddev->pers->head.id == ID_LINEAR) return; if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) @@ -8168,14 +8234,17 @@ static void status_unused(struct seq_file *seq) static void status_personalities(struct seq_file *seq) { - struct md_personality *pers; + struct md_submodule_head *head; + unsigned long i; seq_puts(seq, "Personalities : "); - spin_lock(&pers_lock); - list_for_each_entry(pers, &pers_list, list) - seq_printf(seq, "[%s] ", pers->name); - spin_unlock(&pers_lock); + xa_lock(&md_submodule); + xa_for_each(&md_submodule, i, head) + if (head->type == MD_PERSONALITY) + seq_printf(seq, "[%s] ", head->name); + xa_unlock(&md_submodule); + seq_puts(seq, "\n"); } @@ -8398,7 +8467,7 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, " (read-only)"); if (mddev->ro == MD_AUTO_READ) seq_printf(seq, " (auto-read-only)"); - seq_printf(seq, " %s", mddev->pers->name); + seq_printf(seq, " %s", mddev->pers->head.name); } else { seq_printf(seq, "inactive"); } @@ -8518,67 +8587,34 @@ static const struct proc_ops mdstat_proc_ops = { .proc_poll = mdstat_poll, }; -int register_md_personality(struct md_personality *p) -{ - pr_debug("md: %s personality registered for level %d\n", - p->name, p->level); - spin_lock(&pers_lock); - list_add_tail(&p->list, &pers_list); - spin_unlock(&pers_lock); - return 0; -} -EXPORT_SYMBOL(register_md_personality); - -int unregister_md_personality(struct md_personality *p) +int register_md_submodule(struct md_submodule_head *msh) { - pr_debug("md: %s personality unregistered\n", p->name); - spin_lock(&pers_lock); - list_del_init(&p->list); - spin_unlock(&pers_lock); - return 0; + return xa_insert(&md_submodule, msh->id, msh, GFP_KERNEL); } -EXPORT_SYMBOL(unregister_md_personality); +EXPORT_SYMBOL_GPL(register_md_submodule); -int register_md_cluster_operations(const struct md_cluster_operations *ops, - struct module *module) +void unregister_md_submodule(struct md_submodule_head *msh) { - int ret = 0; - spin_lock(&pers_lock); - if (md_cluster_ops != NULL) - ret = -EALREADY; - else { - md_cluster_ops = ops; - md_cluster_mod = module; - } - spin_unlock(&pers_lock); - return ret; + xa_erase(&md_submodule, msh->id); } -EXPORT_SYMBOL(register_md_cluster_operations); - -int unregister_md_cluster_operations(void) -{ - spin_lock(&pers_lock); - md_cluster_ops = NULL; - spin_unlock(&pers_lock); - return 0; -} -EXPORT_SYMBOL(unregister_md_cluster_operations); +EXPORT_SYMBOL_GPL(unregister_md_submodule); int md_setup_cluster(struct mddev *mddev, int nodes) { - int ret; - if (!md_cluster_ops) + int ret = get_cluster_ops(mddev); + + if (ret) { request_module("md-cluster"); - spin_lock(&pers_lock); + ret = get_cluster_ops(mddev); + } + /* ensure module won't be unloaded */ - if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { + if (ret) { pr_warn("can't find md-cluster module or get its reference.\n"); - spin_unlock(&pers_lock); - return -ENOENT; + return ret; } - spin_unlock(&pers_lock); - ret = md_cluster_ops->join(mddev, nodes); + ret = mddev->cluster_ops->join(mddev, nodes); if (!ret) mddev->safemode_delay = 0; return ret; @@ -8586,56 +8622,58 @@ int md_setup_cluster(struct mddev *mddev, int nodes) void md_cluster_stop(struct mddev *mddev) { - if (!md_cluster_ops) - return; - md_cluster_ops->leave(mddev); - module_put(md_cluster_mod); + put_cluster_ops(mddev); } -static int is_mddev_idle(struct mddev *mddev, int init) +static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init) +{ + unsigned long last_events = rdev->last_events; + + if (!bdev_is_partition(rdev->bdev)) + return true; + + /* + * If rdev is partition, and user doesn't issue IO to the array, the + * array is still not idle if user issues IO to other partitions. + */ + rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0, + sectors) - + part_stat_read_accum(rdev->bdev, sectors); + + return init || rdev->last_events <= last_events; +} + +/* + * mddev is idle if following conditions are matched since last check: + * 1) mddev doesn't have normal IO completed; + * 2) mddev doesn't have inflight normal IO; + * 3) if any member disk is partition, and other partitions don't have IO + * completed; + * + * Noted this checking rely on IO accounting is enabled. + */ +static bool is_mddev_idle(struct mddev *mddev, int init) { + unsigned long last_events = mddev->normal_io_events; + struct gendisk *disk; struct md_rdev *rdev; - int idle; - int curr_events; + bool idle = true; - idle = 1; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) { - struct gendisk *disk = rdev->bdev->bd_disk; + disk = mddev_is_dm(mddev) ? mddev->dm_gendisk : mddev->gendisk; + if (!disk) + return true; - if (!init && !blk_queue_io_stat(disk->queue)) - continue; + mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors); + if (!init && (mddev->normal_io_events > last_events || + bdev_count_inflight(disk->part0))) + idle = false; - curr_events = (int)part_stat_read_accum(disk->part0, sectors) - - atomic_read(&disk->sync_io); - /* sync IO will cause sync_io to increase before the disk_stats - * as sync_io is counted when a request starts, and - * disk_stats is counted when it completes. - * So resync activity will cause curr_events to be smaller than - * when there was no such activity. - * non-sync IO will cause disk_stat to increase without - * increasing sync_io so curr_events will (eventually) - * be larger than it was before. Once it becomes - * substantially larger, the test below will cause - * the array to appear non-idle, and resync will slow - * down. - * If there is a lot of outstanding resync activity when - * we set last_event to curr_events, then all that activity - * completing might cause the array to appear non-idle - * and resync will be slowed down even though there might - * not have been non-resync activity. This will only - * happen once though. 'last_events' will soon reflect - * the state where there is little or no outstanding - * resync requests, and further resync activity will - * always make curr_events less than last_events. - * - */ - if (init || curr_events - rdev->last_events > 64) { - rdev->last_events = curr_events; - idle = 0; - } - } + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (!is_rdev_holder_idle(rdev, init)) + idle = false; rcu_read_unlock(); + return idle; } @@ -8761,14 +8799,14 @@ static void md_bitmap_start(struct mddev *mddev, mddev->pers->bitmap_sector(mddev, &md_io_clone->offset, &md_io_clone->sectors); - mddev->bitmap_ops->startwrite(mddev, md_io_clone->offset, - md_io_clone->sectors); + mddev->bitmap_ops->start_write(mddev, md_io_clone->offset, + md_io_clone->sectors); } static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone) { - mddev->bitmap_ops->endwrite(mddev, md_io_clone->offset, - md_io_clone->sectors); + mddev->bitmap_ops->end_write(mddev, md_io_clone->offset, + md_io_clone->sectors); } static void md_end_clone_io(struct bio *bio) @@ -8947,6 +8985,23 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) } } +static bool sync_io_within_limit(struct mddev *mddev) +{ + int io_sectors; + + /* + * For raid456, sync IO is stripe(4k) per IO, for other levels, it's + * RESYNC_PAGES(64k) per IO. + */ + if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6) + io_sectors = 8; + else + io_sectors = 128; + + return atomic_read(&mddev->recovery_active) < + io_sectors * sync_io_depth(mddev); +} + #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) #define UPDATE_FREQUENCY (5*60*HZ) @@ -8982,7 +9037,7 @@ void md_do_sync(struct md_thread *thread) } if (mddev_is_clustered(mddev)) { - ret = md_cluster_ops->resync_start(mddev); + ret = mddev->cluster_ops->resync_start(mddev); if (ret) goto skip; @@ -9009,7 +9064,7 @@ void md_do_sync(struct md_thread *thread) * */ if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_start_notify(mddev); + mddev->cluster_ops->resync_start_notify(mddev); do { int mddev2_minor = -1; mddev->curr_resync = MD_RESYNC_DELAYED; @@ -9215,7 +9270,8 @@ void md_do_sync(struct md_thread *thread) msleep(500); goto repeat; } - if (!is_mddev_idle(mddev, 0)) { + if (!sync_io_within_limit(mddev) && + !is_mddev_idle(mddev, 0)) { /* * Give other IO more of a chance. * The faster the devices, the less we wait. @@ -9362,6 +9418,12 @@ static bool rdev_is_spare(struct md_rdev *rdev) static bool rdev_addable(struct md_rdev *rdev) { + struct mddev *mddev; + + mddev = READ_ONCE(rdev->mddev); + if (!mddev) + return false; + /* rdev is already used, don't add it again. */ if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 || test_bit(Faulty, &rdev->flags)) @@ -9372,7 +9434,7 @@ static bool rdev_addable(struct md_rdev *rdev) return true; /* Allow to add if array is read-write. */ - if (md_is_rdwr(rdev->mddev)) + if (md_is_rdwr(mddev)) return true; /* @@ -9400,17 +9462,11 @@ static bool md_spares_need_change(struct mddev *mddev) return false; } -static int remove_and_add_spares(struct mddev *mddev, - struct md_rdev *this) +static int remove_spares(struct mddev *mddev, struct md_rdev *this) { struct md_rdev *rdev; - int spares = 0; int removed = 0; - if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - /* Mustn't remove devices when resync thread is running */ - return 0; - rdev_for_each(rdev, mddev) { if ((this == NULL || rdev == this) && rdev_removeable(rdev) && !mddev->pers->hot_remove_disk(mddev, rdev)) { @@ -9424,6 +9480,21 @@ static int remove_and_add_spares(struct mddev *mddev, if (removed && mddev->kobj.sd) sysfs_notify_dirent_safe(mddev->sysfs_degraded); + return removed; +} + +static int remove_and_add_spares(struct mddev *mddev, + struct md_rdev *this) +{ + struct md_rdev *rdev; + int spares = 0; + int removed = 0; + + if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + /* Mustn't remove devices when resync thread is running */ + return 0; + + removed = remove_spares(mddev, this); if (this && removed) goto no_add; @@ -9466,6 +9537,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) /* Check if resync is in progress. */ if (mddev->recovery_cp < MaxSector) { + remove_spares(mddev, NULL); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); return true; @@ -9702,8 +9774,8 @@ void md_check_recovery(struct mddev *mddev) * remove disk. */ rdev_for_each_safe(rdev, tmp, mddev) { - if (test_and_clear_bit(ClusterRemove, &rdev->flags) && - rdev->raid_disk < 0) + if (rdev->raid_disk < 0 && + test_and_clear_bit(ClusterRemove, &rdev->flags)) md_kick_rdev_from_array(rdev); } } @@ -9793,7 +9865,7 @@ void md_reap_sync_thread(struct mddev *mddev) * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by * clustered raid */ if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) - md_cluster_ops->resync_finish(mddev); + mddev->cluster_ops->resync_finish(mddev); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); @@ -9801,13 +9873,13 @@ void md_reap_sync_thread(struct mddev *mddev) clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); /* - * We call md_cluster_ops->update_size here because sync_size could + * We call mddev->cluster_ops->update_size here because sync_size could * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, * so it is time to update size across cluster. */ if (mddev_is_clustered(mddev) && is_reshaped && !test_bit(MD_CLOSING, &mddev->flags)) - md_cluster_ops->update_size(mddev, old_dev_sectors); + mddev->cluster_ops->update_size(mddev, old_dev_sectors); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); sysfs_notify_dirent_safe(mddev->sysfs_completed); @@ -9845,9 +9917,9 @@ EXPORT_SYMBOL(md_finish_reshape); /* Bad block management */ -/* Returns 1 on success, 0 on failure */ -int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int is_new) +/* Returns true on success, false on failure */ +bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) { struct mddev *mddev = rdev->mddev; @@ -9859,7 +9931,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, * avoid it. */ if (test_bit(Faulty, &rdev->flags)) - return 1; + return true; if (is_new) s += rdev->new_data_offset; @@ -9867,7 +9939,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, s += rdev->data_offset; if (!badblocks_set(&rdev->badblocks, s, sectors, 0)) - return 0; + return false; /* Make sure they get written out promptly */ if (test_bit(ExternalBbl, &rdev->flags)) @@ -9876,12 +9948,12 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, set_mask_bits(&mddev->sb_flags, 0, BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); md_wakeup_thread(rdev->mddev->thread); - return 1; + return true; } EXPORT_SYMBOL_GPL(rdev_set_badblocks); -int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int is_new) +void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) { if (is_new) s += rdev->new_data_offset; @@ -9889,11 +9961,10 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, s += rdev->data_offset; if (!badblocks_clear(&rdev->badblocks, s, sectors)) - return 0; + return; if (test_bit(ExternalBbl, &rdev->flags)) sysfs_notify_dirent_safe(rdev->sysfs_badblocks); - return 1; } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); @@ -10010,8 +10081,11 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) /* Check for change of roles in the active devices */ rdev_for_each_safe(rdev2, tmp, mddev) { - if (test_bit(Faulty, &rdev2->flags)) + if (test_bit(Faulty, &rdev2->flags)) { + if (test_bit(ClusterRemove, &rdev2->flags)) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); continue; + } /* Check if the roles changed */ role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); @@ -10034,7 +10108,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && - !md_cluster_ops->resync_status_get(mddev)) { + !mddev->cluster_ops->resync_status_get(mddev)) { /* * -1 to make raid1_add_disk() set conf->fullsync * to 1. This could avoid skipping sync when the |