diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-05 20:51:40 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-05 20:51:40 +0300 |
commit | e0fc99e21e6e299673f1640105ac0c1d829c2d93 (patch) | |
tree | 4637b171164c3e653e9002b2bc962b32f05b5a07 /drivers/md/md.c | |
parent | 4834ce9d8e074bb7ae197632e0708219b9f389b5 (diff) | |
parent | f59589fc89665102923725e80e12f782d5f74f67 (diff) | |
download | linux-e0fc99e21e6e299673f1640105ac0c1d829c2d93.tar.xz |
Merge tag 'for-5.9/drivers-20200803' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:
- NVMe:
- ZNS support (Aravind, Keith, Matias, Niklas)
- Misc cleanups, optimizations, fixes (Baolin, Chaitanya, David,
Dongli, Max, Sagi)
- null_blk zone capacity support (Aravind)
- MD:
- raid5/6 fixes (ChangSyun)
- Warning fixes (Damien)
- raid5 stripe fixes (Guoqing, Song, Yufen)
- sysfs deadlock fix (Junxiao)
- raid10 deadlock fix (Vitaly)
- struct_size conversions (Gustavo)
- Set of bcache updates/fixes (Coly)
* tag 'for-5.9/drivers-20200803' of git://git.kernel.dk/linux-block: (117 commits)
md/raid5: Allow degraded raid6 to do rmw
md/raid5: Fix Force reconstruct-write io stuck in degraded raid5
raid5: don't duplicate code for different paths in handle_stripe
raid5-cache: hold spinlock instead of mutex in r5c_journal_mode_show
md: print errno in super_written
md/raid5: remove the redundant setting of STRIPE_HANDLE
md: register new md sysfs file 'uuid' read-only
md: fix max sectors calculation for super 1.0
nvme-loop: remove extra variable in create ctrl
nvme-loop: set ctrl state connecting after init
nvme-multipath: do not fall back to __nvme_find_path() for non-optimized paths
nvme-multipath: fix logic for non-optimized paths
nvme-rdma: fix controller reset hang during traffic
nvme-tcp: fix controller reset hang during traffic
nvmet: introduce the passthru Kconfig option
nvmet: introduce the passthru configfs interface
nvmet: Add passthru enable/disable helpers
nvmet: add passthru code to process commands
nvme: export nvme_find_get_ns() and nvme_put_ns()
nvme: introduce nvme_ctrl_get_by_path()
...
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r-- | drivers/md/md.c | 181 |
1 files changed, 138 insertions, 43 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index 96b28f6d025c..153bc766bc5a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -101,6 +101,8 @@ static void mddev_detach(struct mddev *mddev); * count by 2 for every hour elapsed between read errors. */ #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 +/* Default safemode delay: 200 msec */ +#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1) /* * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' * is 1000 KB/sec, so the extra system load does not show up that much. @@ -463,24 +465,46 @@ check_suspended: } EXPORT_SYMBOL(md_handle_request); +struct md_io { + struct mddev *mddev; + bio_end_io_t *orig_bi_end_io; + void *orig_bi_private; + unsigned long start_time; +}; + +static void md_end_io(struct bio *bio) +{ + struct md_io *md_io = bio->bi_private; + struct mddev *mddev = md_io->mddev; + + disk_end_io_acct(mddev->gendisk, bio_op(bio), md_io->start_time); + + bio->bi_end_io = md_io->orig_bi_end_io; + bio->bi_private = md_io->orig_bi_private; + + mempool_free(md_io, &mddev->md_io_pool); + + if (bio->bi_end_io) + bio->bi_end_io(bio); +} + static blk_qc_t md_submit_bio(struct bio *bio) { const int rw = bio_data_dir(bio); - const int sgrp = op_stat_group(bio_op(bio)); struct mddev *mddev = bio->bi_disk->private_data; - unsigned int sectors; - if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { + if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); return BLK_QC_T_NONE; } - blk_queue_split(&bio); - - if (mddev == NULL || mddev->pers == NULL) { + if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { bio_io_error(bio); return BLK_QC_T_NONE; } + + blk_queue_split(&bio); + if (mddev->ro == 1 && unlikely(rw == WRITE)) { if (bio_sectors(bio) != 0) bio->bi_status = BLK_STS_IOERR; @@ -488,21 +512,27 @@ static blk_qc_t md_submit_bio(struct bio *bio) return BLK_QC_T_NONE; } - /* - * save the sectors now since our bio can - * go away inside make_request - */ - sectors = bio_sectors(bio); + if (bio->bi_end_io != md_end_io) { + struct md_io *md_io; + + md_io = mempool_alloc(&mddev->md_io_pool, GFP_NOIO); + md_io->mddev = mddev; + md_io->orig_bi_end_io = bio->bi_end_io; + md_io->orig_bi_private = bio->bi_private; + + bio->bi_end_io = md_end_io; + bio->bi_private = md_io; + + md_io->start_time = disk_start_io_acct(mddev->gendisk, + bio_sectors(bio), + bio_op(bio)); + } + /* bio could be mergeable after passing to underlayer */ bio->bi_opf &= ~REQ_NOMERGE; md_handle_request(mddev, bio); - part_stat_lock(); - part_stat_inc(&mddev->gendisk->part0, ios[sgrp]); - part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors); - part_stat_unlock(); - return BLK_QC_T_NONE; } @@ -928,7 +958,8 @@ static void super_written(struct bio *bio) struct mddev *mddev = rdev->mddev; if (bio->bi_status) { - pr_err("md: super_written gets error=%d\n", bio->bi_status); + pr_err("md: %s gets error=%d\n", __func__, + blk_status_to_errno(bio->bi_status)); md_error(mddev, rdev); if (!test_bit(Faulty, &rdev->flags) && (bio->bi_opf & MD_FAILFAST)) { @@ -2143,6 +2174,24 @@ retry: sb->sb_csum = calc_sb_1_csum(sb); } +static sector_t super_1_choose_bm_space(sector_t dev_size) +{ + sector_t bm_space; + + /* if the device is bigger than 8Gig, save 64k for bitmap + * usage, if bigger than 200Gig, save 128k + */ + if (dev_size < 64*2) + bm_space = 0; + else if (dev_size - 64*2 >= 200*1024*1024*2) + bm_space = 128*2; + else if (dev_size - 4*2 > 8*1024*1024*2) + bm_space = 64*2; + else + bm_space = 4*2; + return bm_space; +} + static unsigned long long super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) { @@ -2163,13 +2212,22 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) return 0; } else { /* minor version 0; superblock after data */ - sector_t sb_start; - sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2; + sector_t sb_start, bm_space; + sector_t dev_size = i_size_read(rdev->bdev->bd_inode) >> 9; + + /* 8K is for superblock */ + sb_start = dev_size - 8*2; sb_start &= ~(sector_t)(4*2 - 1); - max_sectors = rdev->sectors + sb_start - rdev->sb_start; + + bm_space = super_1_choose_bm_space(dev_size); + + /* Space that can be used to store date needs to decrease + * superblock bitmap space and bad block space(4K) + */ + max_sectors = sb_start - bm_space - 4*2; + if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; - rdev->sb_start = sb_start; } sb = page_address(rdev->sb_page); sb->data_size = cpu_to_le64(num_sectors); @@ -2421,9 +2479,13 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) goto fail; ko = &part_to_dev(rdev->bdev->bd_part)->kobj; - if (sysfs_create_link(&rdev->kobj, ko, "block")) - /* failure here is OK */; + /* failure here is OK */ + err = sysfs_create_link(&rdev->kobj, ko, "block"); rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); + rdev->sysfs_unack_badblocks = + sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); + rdev->sysfs_badblocks = + sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); list_add_rcu(&rdev->same_set, &mddev->disks); bd_link_disk_holder(rdev->bdev, mddev->gendisk); @@ -2457,7 +2519,11 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); + sysfs_put(rdev->sysfs_unack_badblocks); + sysfs_put(rdev->sysfs_badblocks); rdev->sysfs_state = NULL; + rdev->sysfs_unack_badblocks = NULL; + rdev->sysfs_badblocks = NULL; rdev->badblocks.count = 0; /* We need to delay this, otherwise we can deadlock when * writing to 'remove' to "dev/state". We also need @@ -2802,7 +2868,7 @@ rewrite: goto repeat; wake_up(&mddev->sb_wait); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); rdev_for_each(rdev, mddev) { if (test_and_clear_bit(FaultRecorded, &rdev->flags)) @@ -3182,8 +3248,8 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) return err; } else sysfs_notify_dirent_safe(rdev->sysfs_state); - if (sysfs_link_rdev(rdev->mddev, rdev)) - /* failure here is OK */; + /* failure here is OK */; + sysfs_link_rdev(rdev->mddev, rdev); /* don't wakeup anyone, leave that to userspace. */ } else { if (slot >= rdev->mddev->raid_disks && @@ -4055,7 +4121,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev_resume(mddev); if (!mddev->thread) md_update_sb(mddev, 1); - sysfs_notify(&mddev->kobj, NULL, "level"); + sysfs_notify_dirent_safe(mddev->sysfs_level); md_new_event(mddev); rv = len; out_unlock: @@ -4168,6 +4234,14 @@ static struct md_sysfs_entry md_raid_disks = __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); static ssize_t +uuid_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%pU\n", mddev->uuid); +} +static struct md_sysfs_entry md_uuid = +__ATTR(uuid, S_IRUGO, uuid_show, NULL); + +static ssize_t chunk_size_show(struct mddev *mddev, char *page) { if (mddev->reshape_position != MaxSector && @@ -4808,7 +4882,7 @@ action_store(struct mddev *mddev, const char *page, size_t len) } if (err) return err; - sysfs_notify(&mddev->kobj, NULL, "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); } else { if (cmd_match(page, "check")) set_bit(MD_RECOVERY_CHECK, &mddev->recovery); @@ -5423,6 +5497,7 @@ static struct attribute *md_default_attrs[] = { &md_level.attr, &md_layout.attr, &md_raid_disks.attr, + &md_uuid.attr, &md_chunk_size.attr, &md_size.attr, &md_resync_start.attr, @@ -5514,6 +5589,13 @@ static void md_free(struct kobject *ko) if (mddev->sysfs_state) sysfs_put(mddev->sysfs_state); + if (mddev->sysfs_completed) + sysfs_put(mddev->sysfs_completed); + if (mddev->sysfs_degraded) + sysfs_put(mddev->sysfs_degraded); + if (mddev->sysfs_level) + sysfs_put(mddev->sysfs_level); + if (mddev->gendisk) del_gendisk(mddev->gendisk); @@ -5525,6 +5607,7 @@ static void md_free(struct kobject *ko) bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); + mempool_exit(&mddev->md_io_pool); kfree(mddev); } @@ -5620,6 +5703,11 @@ static int md_alloc(dev_t dev, char *name) */ mddev->hold_active = UNTIL_STOP; + error = mempool_init_kmalloc_pool(&mddev->md_io_pool, BIO_POOL_SIZE, + sizeof(struct md_io)); + if (error) + goto abort; + error = -ENOMEM; mddev->queue = blk_alloc_queue(NUMA_NO_NODE); if (!mddev->queue) @@ -5676,6 +5764,9 @@ static int md_alloc(dev_t dev, char *name) if (!error && mddev->kobj.sd) { kobject_uevent(&mddev->kobj, KOBJ_ADD); mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); + mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); + mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); + mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); } mddev_put(mddev); return error; @@ -5961,7 +6052,7 @@ int md_run(struct mddev *mddev) if (mddev_is_clustered(mddev)) mddev->safemode_delay = 0; else - mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ + mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; mddev->in_sync = 1; smp_wmb(); spin_lock(&mddev->lock); @@ -6028,7 +6119,7 @@ static int do_md_run(struct mddev *mddev) kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); sysfs_notify_dirent_safe(mddev->sysfs_state); sysfs_notify_dirent_safe(mddev->sysfs_action); - sysfs_notify(&mddev->kobj, NULL, "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); out: clear_bit(MD_NOT_READY, &mddev->flags); return err; @@ -7339,6 +7430,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.nodes = 0; md_cluster_ops->leave(mddev); + module_put(md_cluster_mod); + mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } mddev_suspend(mddev); md_bitmap_destroy(mddev); @@ -8330,6 +8423,7 @@ EXPORT_SYMBOL(unregister_md_cluster_operations); int md_setup_cluster(struct mddev *mddev, int nodes) { + int ret; if (!md_cluster_ops) request_module("md-cluster"); spin_lock(&pers_lock); @@ -8341,7 +8435,10 @@ int md_setup_cluster(struct mddev *mddev, int nodes) } spin_unlock(&pers_lock); - return md_cluster_ops->join(mddev, nodes); + ret = md_cluster_ops->join(mddev, nodes); + if (!ret) + mddev->safemode_delay = 0; + return ret; } void md_cluster_stop(struct mddev *mddev) @@ -8742,7 +8839,7 @@ void md_do_sync(struct md_thread *thread) } else mddev->curr_resync = 3; /* no longer delayed */ mddev->curr_resync_completed = j; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); md_new_event(mddev); update_time = jiffies; @@ -8770,7 +8867,7 @@ void md_do_sync(struct md_thread *thread) mddev->recovery_cp = j; update_time = jiffies; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); } while (j >= mddev->resync_max && @@ -8877,7 +8974,7 @@ void md_do_sync(struct md_thread *thread) !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev->curr_resync > 3) { mddev->curr_resync_completed = mddev->curr_resync; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); } mddev->pers->sync_request(mddev, max_sectors, &skipped); @@ -9007,7 +9104,7 @@ static int remove_and_add_spares(struct mddev *mddev, } if (removed && mddev->kobj.sd) - sysfs_notify(&mddev->kobj, NULL, "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); if (this && removed) goto no_add; @@ -9035,8 +9132,8 @@ static int remove_and_add_spares(struct mddev *mddev, rdev->recovery_offset = 0; } if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { - if (sysfs_link_rdev(mddev, rdev)) - /* failure here is OK */; + /* failure here is OK */ + sysfs_link_rdev(mddev, rdev); if (!test_bit(Journal, &rdev->flags)) spares++; md_new_event(mddev); @@ -9290,8 +9387,7 @@ void md_reap_sync_thread(struct mddev *mddev) /* success...*/ /* activate any spares */ if (mddev->pers->spare_active(mddev)) { - sysfs_notify(&mddev->kobj, NULL, - "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); } } @@ -9381,8 +9477,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, if (rv == 0) { /* Make sure they get written out promptly */ if (test_bit(ExternalBbl, &rdev->flags)) - sysfs_notify(&rdev->kobj, NULL, - "unacknowledged_bad_blocks"); + sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); sysfs_notify_dirent_safe(rdev->sysfs_state); set_mask_bits(&mddev->sb_flags, 0, BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); @@ -9403,7 +9498,7 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, s += rdev->data_offset; rv = badblocks_clear(&rdev->badblocks, s, sectors); if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) - sysfs_notify(&rdev->kobj, NULL, "bad_blocks"); + sysfs_notify_dirent_safe(rdev->sysfs_badblocks); return rv; } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); @@ -9633,7 +9728,7 @@ static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) if (rdev->recovery_offset == MaxSector && !test_bit(In_sync, &rdev->flags) && mddev->pers->spare_active(mddev)) - sysfs_notify(&mddev->kobj, NULL, "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); put_page(swapout); return 0; |