diff options
Diffstat (limited to 'drivers/md')
50 files changed, 2701 insertions, 807 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index ddb37f6670de..07c19b2182ca 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -37,6 +37,32 @@ config BLK_DEV_MD If unsure, say N. +config MD_BITMAP + bool "MD RAID bitmap support" + default y + depends on BLK_DEV_MD + help + If you say Y here, support for the write intent bitmap will be + enabled. The bitmap can be used to optimize resync speed after power + failure or readding a disk, limiting it to recorded dirty sectors in + bitmap. + + This feature can be added to existing MD array or MD array can be + created with bitmap via mdadm(8). + + If unsure, say Y. + +config MD_LLBITMAP + bool "MD RAID lockless bitmap support" + depends on BLK_DEV_MD + help + If you say Y here, support for the lockless write intent bitmap will + be enabled. + + Note, this is an experimental feature. + + If unsure, say N. + config MD_AUTODETECT bool "Autodetect RAID arrays during kernel boot" depends on BLK_DEV_MD=y @@ -54,6 +80,7 @@ config MD_AUTODETECT config MD_BITMAP_FILE bool "MD bitmap file support (deprecated)" default y + depends on MD_BITMAP help If you say Y here, support for write intent bitmaps in files on an external file system is enabled. 
This is an alternative to the internal @@ -174,6 +201,7 @@ config MD_RAID456 config MD_CLUSTER tristate "Cluster Support for MD" + select MD_BITMAP depends on BLK_DEV_MD depends on DLM default n @@ -393,6 +421,7 @@ config DM_RAID select MD_RAID1 select MD_RAID10 select MD_RAID456 + select MD_BITMAP select BLK_DEV_MD help A dm target that supports RAID1, RAID10, RAID4, RAID5 and RAID6 mappings diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 87bdfc9fe14c..5a51b3408b70 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -27,7 +27,9 @@ dm-clone-y += dm-clone-target.o dm-clone-metadata.o dm-verity-y += dm-verity-target.o dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o -md-mod-y += md.o md-bitmap.o +md-mod-y += md.o +md-mod-$(CONFIG_MD_BITMAP) += md-bitmap.o +md-mod-$(CONFIG_MD_LLBITMAP) += md-llbitmap.o raid456-y += raid5.o raid5-cache.o raid5-ppl.o linear-y += md-linear.o diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 7510d1c983a5..f327456fc4e0 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -115,8 +115,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) check = bio_kmalloc(nr_segs, GFP_NOIO); if (!check) return; - bio_init(check, bio->bi_bdev, check->bi_inline_vecs, nr_segs, - REQ_OP_READ); + bio_init_inline(check, bio->bi_bdev, nr_segs, REQ_OP_READ); check->bi_iter.bi_sector = bio->bi_iter.bi_sector; check->bi_iter.bi_size = bio->bi_iter.bi_size; diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 020712c5203f..2386d08bf4e4 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -26,8 +26,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c) struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO); struct bio *bio = &b->bio; - bio_init(bio, NULL, bio->bi_inline_vecs, - meta_bucket_pages(&c->cache->sb), 0); + bio_init_inline(bio, NULL, meta_bucket_pages(&c->cache->sb), 0); return bio; } diff --git a/drivers/md/bcache/journal.c 
b/drivers/md/bcache/journal.c index 7ff14bd2feb8..d50eb82ccb4f 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -615,7 +615,7 @@ static void do_journal_discard(struct cache *ca) atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); - bio_init(bio, ca->bdev, bio->bi_inline_vecs, 1, REQ_OP_DISCARD); + bio_init_inline(bio, ca->bdev, 1, REQ_OP_DISCARD); bio->bi_iter.bi_sector = bucket_to_sector(ca->set, ca->sb.d[ja->discard_idx]); bio->bi_iter.bi_size = bucket_bytes(ca); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 26a6a535ec32..73918e55bf04 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -79,7 +79,7 @@ static void moving_init(struct moving_io *io) { struct bio *bio = &io->bio.bio; - bio_init(bio, NULL, bio->bi_inline_vecs, + bio_init_inline(bio, NULL, DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0); bio_get(bio); bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); @@ -145,9 +145,9 @@ static void read_moving(struct cache_set *c) continue; } - io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs, - DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), - GFP_KERNEL); + io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) * + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); if (!io) goto err; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1492c8552255..6d250e366412 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2236,7 +2236,7 @@ static int cache_alloc(struct cache *ca) __module_get(THIS_MODULE); kobject_init(&ca->kobj, &bch_cache_ktype); - bio_init(&ca->journal.bio, NULL, ca->journal.bio.bi_inline_vecs, 8, 0); + bio_init_inline(&ca->journal.bio, NULL, 8, 0); /* * When the cache disk is first registered, ca->sb.njournal_buckets diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 302e75f1fc4b..6ba73dc1a3df 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c 
@@ -331,7 +331,7 @@ static void dirty_init(struct keybuf_key *w) struct dirty_io *io = w->private; struct bio *bio = &io->bio; - bio_init(bio, NULL, bio->bi_inline_vecs, + bio_init_inline(bio, NULL, DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0); if (!io->dc->writeback_percent) bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); @@ -536,9 +536,9 @@ static void read_dirty(struct cached_dev *dc) for (i = 0; i < nk; i++) { w = keys[i]; - io = kzalloc(struct_size(io, bio.bi_inline_vecs, - DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), - GFP_KERNEL); + io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) * + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); if (!io) goto err; diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index ff7595caf440..8f3a23f4b168 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1342,7 +1342,7 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector, use_dmio(b, op, sector, n_sectors, offset, ioprio); return; } - bio_init(bio, b->c->bdev, bio->bi_inline_vecs, 1, op); + bio_init_inline(bio, b->c->bdev, 1, op); bio->bi_iter.bi_sector = sector; bio->bi_end_io = bio_complete; bio->bi_private = b; diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index c711db6f8f5c..08925aca838c 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -215,16 +215,19 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, } if (test_bit(DROP_WRITES, &fc->flags) && - (fc->corrupt_bio_rw == WRITE || fc->random_write_corrupt)) { + ((fc->corrupt_bio_byte && fc->corrupt_bio_rw == WRITE) || + fc->random_write_corrupt)) { ti->error = "drop_writes is incompatible with random_write_corrupt or corrupt_bio_byte with the WRITE flag set"; return -EINVAL; } else if (test_bit(ERROR_WRITES, &fc->flags) && - (fc->corrupt_bio_rw == WRITE || fc->random_write_corrupt)) { + ((fc->corrupt_bio_byte && fc->corrupt_bio_rw == WRITE) || + fc->random_write_corrupt)) { ti->error = 
"error_writes is incompatible with random_write_corrupt or corrupt_bio_byte with the WRITE flag set"; return -EINVAL; } else if (test_bit(ERROR_READS, &fc->flags) && - (fc->corrupt_bio_rw == READ || fc->random_read_corrupt)) { + ((fc->corrupt_bio_byte && fc->corrupt_bio_rw == READ) || + fc->random_read_corrupt)) { ti->error = "error_reads is incompatible with random_read_corrupt or corrupt_bio_byte with the READ flag set"; return -EINVAL; } @@ -438,7 +441,7 @@ static struct bio *clone_bio(struct dm_target *ti, struct flakey_c *fc, struct b if (!clone) return NULL; - bio_init(clone, fc->dev->bdev, clone->bi_inline_vecs, nr_iovecs, bio->bi_opf); + bio_init_inline(clone, fc->dev->bdev, nr_iovecs, bio->bi_opf); clone->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector); clone->bi_private = bio; diff --git a/drivers/md/dm-ima.c b/drivers/md/dm-ima.c index b90f34259fbb..8b50c908c6f4 100644 --- a/drivers/md/dm-ima.c +++ b/drivers/md/dm-ima.c @@ -241,10 +241,11 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl /* * First retrieve the target metadata. */ - scnprintf(target_metadata_buf, DM_IMA_TARGET_METADATA_BUF_LEN, - "target_index=%d,target_begin=%llu,target_len=%llu,", - i, ti->begin, ti->len); - target_metadata_buf_len = strlen(target_metadata_buf); + target_metadata_buf_len = + scnprintf(target_metadata_buf, + DM_IMA_TARGET_METADATA_BUF_LEN, + "target_index=%d,target_begin=%llu,target_len=%llu,", + i, ti->begin, ti->len); /* * Then retrieve the actual target data. 
@@ -448,11 +449,9 @@ void dm_ima_measure_on_device_resume(struct mapped_device *md, bool swap) if (r) goto error; - scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, - "%sname=%s,uuid=%s;device_resume=no_data;", - DM_IMA_VERSION_STR, dev_name, dev_uuid); - l = strlen(device_table_data); - + l = scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, + "%sname=%s,uuid=%s;device_resume=no_data;", + DM_IMA_VERSION_STR, dev_name, dev_uuid); } capacity_len = strlen(capacity_str); @@ -561,10 +560,9 @@ void dm_ima_measure_on_device_remove(struct mapped_device *md, bool remove_all) if (dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio)) goto error; - scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, - "%sname=%s,uuid=%s;device_remove=no_data;", - DM_IMA_VERSION_STR, dev_name, dev_uuid); - l = strlen(device_table_data); + l = scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, + "%sname=%s,uuid=%s;device_remove=no_data;", + DM_IMA_VERSION_STR, dev_name, dev_uuid); } memcpy(device_table_data + l, remove_all_str, remove_all_len); @@ -647,10 +645,9 @@ void dm_ima_measure_on_table_clear(struct mapped_device *md, bool new_map) if (dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio)) goto error2; - scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, - "%sname=%s,uuid=%s;table_clear=no_data;", - DM_IMA_VERSION_STR, dev_name, dev_uuid); - l = strlen(device_table_data); + l = scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN, + "%sname=%s,uuid=%s;table_clear=no_data;", + DM_IMA_VERSION_STR, dev_name, dev_uuid); } capacity_len = strlen(capacity_str); @@ -706,7 +703,7 @@ void dm_ima_measure_on_device_rename(struct mapped_device *md) char *old_device_data = NULL, *new_device_data = NULL, *combined_device_data = NULL; char *new_dev_name = NULL, *new_dev_uuid = NULL, *capacity_str = NULL; bool noio = true; - int r; + int r, len; if (dm_ima_alloc_and_copy_device_data(md, &new_device_data, md->ima.active_table.num_targets, noio)) @@ -728,12 +725,11 @@ void 
dm_ima_measure_on_device_rename(struct mapped_device *md) md->ima.active_table.device_metadata = new_device_data; md->ima.active_table.device_metadata_len = strlen(new_device_data); - scnprintf(combined_device_data, DM_IMA_DEVICE_BUF_LEN * 2, - "%s%snew_name=%s,new_uuid=%s;%s", DM_IMA_VERSION_STR, old_device_data, - new_dev_name, new_dev_uuid, capacity_str); + len = scnprintf(combined_device_data, DM_IMA_DEVICE_BUF_LEN * 2, + "%s%snew_name=%s,new_uuid=%s;%s", DM_IMA_VERSION_STR, old_device_data, + new_dev_name, new_dev_uuid, capacity_str); - dm_ima_measure_data("dm_device_rename", combined_device_data, strlen(combined_device_data), - noio); + dm_ima_measure_data("dm_device_rename", combined_device_data, len, noio); goto exit; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index efeee0a873c0..ab96b692e5a3 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -133,7 +133,7 @@ struct journal_sector { commit_id_t commit_id; }; -#define MAX_TAG_SIZE (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK])) +#define MAX_TAG_SIZE 255 #define METADATA_PADDING_SECTORS 8 diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 15538ec58f8e..73bf290af181 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -170,7 +170,7 @@ static struct dax_device *linear_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff) static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, - pfn_t *pfn) + unsigned long *pfn) { struct dax_device *dax_dev = linear_dax_pgoff(ti, &pgoff); diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index d484e8e1d48a..679b07dee229 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -893,7 +893,7 @@ static struct dax_device *log_writes_dax_pgoff(struct dm_target *ti, static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t 
pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, - pfn_t *pfn) + unsigned long *pfn) { struct dax_device *dax_dev = log_writes_dax_pgoff(ti, &pgoff); diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c index 3e4cb81ce512..d0b883fabfeb 100644 --- a/drivers/md/dm-path-selector.c +++ b/drivers/md/dm-path-selector.c @@ -117,16 +117,16 @@ int dm_register_path_selector(struct path_selector_type *pst) } EXPORT_SYMBOL_GPL(dm_register_path_selector); -int dm_unregister_path_selector(struct path_selector_type *pst) +void dm_unregister_path_selector(struct path_selector_type *pst) { struct ps_internal *psi; down_write(&_ps_lock); psi = __find_path_selector_type(pst->name); - if (!psi) { + if (WARN_ON(!psi)) { up_write(&_ps_lock); - return -EINVAL; + return; } list_del(&psi->list); @@ -134,7 +134,5 @@ int dm_unregister_path_selector(struct path_selector_type *pst) up_write(&_ps_lock); kfree(psi); - - return 0; } EXPORT_SYMBOL_GPL(dm_unregister_path_selector); diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index 3861b2d8b963..7b2270532e64 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h @@ -96,7 +96,7 @@ struct path_selector_type { int dm_register_path_selector(struct path_selector_type *type); /* Unregister a path selector */ -int dm_unregister_path_selector(struct path_selector_type *type); +void dm_unregister_path_selector(struct path_selector_type *type); /* Returns a registered path selector type */ struct path_selector_type *dm_get_path_selector(const char *name); diff --git a/drivers/md/dm-ps-historical-service-time.c b/drivers/md/dm-ps-historical-service-time.c index b49e10d76d03..f07e773d9cc0 100644 --- a/drivers/md/dm-ps-historical-service-time.c +++ b/drivers/md/dm-ps-historical-service-time.c @@ -541,8 +541,10 @@ static int __init dm_hst_init(void) { int r = dm_register_path_selector(&hst_ps); - if (r < 0) + if (r < 0) { DMERR("register failed %d", r); + return r; + } 
DMINFO("version " HST_VERSION " loaded"); @@ -551,10 +553,7 @@ static int __init dm_hst_init(void) static void __exit dm_hst_exit(void) { - int r = dm_unregister_path_selector(&hst_ps); - - if (r < 0) - DMERR("unregister failed %d", r); + dm_unregister_path_selector(&hst_ps); } module_init(dm_hst_init); diff --git a/drivers/md/dm-ps-io-affinity.c b/drivers/md/dm-ps-io-affinity.c index 716807e511ee..80415a045c68 100644 --- a/drivers/md/dm-ps-io-affinity.c +++ b/drivers/md/dm-ps-io-affinity.c @@ -260,10 +260,7 @@ static int __init dm_ioa_init(void) static void __exit dm_ioa_exit(void) { - int ret = dm_unregister_path_selector(&ioa_ps); - - if (ret < 0) - DMERR("unregister failed %d", ret); + dm_unregister_path_selector(&ioa_ps); } module_init(dm_ioa_init); diff --git a/drivers/md/dm-ps-queue-length.c b/drivers/md/dm-ps-queue-length.c index e305f05ad1e5..9c68701ed7a4 100644 --- a/drivers/md/dm-ps-queue-length.c +++ b/drivers/md/dm-ps-queue-length.c @@ -260,8 +260,10 @@ static int __init dm_ql_init(void) { int r = dm_register_path_selector(&ql_ps); - if (r < 0) + if (r < 0) { DMERR("register failed %d", r); + return r; + } DMINFO("version " QL_VERSION " loaded"); @@ -270,10 +272,7 @@ static int __init dm_ql_init(void) static void __exit dm_ql_exit(void) { - int r = dm_unregister_path_selector(&ql_ps); - - if (r < 0) - DMERR("unregister failed %d", r); + dm_unregister_path_selector(&ql_ps); } module_init(dm_ql_init); diff --git a/drivers/md/dm-ps-round-robin.c b/drivers/md/dm-ps-round-robin.c index d1745b123dc1..0c12f4073461 100644 --- a/drivers/md/dm-ps-round-robin.c +++ b/drivers/md/dm-ps-round-robin.c @@ -220,8 +220,10 @@ static int __init dm_rr_init(void) { int r = dm_register_path_selector(&rr_ps); - if (r < 0) + if (r < 0) { DMERR("register failed %d", r); + return r; + } DMINFO("version " RR_VERSION " loaded"); @@ -230,10 +232,7 @@ static int __init dm_rr_init(void) static void __exit dm_rr_exit(void) { - int r = dm_unregister_path_selector(&rr_ps); - - if (r < 
0) - DMERR("unregister failed %d", r); + dm_unregister_path_selector(&rr_ps); } module_init(dm_rr_init); diff --git a/drivers/md/dm-ps-service-time.c b/drivers/md/dm-ps-service-time.c index 969d31c40272..0543fe7969c4 100644 --- a/drivers/md/dm-ps-service-time.c +++ b/drivers/md/dm-ps-service-time.c @@ -341,8 +341,10 @@ static int __init dm_st_init(void) { int r = dm_register_path_selector(&st_ps); - if (r < 0) + if (r < 0) { DMERR("register failed %d", r); + return r; + } DMINFO("version " ST_VERSION " loaded"); @@ -351,10 +353,7 @@ static int __init dm_st_init(void) static void __exit dm_st_exit(void) { - int r = dm_unregister_path_selector(&st_ps); - - if (r < 0) - DMERR("unregister failed %d", r); + dm_unregister_path_selector(&st_ps); } module_init(dm_st_init); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index e8c0a8c6fb51..0a1788fed68c 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -14,7 +14,6 @@ #include "raid5.h" #include "raid10.h" #include "md-bitmap.h" -#include "dm-core.h" #include <linux/device-mapper.h> @@ -439,7 +438,7 @@ static bool rs_is_reshapable(struct raid_set *rs) /* Return true, if raid set in @rs is recovering */ static bool rs_is_recovering(struct raid_set *rs) { - return rs->md.recovery_cp < rs->md.dev_sectors; + return rs->md.resync_offset < rs->md.dev_sectors; } /* Return true, if raid set in @rs is reshaping */ @@ -769,7 +768,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r rs->md.layout = raid_type->algorithm; rs->md.new_layout = rs->md.layout; rs->md.delta_disks = 0; - rs->md.recovery_cp = MaxSector; + rs->md.resync_offset = MaxSector; for (i = 0; i < raid_devs; i++) md_rdev_init(&rs->dev[i].rdev); @@ -913,7 +912,7 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as) rs->md.external = 0; rs->md.persistent = 1; rs->md.major_version = 2; - } else if (rebuild && !rs->md.recovery_cp) { + } else if (rebuild && !rs->md.resync_offset) { /* * Without 
metadata, we will not be able to tell if the array * is in-sync or not - we must assume it is not. Therefore, @@ -1696,20 +1695,20 @@ static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) { /* raid0 does not recover */ if (rs_is_raid0(rs)) - rs->md.recovery_cp = MaxSector; + rs->md.resync_offset = MaxSector; /* * A raid6 set has to be recovered either * completely or for the grown part to * ensure proper parity and Q-Syndrome */ else if (rs_is_raid6(rs)) - rs->md.recovery_cp = dev_sectors; + rs->md.resync_offset = dev_sectors; /* * Other raid set types may skip recovery * depending on the 'nosync' flag. */ else - rs->md.recovery_cp = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags) + rs->md.resync_offset = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags) ? MaxSector : dev_sectors; } @@ -2144,7 +2143,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev) sb->events = cpu_to_le64(mddev->events); sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset); - sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp); + sb->array_resync_offset = cpu_to_le64(mddev->resync_offset); sb->level = cpu_to_le32(mddev->level); sb->layout = cpu_to_le32(mddev->layout); @@ -2335,18 +2334,18 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) } if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) - mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset); + mddev->resync_offset = le64_to_cpu(sb->array_resync_offset); /* * During load, we set FirstUse if a new superblock was written. * There are two reasons we might not have a superblock: * 1) The raid set is brand new - in which case, all of the * devices must have their In_sync bit set. Also, - * recovery_cp must be 0, unless forced. + * resync_offset must be 0, unless forced. * 2) This is a new device being added to an old raid set * and the new device needs to be rebuilt - in which * case the In_sync bit will /not/ be set and - * recovery_cp must be MaxSector. 
+ * resync_offset must be MaxSector. * 3) This is/are a new device(s) being added to an old * raid set during takeover to a higher raid level * to provide capacity for redundancy or during reshape @@ -2391,8 +2390,8 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) new_devs > 1 ? "s" : ""); return -EINVAL; } else if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) && rs_is_recovering(rs)) { - DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)", - (unsigned long long) mddev->recovery_cp); + DMERR("'rebuild' specified while raid set is not in-sync (resync_offset=%llu)", + (unsigned long long) mddev->resync_offset); return -EINVAL; } else if (rs_is_reshaping(rs)) { DMERR("'rebuild' specified while raid set is being reshaped (reshape_position=%llu)", @@ -2532,6 +2531,10 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) struct md_rdev *rdev, *freshest; struct mddev *mddev = &rs->md; + /* Respect resynchronization requested with "sync" argument. */ + if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) + set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); + freshest = NULL; rdev_for_each(rdev, mddev) { if (test_bit(Journal, &rdev->flags)) @@ -2697,11 +2700,11 @@ static int rs_adjust_data_offsets(struct raid_set *rs) } out: /* - * Raise recovery_cp in case data_offset != 0 to + * Raise resync_offset in case data_offset != 0 to * avoid false recovery positives in the constructor. 
*/ - if (rs->md.recovery_cp < rs->md.dev_sectors) - rs->md.recovery_cp += rs->dev[0].rdev.data_offset; + if (rs->md.resync_offset < rs->md.dev_sectors) + rs->md.resync_offset += rs->dev[0].rdev.data_offset; /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */ rdev_for_each(rdev, &rs->md) { @@ -2756,7 +2759,7 @@ static int rs_setup_takeover(struct raid_set *rs) } clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags); - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; while (d--) { rdev = &rs->dev[d].rdev; @@ -2764,7 +2767,7 @@ static int rs_setup_takeover(struct raid_set *rs) if (test_bit(d, (void *) rs->rebuild_disks)) { clear_bit(In_sync, &rdev->flags); clear_bit(Faulty, &rdev->flags); - mddev->recovery_cp = rdev->recovery_offset = 0; + mddev->resync_offset = rdev->recovery_offset = 0; /* Bitmap has to be created when we do an "up" takeover */ set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); } @@ -3222,7 +3225,7 @@ size_check: if (r) goto bad; - rs_setup_recovery(rs, rs->md.recovery_cp < rs->md.dev_sectors ? rs->md.recovery_cp : rs->md.dev_sectors); + rs_setup_recovery(rs, rs->md.resync_offset < rs->md.dev_sectors ? rs->md.resync_offset : rs->md.dev_sectors); } else { /* This is no size change or it is shrinking, update size and record in superblocks */ r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false); @@ -3305,7 +3308,7 @@ size_check: /* Disable/enable discard support on raid set. 
*/ configure_discard_support(rs); - rs->md.dm_gendisk = ti->table->md->disk; + rs->md.dm_gendisk = dm_disk(dm_table_get_md(ti->table)); mddev_unlock(&rs->md); return 0; @@ -3446,7 +3449,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, } else { if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery)) - r = mddev->recovery_cp; + r = mddev->resync_offset; else r = mddev->curr_resync_completed; @@ -3810,8 +3813,10 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) struct raid_set *rs = ti->private; unsigned int chunk_size_bytes = to_bytes(rs->md.chunk_sectors); - limits->io_min = chunk_size_bytes; - limits->io_opt = chunk_size_bytes * mddev_data_stripes(rs); + if (chunk_size_bytes) { + limits->io_min = chunk_size_bytes; + limits->io_opt = chunk_size_bytes * mddev_data_stripes(rs); + } } static void raid_presuspend(struct dm_target *ti) @@ -3950,9 +3955,11 @@ static int __load_dirty_region_bitmap(struct raid_set *rs) !test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) { struct mddev *mddev = &rs->md; - r = mddev->bitmap_ops->load(mddev); - if (r) - DMERR("Failed to load bitmap"); + if (md_bitmap_enabled(mddev, false)) { + r = mddev->bitmap_ops->load(mddev); + if (r) + DMERR("Failed to load bitmap"); + } } return r; @@ -4067,16 +4074,18 @@ static int raid_preresume(struct dm_target *ti) mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) { int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize; - r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors, - chunksize, false); - if (r) - DMERR("Failed to resize bitmap"); + if (md_bitmap_enabled(mddev, false)) { + r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors, + chunksize); + if (r) + DMERR("Failed to resize bitmap"); + } } /* Check for any resize/reshape on @rs and adjust/initiate */ - if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) { + if 
(mddev->resync_offset && mddev->resync_offset < MaxSector) { set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - mddev->resync_min = mddev->recovery_cp; + mddev->resync_min = mddev->resync_offset; if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) mddev->resync_max_sectors = mddev->dev_sectors; } diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 5bbbdf8fc1bd..1461dc740dae 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -316,7 +316,7 @@ static struct dax_device *stripe_dax_pgoff(struct dm_target *ti, pgoff_t *pgoff) static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, - pfn_t *pfn) + unsigned long *pfn) { struct dax_device *dax_dev = stripe_dax_pgoff(ti, &pgoff); @@ -456,11 +456,15 @@ static void stripe_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct stripe_c *sc = ti->private; - unsigned int chunk_size = sc->chunk_size << SECTOR_SHIFT; + unsigned int io_min, io_opt; limits->chunk_sectors = sc->chunk_size; - limits->io_min = chunk_size; - limits->io_opt = chunk_size * sc->stripes; + + if (!check_shl_overflow(sc->chunk_size, SECTOR_SHIFT, &io_min) && + !check_mul_overflow(io_min, sc->stripes, &io_opt)) { + limits->io_min = io_min; + limits->io_opt = io_opt; + } } static struct target_type stripe_target = { diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index d9d5e6aa5707..ad0a60a07b93 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -899,17 +899,17 @@ static bool dm_table_supports_dax(struct dm_table *t, return true; } -static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_is_not_rq_stackable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { struct block_device *bdev = dev->bdev; struct request_queue *q = bdev_get_queue(bdev); /* request-based cannot stack on partitions! 
*/ if (bdev_is_partition(bdev)) - return false; + return true; - return queue_is_mq(q); + return !queue_is_mq(q); } static int dm_table_determine_type(struct dm_table *t) @@ -1005,7 +1005,7 @@ verify_rq_based: /* Non-request-stackable devices can't be used for request-based dm */ if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, device_is_rq_stackable, NULL)) { + ti->type->iterate_devices(ti, device_is_not_rq_stackable, NULL)) { DMERR("table load rejected: including non-request-stackable devices"); return -EINVAL; } diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 652627aea11b..2af5a9514c05 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -255,7 +255,7 @@ static void io_err_io_hints(struct dm_target *ti, struct queue_limits *limits) static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, - pfn_t *pfn) + unsigned long *pfn) { return -EIO; } diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 05cf4e3f2bbe..007bb93e5fca 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -4111,8 +4111,8 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | - DM_TARGET_IMMUTABLE, - .version = {1, 23, 0}, + DM_TARGET_IMMUTABLE | DM_TARGET_PASSES_CRYPTO, + .version = {1, 24, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -4497,7 +4497,8 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type thin_target = { .name = "thin", - .version = {1, 23, 0}, + .features = DM_TARGET_PASSES_CRYPTO, + .version = {1, 24, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, diff --git a/drivers/md/dm-vdo/funnel-workqueue.c b/drivers/md/dm-vdo/funnel-workqueue.c index ae11941c90a9..0613c82bbe8e 100644 --- 
a/drivers/md/dm-vdo/funnel-workqueue.c +++ b/drivers/md/dm-vdo/funnel-workqueue.c @@ -252,8 +252,7 @@ static void service_work_queue(struct simple_work_queue *queue) * This speeds up some performance tests; that "other work" might include other VDO * threads. */ - if (need_resched()) - cond_resched(); + cond_resched(); } run_finish_hook(queue); diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index e7f4153e55e3..8fc22fb14196 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -212,7 +212,7 @@ int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t return VDO_SUCCESS; bio->bi_ioprio = 0; - bio->bi_io_vec = bio->bi_inline_vecs; + bio->bi_io_vec = bio_inline_vecs(bio); bio->bi_max_vecs = vio->block_count + 1; if (VDO_ASSERT(size <= vio_size, "specified size %d is not greater than allocated %d", size, vio_size) != VDO_SUCCESS) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 631a887b487c..d382a390d39a 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -191,7 +191,7 @@ static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, u8 *want_digest, u8 *data) { if (unlikely(verity_hash(v, io, data, 1 << v->data_dev_block_bits, - verity_io_real_digest(v, io), true))) + verity_io_real_digest(v, io)))) return 0; return memcmp(verity_io_real_digest(v, io), want_digest, @@ -392,7 +392,7 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, /* Always re-validate the corrected block against the expected hash */ r = verity_hash(v, io, fio->output, 1 << v->data_dev_block_bits, - verity_io_real_digest(v, io), true); + verity_io_real_digest(v, io)); if (unlikely(r < 0)) return r; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 81186bded1ce..66a00a8ccb39 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -19,7 +19,6 @@ #include "dm-audit.h" #include <linux/module.h> #include 
<linux/reboot.h> -#include <linux/scatterlist.h> #include <linux/string.h> #include <linux/jump_label.h> #include <linux/security.h> @@ -61,9 +60,6 @@ module_param_array_named(use_bh_bytes, dm_verity_use_bh_bytes, uint, NULL, 0644) static DEFINE_STATIC_KEY_FALSE(use_bh_wq_enabled); -/* Is at least one dm-verity instance using ahash_tfm instead of shash_tfm? */ -static DEFINE_STATIC_KEY_FALSE(ahash_enabled); - struct dm_verity_prefetch_work { struct work_struct work; struct dm_verity *v; @@ -118,100 +114,21 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, return block >> (level * v->hash_per_block_bits); } -static int verity_ahash_update(struct dm_verity *v, struct ahash_request *req, - const u8 *data, size_t len, - struct crypto_wait *wait) -{ - struct scatterlist sg; - - if (likely(!is_vmalloc_addr(data))) { - sg_init_one(&sg, data, len); - ahash_request_set_crypt(req, &sg, NULL, len); - return crypto_wait_req(crypto_ahash_update(req), wait); - } - - do { - int r; - size_t this_step = min_t(size_t, len, PAGE_SIZE - offset_in_page(data)); - - flush_kernel_vmap_range((void *)data, this_step); - sg_init_table(&sg, 1); - sg_set_page(&sg, vmalloc_to_page(data), this_step, offset_in_page(data)); - ahash_request_set_crypt(req, &sg, NULL, this_step); - r = crypto_wait_req(crypto_ahash_update(req), wait); - if (unlikely(r)) - return r; - data += this_step; - len -= this_step; - } while (len); - - return 0; -} - -/* - * Wrapper for crypto_ahash_init, which handles verity salting. - */ -static int verity_ahash_init(struct dm_verity *v, struct ahash_request *req, - struct crypto_wait *wait, bool may_sleep) -{ - int r; - - ahash_request_set_tfm(req, v->ahash_tfm); - ahash_request_set_callback(req, - may_sleep ? 
CRYPTO_TFM_REQ_MAY_SLEEP | CRYPTO_TFM_REQ_MAY_BACKLOG : 0, - crypto_req_done, (void *)wait); - crypto_init_wait(wait); - - r = crypto_wait_req(crypto_ahash_init(req), wait); - - if (unlikely(r < 0)) { - if (r != -ENOMEM) - DMERR("crypto_ahash_init failed: %d", r); - return r; - } - - if (likely(v->salt_size && (v->version >= 1))) - r = verity_ahash_update(v, req, v->salt, v->salt_size, wait); - - return r; -} - -static int verity_ahash_final(struct dm_verity *v, struct ahash_request *req, - u8 *digest, struct crypto_wait *wait) -{ - int r; - - if (unlikely(v->salt_size && (!v->version))) { - r = verity_ahash_update(v, req, v->salt, v->salt_size, wait); - - if (r < 0) { - DMERR("%s failed updating salt: %d", __func__, r); - goto out; - } - } - - ahash_request_set_crypt(req, NULL, digest, 0); - r = crypto_wait_req(crypto_ahash_final(req), wait); -out: - return r; -} - int verity_hash(struct dm_verity *v, struct dm_verity_io *io, - const u8 *data, size_t len, u8 *digest, bool may_sleep) + const u8 *data, size_t len, u8 *digest) { + struct shash_desc *desc = &io->hash_desc; int r; - if (static_branch_unlikely(&ahash_enabled) && !v->shash_tfm) { - struct ahash_request *req = verity_io_hash_req(v, io); - struct crypto_wait wait; - - r = verity_ahash_init(v, req, &wait, may_sleep) ?: - verity_ahash_update(v, req, data, len, &wait) ?: - verity_ahash_final(v, req, digest, &wait); + desc->tfm = v->shash_tfm; + if (unlikely(v->initial_hashstate == NULL)) { + /* Version 0: salt at end */ + r = crypto_shash_init(desc) ?: + crypto_shash_update(desc, data, len) ?: + crypto_shash_update(desc, v->salt, v->salt_size) ?: + crypto_shash_final(desc, digest); } else { - struct shash_desc *desc = verity_io_hash_req(v, io); - - desc->tfm = v->shash_tfm; + /* Version 1: salt at beginning */ r = crypto_shash_import(desc, v->initial_hashstate) ?: crypto_shash_finup(desc, data, len, digest); } @@ -362,7 +279,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, } r 
= verity_hash(v, io, data, 1 << v->hash_dev_block_bits, - verity_io_real_digest(v, io), !io->in_bh); + verity_io_real_digest(v, io)); if (unlikely(r < 0)) goto release_ret_r; @@ -465,7 +382,7 @@ static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, goto free_ret; r = verity_hash(v, io, buffer, 1 << v->data_dev_block_bits, - verity_io_real_digest(v, io), true); + verity_io_real_digest(v, io)); if (unlikely(r)) goto free_ret; @@ -581,7 +498,7 @@ static int verity_verify_io(struct dm_verity_io *io) } r = verity_hash(v, io, data, block_size, - verity_io_real_digest(v, io), !io->in_bh); + verity_io_real_digest(v, io)); if (unlikely(r < 0)) { kunmap_local(data); return r; @@ -1092,12 +1009,7 @@ static void verity_dtr(struct dm_target *ti) kfree(v->zero_digest); verity_free_sig(v); - if (v->ahash_tfm) { - static_branch_dec(&ahash_enabled); - crypto_free_ahash(v->ahash_tfm); - } else { - crypto_free_shash(v->shash_tfm); - } + crypto_free_shash(v->shash_tfm); kfree(v->alg_name); @@ -1157,7 +1069,8 @@ static int verity_alloc_zero_digest(struct dm_verity *v) if (!v->zero_digest) return r; - io = kmalloc(sizeof(*io) + v->hash_reqsize, GFP_KERNEL); + io = kmalloc(sizeof(*io) + crypto_shash_descsize(v->shash_tfm), + GFP_KERNEL); if (!io) return r; /* verity_dtr will free zero_digest */ @@ -1168,7 +1081,7 @@ static int verity_alloc_zero_digest(struct dm_verity *v) goto out; r = verity_hash(v, io, zero_data, 1 << v->data_dev_block_bits, - v->zero_digest, true); + v->zero_digest); out: kfree(io); @@ -1324,9 +1237,7 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, static int verity_setup_hash_alg(struct dm_verity *v, const char *alg_name) { struct dm_target *ti = v->ti; - struct crypto_ahash *ahash; - struct crypto_shash *shash = NULL; - const char *driver_name; + struct crypto_shash *shash; v->alg_name = kstrdup(alg_name, GFP_KERNEL); if (!v->alg_name) { @@ -1334,50 +1245,14 @@ static int verity_setup_hash_alg(struct 
dm_verity *v, const char *alg_name) return -ENOMEM; } - /* - * Allocate the hash transformation object that this dm-verity instance - * will use. The vast majority of dm-verity users use CPU-based - * hashing, so when possible use the shash API to minimize the crypto - * API overhead. If the ahash API resolves to a different driver - * (likely an off-CPU hardware offload), use ahash instead. Also use - * ahash if the obsolete dm-verity format with the appended salt is - * being used, so that quirk only needs to be handled in one place. - */ - ahash = crypto_alloc_ahash(alg_name, 0, - v->use_bh_wq ? CRYPTO_ALG_ASYNC : 0); - if (IS_ERR(ahash)) { + shash = crypto_alloc_shash(alg_name, 0, 0); + if (IS_ERR(shash)) { ti->error = "Cannot initialize hash function"; - return PTR_ERR(ahash); - } - driver_name = crypto_ahash_driver_name(ahash); - if (v->version >= 1 /* salt prepended, not appended? */) { - shash = crypto_alloc_shash(alg_name, 0, 0); - if (!IS_ERR(shash) && - strcmp(crypto_shash_driver_name(shash), driver_name) != 0) { - /* - * ahash gave a different driver than shash, so probably - * this is a case of real hardware offload. Use ahash. 
- */ - crypto_free_shash(shash); - shash = NULL; - } - } - if (!IS_ERR_OR_NULL(shash)) { - crypto_free_ahash(ahash); - ahash = NULL; - v->shash_tfm = shash; - v->digest_size = crypto_shash_digestsize(shash); - v->hash_reqsize = sizeof(struct shash_desc) + - crypto_shash_descsize(shash); - DMINFO("%s using shash \"%s\"", alg_name, driver_name); - } else { - v->ahash_tfm = ahash; - static_branch_inc(&ahash_enabled); - v->digest_size = crypto_ahash_digestsize(ahash); - v->hash_reqsize = sizeof(struct ahash_request) + - crypto_ahash_reqsize(ahash); - DMINFO("%s using ahash \"%s\"", alg_name, driver_name); + return PTR_ERR(shash); } + v->shash_tfm = shash; + v->digest_size = crypto_shash_digestsize(shash); + DMINFO("%s using \"%s\"", alg_name, crypto_shash_driver_name(shash)); if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { ti->error = "Digest size too big"; return -EINVAL; @@ -1402,7 +1277,7 @@ static int verity_setup_salt_and_hashstate(struct dm_verity *v, const char *arg) return -EINVAL; } } - if (v->shash_tfm) { + if (v->version) { /* Version 1: salt at beginning */ SHASH_DESC_ON_STACK(desc, v->shash_tfm); int r; @@ -1681,7 +1556,8 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - ti->per_io_data_size = sizeof(struct dm_verity_io) + v->hash_reqsize; + ti->per_io_data_size = sizeof(struct dm_verity_io) + + crypto_shash_descsize(v->shash_tfm); r = verity_fec_ctr(v); if (r) @@ -1788,10 +1664,7 @@ static int verity_preresume(struct dm_target *ti) bdev = dm_disk(dm_table_get_md(ti->table))->part0; root_digest.digest = v->root_digest; root_digest.digest_len = v->digest_size; - if (static_branch_unlikely(&ahash_enabled) && !v->shash_tfm) - root_digest.alg = crypto_ahash_alg_name(v->ahash_tfm); - else - root_digest.alg = crypto_shash_alg_name(v->shash_tfm); + root_digest.alg = crypto_shash_alg_name(v->shash_tfm); r = security_bdev_setintegrity(bdev, LSM_INT_DMVERITY_ROOTHASH, &root_digest, sizeof(root_digest)); @@ -1817,7 
+1690,7 @@ static struct target_type verity_target = { .name = "verity", /* Note: the LSMs depend on the singleton and immutable features */ .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE, - .version = {1, 11, 0}, + .version = {1, 12, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 8cbb57862ae1..6d141abd965c 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -39,11 +39,10 @@ struct dm_verity { struct dm_target *ti; struct dm_bufio_client *bufio; char *alg_name; - struct crypto_ahash *ahash_tfm; /* either this or shash_tfm is set */ - struct crypto_shash *shash_tfm; /* either this or ahash_tfm is set */ + struct crypto_shash *shash_tfm; u8 *root_digest; /* digest of the root block */ u8 *salt; /* salt: its size is salt_size */ - u8 *initial_hashstate; /* salted initial state, if shash_tfm is set */ + u8 *initial_hashstate; /* salted initial state, if version >= 1 */ u8 *zero_digest; /* digest for a zero block */ #ifdef CONFIG_SECURITY u8 *root_digest_sig; /* signature of the root digest */ @@ -61,7 +60,6 @@ struct dm_verity { bool hash_failed:1; /* set if hash of any block failed */ bool use_bh_wq:1; /* try to verify in BH wq before normal work-queue */ unsigned int digest_size; /* digest size for the current hash algorithm */ - unsigned int hash_reqsize; /* the size of temporary space for crypto */ enum verity_mode mode; /* mode for handling verification errors */ enum verity_mode error_mode;/* mode for handling I/O errors */ unsigned int corrupted_errs;/* Number of errors for corrupted blocks */ @@ -100,19 +98,13 @@ struct dm_verity_io { u8 want_digest[HASH_MAX_DIGESTSIZE]; /* - * This struct is followed by a variable-sized hash request of size - * v->hash_reqsize, either a struct ahash_request or a struct shash_desc - * (depending on whether ahash_tfm or shash_tfm is being used). To - * access it, use verity_io_hash_req(). + * Temporary space for hashing. 
This is variable-length and must be at + * the end of the struct. struct shash_desc is just the fixed part; + * it's followed by a context of size crypto_shash_descsize(shash_tfm). */ + struct shash_desc hash_desc; }; -static inline void *verity_io_hash_req(struct dm_verity *v, - struct dm_verity_io *io) -{ - return io + 1; -} - static inline u8 *verity_io_real_digest(struct dm_verity *v, struct dm_verity_io *io) { @@ -126,7 +118,7 @@ static inline u8 *verity_io_want_digest(struct dm_verity *v, } extern int verity_hash(struct dm_verity *v, struct dm_verity_io *io, - const u8 *data, size_t len, u8 *digest, bool may_sleep); + const u8 *data, size_t len, u8 *digest); extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, sector_t block, u8 *digest, bool *is_zero); diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index a428e1cacf07..d8de4a3076a1 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -13,7 +13,6 @@ #include <linux/dm-io.h> #include <linux/dm-kcopyd.h> #include <linux/dax.h> -#include <linux/pfn_t.h> #include <linux/libnvdimm.h> #include <linux/delay.h> #include "dm-io-tracker.h" @@ -256,7 +255,7 @@ static int persistent_memory_claim(struct dm_writecache *wc) int r; loff_t s; long p, da; - pfn_t pfn; + unsigned long pfn; int id; struct page **pages; sector_t offset; @@ -290,7 +289,7 @@ static int persistent_memory_claim(struct dm_writecache *wc) r = da; goto err2; } - if (!pfn_t_has_page(pfn)) { + if (!pfn_valid(pfn)) { wc->memory_map = NULL; r = -EOPNOTSUPP; goto err2; @@ -314,13 +313,13 @@ static int persistent_memory_claim(struct dm_writecache *wc) r = daa ? 
daa : -EINVAL; goto err3; } - if (!pfn_t_has_page(pfn)) { + if (!pfn_valid(pfn)) { r = -EOPNOTSUPP; goto err3; } while (daa-- && i < p) { - pages[i++] = pfn_t_to_page(pfn); - pfn.val++; + pages[i++] = pfn_to_page(pfn); + pfn++; if (!(i & 15)) cond_resched(); } diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 3d31b82e0730..78e17dd4d01b 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -467,8 +467,6 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone) bdev_offset_from_zone_start(disk->part0, clone->bi_iter.bi_sector); } - - return; } static int dm_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx, diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 5da3db06da10..9da329078ea4 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -1062,7 +1062,7 @@ static int dmz_iterate_devices(struct dm_target *ti, struct dmz_target *dmz = ti->private; unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata); sector_t capacity; - int i, r; + int i, r = 0; for (i = 0; i < dmz->nr_ddevs; i++) { capacity = dmz->dev[i].capacity & ~(zone_nr_sectors - 1); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index abfe0392b5a4..7bd6fa05b00a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -403,9 +403,9 @@ static void do_deferred_remove(struct work_struct *w) dm_deferred_remove(); } -static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int dm_blk_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mapped_device *md = bdev->bd_disk->private_data; + struct mapped_device *md = disk->private_data; return dm_get_geometry(md, geo); } @@ -1024,10 +1024,8 @@ static void dm_wq_requeue_work(struct work_struct *work) * * 2) io->orig_bio points to new cloned bio which matches the requeued dm_io. 
*/ -static void dm_io_complete(struct dm_io *io) +static inline void dm_io_complete(struct dm_io *io) { - bool first_requeue; - /* * Only dm_io that has been split needs two stage requeue, otherwise * we may run into long bio clone chain during suspend and OOM could @@ -1036,12 +1034,7 @@ static void dm_io_complete(struct dm_io *io) * Also flush data dm_io won't be marked as DM_IO_WAS_SPLIT, so they * also aren't handled via the first stage requeue. */ - if (dm_io_flagged(io, DM_IO_WAS_SPLIT)) - first_requeue = true; - else - first_requeue = false; - - __dm_io_complete(io, first_requeue); + __dm_io_complete(io, dm_io_flagged(io, DM_IO_WAS_SPLIT)); } /* @@ -1218,7 +1211,7 @@ static struct dm_target *dm_dax_get_live_target(struct mapped_device *md, static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, - pfn_t *pfn) + unsigned long *pfn) { struct mapped_device *md = dax_get_private(dax_dev); sector_t sector = pgoff * PAGE_SECTORS; diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 7f524a26cebc..84b7e2af6dba 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -34,15 +34,6 @@ #include "md-bitmap.h" #include "md-cluster.h" -#define BITMAP_MAJOR_LO 3 -/* version 4 insists the bitmap is in little-endian order - * with version 3, it is host-endian which is non-portable - * Version 5 is currently set only for clustered devices - */ -#define BITMAP_MAJOR_HI 4 -#define BITMAP_MAJOR_CLUSTERED 5 -#define BITMAP_MAJOR_HOSTENDIAN 3 - /* * in-memory bitmap: * @@ -224,6 +215,8 @@ struct bitmap { int cluster_slot; }; +static struct workqueue_struct *md_bitmap_wq; + static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks, int chunksize, bool init); @@ -232,20 +225,19 @@ static inline char *bmname(struct bitmap *bitmap) return bitmap->mddev ? 
mdname(bitmap->mddev) : "mdX"; } -static bool __bitmap_enabled(struct bitmap *bitmap) -{ - return bitmap->storage.filemap && - !test_bit(BITMAP_STALE, &bitmap->flags); -} - -static bool bitmap_enabled(struct mddev *mddev) +static bool bitmap_enabled(void *data, bool flush) { - struct bitmap *bitmap = mddev->bitmap; + struct bitmap *bitmap = data; - if (!bitmap) - return false; + if (!flush) + return true; - return __bitmap_enabled(bitmap); + /* + * If caller want to flush bitmap pages to underlying disks, check if + * there are cached pages in filemap. + */ + return !test_bit(BITMAP_STALE, &bitmap->flags) && + bitmap->storage.filemap != NULL; } /* @@ -484,7 +476,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, return -EINVAL; } - md_super_write(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit), page); + md_write_metadata(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit), + page, 0); return 0; } @@ -1244,7 +1237,7 @@ static void __bitmap_unplug(struct bitmap *bitmap) int dirty, need_write; int writing = 0; - if (!__bitmap_enabled(bitmap)) + if (!bitmap_enabled(bitmap, true)) return; /* look at each page to see if there are any set bits that need to be @@ -1788,15 +1781,9 @@ static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, bool degraded) { bitmap_counter_t *bmc; - bool rv; + bool rv = false; - if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */ - *blocks = 1024; - return true; /* always resync if no bitmap */ - } spin_lock_irq(&bitmap->counts.lock); - - rv = false; bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0); if (bmc) { /* locked */ @@ -1845,10 +1832,6 @@ static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset, bitmap_counter_t *bmc; unsigned long flags; - if (bitmap == NULL) { - *blocks = 1024; - return; - } spin_lock_irqsave(&bitmap->counts.lock, flags); bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0); if (bmc == NULL) @@ -1987,12 
+1970,12 @@ static void bitmap_dirty_bits(struct mddev *mddev, unsigned long s, md_bitmap_set_memory_bits(bitmap, sec, 1); md_bitmap_file_set_bit(bitmap, sec); - if (sec < bitmap->mddev->recovery_cp) + if (sec < bitmap->mddev->resync_offset) /* We are asserting that the array is dirty, - * so move the recovery_cp address back so + * so move the resync_offset address back so * that it is obvious that it is dirty */ - bitmap->mddev->recovery_cp = sec; + bitmap->mddev->resync_offset = sec; } } @@ -2060,9 +2043,6 @@ static void bitmap_start_behind_write(struct mddev *mddev) struct bitmap *bitmap = mddev->bitmap; int bw; - if (!bitmap) - return; - atomic_inc(&bitmap->behind_writes); bw = atomic_read(&bitmap->behind_writes); if (bw > bitmap->behind_writes_used) @@ -2076,9 +2056,6 @@ static void bitmap_end_behind_write(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; - if (!bitmap) - return; - if (atomic_dec_and_test(&bitmap->behind_writes)) wake_up(&bitmap->behind_wait); pr_debug("dec write-behind count %d/%lu\n", @@ -2258,7 +2235,7 @@ static int bitmap_load(struct mddev *mddev) || bitmap->events_cleared == mddev->events) /* no need to keep dirty bits to optimise a * re-add of a missing device */ - start = mddev->recovery_cp; + start = mddev->resync_offset; mutex_lock(&mddev->bitmap_info.mutex); err = md_bitmap_init_from_disk(bitmap, start); @@ -2593,15 +2570,14 @@ err: return ret; } -static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize, - bool init) +static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) return 0; - return __bitmap_resize(bitmap, blocks, chunksize, init); + return __bitmap_resize(bitmap, blocks, chunksize, false); } static ssize_t @@ -2990,12 +2966,19 @@ static struct attribute *md_bitmap_attrs[] = { &max_backlog_used.attr, NULL }; -const struct attribute_group md_bitmap_group = { + +static struct attribute_group md_bitmap_group = { .name = 
"bitmap", .attrs = md_bitmap_attrs, }; static struct bitmap_operations bitmap_ops = { + .head = { + .type = MD_BITMAP, + .id = ID_BITMAP, + .name = "bitmap", + }, + .enabled = bitmap_enabled, .create = bitmap_create, .resize = bitmap_resize, @@ -3013,6 +2996,9 @@ static struct bitmap_operations bitmap_ops = { .start_write = bitmap_start_write, .end_write = bitmap_end_write, + .start_discard = bitmap_start_write, + .end_discard = bitmap_end_write, + .start_sync = bitmap_start_sync, .end_sync = bitmap_end_sync, .cond_end_sync = bitmap_cond_end_sync, @@ -3026,9 +3012,22 @@ static struct bitmap_operations bitmap_ops = { .copy_from_slot = bitmap_copy_from_slot, .set_pages = bitmap_set_pages, .free = md_bitmap_free, + + .group = &md_bitmap_group, }; -void mddev_set_bitmap_ops(struct mddev *mddev) +int md_bitmap_init(void) +{ + md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND, + 0); + if (!md_bitmap_wq) + return -ENOMEM; + + return register_md_submodule(&bitmap_ops.head); +} + +void md_bitmap_exit(void) { - mddev->bitmap_ops = &bitmap_ops; + destroy_workqueue(md_bitmap_wq); + unregister_md_submodule(&bitmap_ops.head); } diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 59e9dd45cfde..b42a28fa83a0 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -9,10 +9,26 @@ #define BITMAP_MAGIC 0x6d746962 +/* + * version 3 is host-endian order, this is deprecated and not used for new + * array + */ +#define BITMAP_MAJOR_LO 3 +#define BITMAP_MAJOR_HOSTENDIAN 3 +/* version 4 is little-endian order, the default value */ +#define BITMAP_MAJOR_HI 4 +/* version 5 is only used for cluster */ +#define BITMAP_MAJOR_CLUSTERED 5 +/* version 6 is only used for lockless bitmap */ +#define BITMAP_MAJOR_LOCKLESS 6 + /* use these for bitmap->flags and bitmap->sb->state bit-fields */ enum bitmap_state { - BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ + BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ 
BITMAP_WRITE_ERROR = 2, /* A write error has occurred */ + BITMAP_FIRST_USE = 3, /* llbitmap is just created */ + BITMAP_CLEAN = 4, /* llbitmap is created with assume_clean */ + BITMAP_DAEMON_BUSY = 5, /* llbitmap daemon is not finished after daemon_sleep */ BITMAP_HOSTENDIAN =15, }; @@ -61,11 +77,15 @@ struct md_bitmap_stats { struct file *file; }; +typedef void (md_bitmap_fn)(struct mddev *mddev, sector_t offset, + unsigned long sectors); + struct bitmap_operations { - bool (*enabled)(struct mddev *mddev); + struct md_submodule_head head; + + bool (*enabled)(void *data, bool flush); int (*create)(struct mddev *mddev); - int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize, - bool init); + int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize); int (*load)(struct mddev *mddev); void (*destroy)(struct mddev *mddev); @@ -80,10 +100,13 @@ struct bitmap_operations { void (*end_behind_write)(struct mddev *mddev); void (*wait_behind_writes)(struct mddev *mddev); - void (*start_write)(struct mddev *mddev, sector_t offset, - unsigned long sectors); - void (*end_write)(struct mddev *mddev, sector_t offset, - unsigned long sectors); + md_bitmap_fn *start_write; + md_bitmap_fn *end_write; + md_bitmap_fn *start_discard; + md_bitmap_fn *end_discard; + + sector_t (*skip_sync_blocks)(struct mddev *mddev, sector_t offset); + bool (*blocks_synced)(struct mddev *mddev, sector_t offset); bool (*start_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks, bool degraded); void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks); @@ -101,9 +124,75 @@ struct bitmap_operations { sector_t *hi, bool clear_bits); void (*set_pages)(void *data, unsigned long pages); void (*free)(void *data); + + struct attribute_group *group; }; /* the bitmap API */ -void mddev_set_bitmap_ops(struct mddev *mddev); +static inline bool md_bitmap_registered(struct mddev *mddev) +{ + return mddev->bitmap_ops != NULL; +} + +static inline bool md_bitmap_enabled(struct 
mddev *mddev, bool flush) +{ + /* bitmap_ops must be registered before creating bitmap. */ + if (!md_bitmap_registered(mddev)) + return false; + + if (!mddev->bitmap) + return false; + + return mddev->bitmap_ops->enabled(mddev->bitmap, flush); +} + +static inline bool md_bitmap_start_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks, bool degraded) +{ + /* always resync if no bitmap */ + if (!md_bitmap_enabled(mddev, false)) { + *blocks = 1024; + return true; + } + + return mddev->bitmap_ops->start_sync(mddev, offset, blocks, degraded); +} + +static inline void md_bitmap_end_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks) +{ + if (!md_bitmap_enabled(mddev, false)) { + *blocks = 1024; + return; + } + + mddev->bitmap_ops->end_sync(mddev, offset, blocks); +} + +#ifdef CONFIG_MD_BITMAP +int md_bitmap_init(void); +void md_bitmap_exit(void); +#else +static inline int md_bitmap_init(void) +{ + return 0; +} +static inline void md_bitmap_exit(void) +{ +} +#endif + +#ifdef CONFIG_MD_LLBITMAP +int md_llbitmap_init(void); +void md_llbitmap_exit(void); +#else +static inline int md_llbitmap_init(void) +{ + return 0; +} +static inline void md_llbitmap_exit(void) +{ +} +#endif #endif diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 94221d964d4f..11f1e91d387d 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -337,11 +337,11 @@ static void recover_bitmaps(struct md_thread *thread) md_wakeup_thread(mddev->sync_thread); if (hi > 0) { - if (lo < mddev->recovery_cp) - mddev->recovery_cp = lo; + if (lo < mddev->resync_offset) + mddev->resync_offset = lo; /* wake up thread to continue resync in case resync * is not finished */ - if (mddev->recovery_cp != MaxSector) { + if (mddev->resync_offset != MaxSector) { /* * clear the REMOTE flag since we will launch * resync thread in current node. 
@@ -630,7 +630,7 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0)) ret = mddev->bitmap_ops->resize(mddev, le64_to_cpu(msg->high), - 0, false); + 0); break; default: ret = -1; @@ -863,9 +863,9 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) lockres_free(bm_lockres); continue; } - if ((hi > 0) && (lo < mddev->recovery_cp)) { + if ((hi > 0) && (lo < mddev->resync_offset)) { set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - mddev->recovery_cp = lo; + mddev->resync_offset = lo; md_check_recovery(mddev); } @@ -979,7 +979,7 @@ err: lockres_free(cinfo->resync_lockres); lockres_free(cinfo->bitmap_lockres); if (cinfo->lockspace) - dlm_release_lockspace(cinfo->lockspace, 2); + dlm_release_lockspace(cinfo->lockspace, DLM_RELEASE_NORMAL); mddev->cluster_info = NULL; kfree(cinfo); return ret; @@ -1027,7 +1027,7 @@ static int leave(struct mddev *mddev) * Also, we should send BITMAP_NEEDS_SYNC message in * case reshaping is interrupted. 
*/ - if ((cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) || + if ((cinfo->slot_number > 0 && mddev->resync_offset != MaxSector) || (mddev->reshape_position != MaxSector && test_bit(MD_CLOSING, &mddev->flags))) resync_bitmap(mddev); @@ -1042,7 +1042,7 @@ static int leave(struct mddev *mddev) lockres_free(cinfo->resync_lockres); lockres_free(cinfo->bitmap_lockres); unlock_all_bitmaps(mddev); - dlm_release_lockspace(cinfo->lockspace, 2); + dlm_release_lockspace(cinfo->lockspace, DLM_RELEASE_NORMAL); kfree(cinfo); return 0; } @@ -1605,8 +1605,8 @@ static int gather_bitmaps(struct md_rdev *rdev) pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn); goto out; } - if ((hi > 0) && (lo < mddev->recovery_cp)) - mddev->recovery_cp = lo; + if ((hi > 0) && (lo < mddev->resync_offset)) + mddev->resync_offset = lo; } out: return err; diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 5d9b08115375..7033d982d377 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -73,6 +73,7 @@ static int linear_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_hw_sectors = mddev->chunk_sectors; lim.max_write_zeroes_sectors = mddev->chunk_sectors; + lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) @@ -256,18 +257,10 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) if (unlikely(bio_end_sector(bio) > end_sector)) { /* This bio crosses a device boundary, so we have to split it */ - struct bio *split = bio_split(bio, end_sector - bio_sector, - GFP_NOIO, &mddev->bio_set); - - if (IS_ERR(split)) { - bio->bi_status = errno_to_blk_status(PTR_ERR(split)); - bio_endio(bio); + bio = bio_submit_split_bioset(bio, end_sector - bio_sector, + &mddev->bio_set); + if (!bio) return true; - } - - bio_chain(split, bio); - submit_bio_noacct(bio); - bio = split; } md_account_bio(mddev, &bio); diff 
--git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c new file mode 100644 index 000000000000..1eb434306162 --- /dev/null +++ b/drivers/md/md-llbitmap.c @@ -0,0 +1,1626 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/blkdev.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/timer.h> +#include <linux/sched.h> +#include <linux/list.h> +#include <linux/file.h> +#include <linux/seq_file.h> +#include <trace/events/block.h> + +#include "md.h" +#include "md-bitmap.h" + +/* + * #### Background + * + * Redundant data is used to enhance data fault tolerance, and the storage + * methods for redundant data vary depending on the RAID levels. And it's + * important to maintain the consistency of redundant data. + * + * Bitmap is used to record which data blocks have been synchronized and which + * ones need to be resynchronized or recovered. Each bit in the bitmap + * represents a segment of data in the array. When a bit is set, it indicates + * that the multiple redundant copies of that data segment may not be + * consistent. Data synchronization can be performed based on the bitmap after + * power failure or readding a disk. If there is no bitmap, a full disk + * synchronization is required. + * + * #### Key Features + * + * - IO fastpath is lockless, if user issues lots of write IO to the same + * bitmap bit in a short time, only the first write has additional overhead + * to update bitmap bit, no additional overhead for the following writes; + * - support only resync or recover written data, means in the case creating + * new array or replacing with a new disk, there is no need to do a full disk + * resync/recovery; + * + * #### Key Concept + * + * ##### State Machine + * + * Each bit is one byte, contain 6 different states, see llbitmap_state. 
And + * there are 8 different actions in total (see llbitmap_action) that can change state: + * + * llbitmap state machine: transitions between states + * + * | | Startwrite | Startsync | Endsync | Abortsync | + * | --------- | ---------- | --------- | ------- | --------- | + * | Unwritten | Dirty | x | x | x | + * | Clean | Dirty | x | x | x | + * | Dirty | x | x | x | x | + * | NeedSync | x | Syncing | x | x | + * | Syncing | x | Syncing | Dirty | NeedSync | + * + * | | Reload | Daemon | Discard | Stale | + * | --------- | -------- | ------ | --------- | --------- | + * | Unwritten | x | x | x | x | + * | Clean | x | x | Unwritten | NeedSync | + * | Dirty | NeedSync | Clean | Unwritten | NeedSync | + * | NeedSync | x | x | Unwritten | x | + * | Syncing | NeedSync | x | Unwritten | NeedSync | + * + * Typical scenarios: + * + * 1) Create new array + * All bits will be set to Unwritten by default, if --assume-clean is set, + * all bits will be set to Clean instead. + * + * 2) write data, raid1/raid10 have a full copy of the data, while raid456 doesn't and + * relies on xor data + * + * 2.1) write new data to raid1/raid10: + * Unwritten --StartWrite--> Dirty + * + * 2.2) write new data to raid456: + * Unwritten --StartWrite--> NeedSync + * + * Because the initial recover for raid456 is skipped, the xor data is not built + * yet, the bit must be set to NeedSync first and after lazy initial recover is + * finished, the bit will finally be set to Dirty (see 5.1 and 5.4); + * + * 2.3) cover write + * Clean --StartWrite--> Dirty + * + * 3) daemon, if the array is not degraded: + * Dirty --Daemon--> Clean + * + * 4) discard + * {Clean, Dirty, NeedSync, Syncing} --Discard--> Unwritten + * + * 5) resync and recover + * + * 5.1) common process + * NeedSync --Startsync--> Syncing --Endsync--> Dirty --Daemon--> Clean + * + * 5.2) resync after power failure + * Dirty --Reload--> NeedSync + * + * 5.3) recover while replacing with a new disk + * By default, the old bitmap framework will recover all
data, and llbitmap + * implements this by a new helper, see llbitmap_skip_sync_blocks: + * + * skip recover for bits other than dirty or clean; + * + * 5.4) lazy initial recover for raid5: + * By default, the old bitmap framework will only allow new recover when there + * are spares(new disk), a new recovery flag MD_RECOVERY_LAZY_RECOVER is added + * to perform raid456 lazy recover for set bits(from 2.2). + * + * 6. special handling for degraded array: + * + * - Dirty bits will never be cleared, daemon will just do nothing, so that if + * a disk is readded, Clean bits can be skipped with recovery; + * - Dirty bits will convert to Syncing from start write, to do data recovery + * for new added disks; + * - New write will convert bits to NeedSync directly; + * + * ##### Bitmap IO + * + * ##### Chunksize + * + * The default bitmap size is 128k, including 1k bitmap super block, and + * the default size of the data segment covered by each bit (chunksize) is 64k, + * and chunksize will be adjusted to twice the old size each time if the total number of + * bits is not less than 127k (see llbitmap_init). + * + * ##### READ + * + * While creating bitmap, all pages will be allocated and read for llbitmap, + * there won't be any reads afterwards + * + * ##### WRITE + * + * WRITE IO is divided into units of the array's logical_block_size, the dirty state + * of each block is tracked independently, for example: + * + * each page is 4k and contains 8 blocks; each block is 512 bytes and contains 512 bits; + * + * | page0 | page1 | ... | page 31 | + * | | + * | \-----------------------\ + * | | + * | block0 | block1 | ... | block7 | + * | | + * | \-----------------\ + * | | + * | bit0 | bit1 | ... | bit511 | + * + * From IO path, if one bit is changed to Dirty or NeedSync, the corresponding + * subpage will be marked dirty, and that block must be written before the IO is + * issued.
This behaviour will affect IO performance, to reduce the impact, if + * multiple bits are changed in the same block in a short time, all bits in this + * block will be changed to Dirty/NeedSync, so that there won't be any overhead + * until daemon clears dirty bits. + * + * ##### Dirty Bits synchronization + * + * IO fast path will set bits to dirty, and those dirty bits will be cleared + * by daemon after IO is done. llbitmap_page_ctl is used to synchronize between + * IO path and daemon; + * + * IO path: + * 1) try to grab a reference, if succeed, set expire time after 5s and return; + * 2) if failed to grab a reference, wait for daemon to finish clearing dirty + * bits; + * + * Daemon (Daemon will be woken up every daemon_sleep seconds): + * For each page: + * 1) check if page expired, if not skip this page; for expired page: + * 2) suspend the page and wait for inflight write IO to be done; + * 3) change dirty page to clean; + * 4) resume the page; + */ + +#define BITMAP_DATA_OFFSET 1024 + +/* 64k is the max IO size of sync IO for raid1/raid10 */ +#define MIN_CHUNK_SIZE (64 * 2) + +/* By default, daemon will be woken up every 30s */ +#define DEFAULT_DAEMON_SLEEP 30 + +/* + * Dirtied bits that have not been accessed for more than 5s will be cleared + * by daemon. 
+ */ +#define DEFAULT_BARRIER_IDLE 5 + +enum llbitmap_state { + /* No valid data, init state after assemble the array */ + BitUnwritten = 0, + /* data is consistent */ + BitClean, + /* data will be consistent after IO is done, set directly for writes */ + BitDirty, + /* + * data need to be resynchronized: + * 1) set directly for writes if array is degraded, prevent full disk + * synchronization after readding a disk; + * 2) reassemble the array after power failure, and dirty bits are + * found after reloading the bitmap; + * 3) set for first write for raid5, to build initial xor data lazily + */ + BitNeedSync, + /* data is synchronizing */ + BitSyncing, + BitStateCount, + BitNone = 0xff, +}; + +enum llbitmap_action { + /* User write new data, this is the only action from IO fast path */ + BitmapActionStartwrite = 0, + /* Start recovery */ + BitmapActionStartsync, + /* Finish recovery */ + BitmapActionEndsync, + /* Failed recovery */ + BitmapActionAbortsync, + /* Reassemble the array */ + BitmapActionReload, + /* Daemon thread is trying to clear dirty bits */ + BitmapActionDaemon, + /* Data is deleted */ + BitmapActionDiscard, + /* + * Bitmap is stale, mark all bits in addition to BitUnwritten to + * BitNeedSync. 
+ */ + BitmapActionStale, + BitmapActionCount, + /* Init state is BitUnwritten */ + BitmapActionInit, +}; + +enum llbitmap_page_state { + LLPageFlush = 0, + LLPageDirty, +}; + +struct llbitmap_page_ctl { + char *state; + struct page *page; + unsigned long expire; + unsigned long flags; + wait_queue_head_t wait; + struct percpu_ref active; + /* Per block size dirty state, maximum 64k page / 1 sector = 128 */ + unsigned long dirty[]; +}; + +struct llbitmap { + struct mddev *mddev; + struct llbitmap_page_ctl **pctl; + + unsigned int nr_pages; + unsigned int io_size; + unsigned int blocks_per_page; + + /* shift of one chunk */ + unsigned long chunkshift; + /* size of one chunk in sector */ + unsigned long chunksize; + /* total number of chunks */ + unsigned long chunks; + unsigned long last_end_sync; + /* + * time in seconds that dirty bits will be cleared if the page is not + * accessed. + */ + unsigned long barrier_idle; + /* fires on first BitDirty state */ + struct timer_list pending_timer; + struct work_struct daemon_work; + + unsigned long flags; + __u64 events_cleared; + + /* for slow disks */ + atomic_t behind_writes; + wait_queue_head_t behind_wait; +}; + +struct llbitmap_unplug_work { + struct work_struct work; + struct llbitmap *llbitmap; + struct completion *done; +}; + +static struct workqueue_struct *md_llbitmap_io_wq; +static struct workqueue_struct *md_llbitmap_unplug_wq; + +static char state_machine[BitStateCount][BitmapActionCount] = { + [BitUnwritten] = { + [BitmapActionStartwrite] = BitDirty, + [BitmapActionStartsync] = BitNone, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitNone, + [BitmapActionReload] = BitNone, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitNone, + [BitmapActionStale] = BitNone, + }, + [BitClean] = { + [BitmapActionStartwrite] = BitDirty, + [BitmapActionStartsync] = BitNone, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitNone, + [BitmapActionReload] = BitNone, + 
[BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitNeedSync, + }, + [BitDirty] = { + [BitmapActionStartwrite] = BitNone, + [BitmapActionStartsync] = BitNone, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitNone, + [BitmapActionReload] = BitNeedSync, + [BitmapActionDaemon] = BitClean, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitNeedSync, + }, + [BitNeedSync] = { + [BitmapActionStartwrite] = BitNone, + [BitmapActionStartsync] = BitSyncing, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitNone, + [BitmapActionReload] = BitNone, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitNone, + }, + [BitSyncing] = { + [BitmapActionStartwrite] = BitNone, + [BitmapActionStartsync] = BitSyncing, + [BitmapActionEndsync] = BitDirty, + [BitmapActionAbortsync] = BitNeedSync, + [BitmapActionReload] = BitNeedSync, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitNeedSync, + }, +}; + +static void __llbitmap_flush(struct mddev *mddev); + +static enum llbitmap_state llbitmap_read(struct llbitmap *llbitmap, loff_t pos) +{ + unsigned int idx; + unsigned int offset; + + pos += BITMAP_DATA_OFFSET; + idx = pos >> PAGE_SHIFT; + offset = offset_in_page(pos); + + return llbitmap->pctl[idx]->state[offset]; +} + +/* set all the bits in the subpage as dirty */ +static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap, + struct llbitmap_page_ctl *pctl, + unsigned int block) +{ + bool level_456 = raid_is_456(llbitmap->mddev); + unsigned int io_size = llbitmap->io_size; + int pos; + + for (pos = block * io_size; pos < (block + 1) * io_size; pos++) { + switch (pctl->state[pos]) { + case BitUnwritten: + pctl->state[pos] = level_456 ? 
BitNeedSync : BitDirty; + break; + case BitClean: + pctl->state[pos] = BitDirty; + break; + }; + } +} + +static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx, + int offset) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; + unsigned int io_size = llbitmap->io_size; + int block = offset / io_size; + int pos; + + if (!test_bit(LLPageDirty, &pctl->flags)) + set_bit(LLPageDirty, &pctl->flags); + + /* + * For degraded array, dirty bits will never be cleared, and we must + * resync all the dirty bits, hence skip infect new dirty bits to + * prevent resync unnecessary data. + */ + if (llbitmap->mddev->degraded) { + set_bit(block, pctl->dirty); + return; + } + + /* + * The subpage usually contains a total of 512 bits. If any single bit + * within the subpage is marked as dirty, the entire sector will be + * written. To avoid impacting write performance, when multiple bits + * within the same sector are modified within llbitmap->barrier_idle, + * all bits in the sector will be collectively marked as dirty at once. 
+ */ + if (test_and_set_bit(block, pctl->dirty)) { + llbitmap_infect_dirty_bits(llbitmap, pctl, block); + return; + } + + for (pos = block * io_size; pos < (block + 1) * io_size; pos++) { + if (pos == offset) + continue; + if (pctl->state[pos] == BitDirty || + pctl->state[pos] == BitNeedSync) { + llbitmap_infect_dirty_bits(llbitmap, pctl, block); + return; + } + } +} + +static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state, + loff_t pos) +{ + unsigned int idx; + unsigned int bit; + + pos += BITMAP_DATA_OFFSET; + idx = pos >> PAGE_SHIFT; + bit = offset_in_page(pos); + + llbitmap->pctl[idx]->state[bit] = state; + if (state == BitDirty || state == BitNeedSync) + llbitmap_set_page_dirty(llbitmap, idx, bit); +} + +static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx) +{ + struct mddev *mddev = llbitmap->mddev; + struct page *page = NULL; + struct md_rdev *rdev; + + if (llbitmap->pctl && llbitmap->pctl[idx]) + page = llbitmap->pctl[idx]->page; + if (page) + return page; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + return ERR_PTR(-ENOMEM); + + rdev_for_each(rdev, mddev) { + sector_t sector; + + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + sector = mddev->bitmap_info.offset + + (idx << PAGE_SECTORS_SHIFT); + + if (sync_page_io(rdev, sector, PAGE_SIZE, page, REQ_OP_READ, + true)) + return page; + + md_error(mddev, rdev); + } + + __free_page(page); + return ERR_PTR(-EIO); +} + +static void llbitmap_write_page(struct llbitmap *llbitmap, int idx) +{ + struct page *page = llbitmap->pctl[idx]->page; + struct mddev *mddev = llbitmap->mddev; + struct md_rdev *rdev; + int block; + + for (block = 0; block < llbitmap->blocks_per_page; block++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; + + if (!test_and_clear_bit(block, pctl->dirty)) + continue; + + rdev_for_each(rdev, mddev) { + sector_t sector; + sector_t bit_sector = llbitmap->io_size >> SECTOR_SHIFT; + + if 
(rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + sector = mddev->bitmap_info.offset + rdev->sb_start + + (idx << PAGE_SECTORS_SHIFT) + + block * bit_sector; + md_write_metadata(mddev, rdev, sector, + llbitmap->io_size, page, + block * llbitmap->io_size); + } + } +} + +static void active_release(struct percpu_ref *ref) +{ + struct llbitmap_page_ctl *pctl = + container_of(ref, struct llbitmap_page_ctl, active); + + wake_up(&pctl->wait); +} + +static void llbitmap_free_pages(struct llbitmap *llbitmap) +{ + int i; + + if (!llbitmap->pctl) + return; + + for (i = 0; i < llbitmap->nr_pages; i++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; + + if (!pctl || !pctl->page) + break; + + __free_page(pctl->page); + percpu_ref_exit(&pctl->active); + } + + kfree(llbitmap->pctl[0]); + kfree(llbitmap->pctl); + llbitmap->pctl = NULL; +} + +static int llbitmap_cache_pages(struct llbitmap *llbitmap) +{ + struct llbitmap_page_ctl *pctl; + unsigned int nr_pages = DIV_ROUND_UP(llbitmap->chunks + + BITMAP_DATA_OFFSET, PAGE_SIZE); + unsigned int size = struct_size(pctl, dirty, BITS_TO_LONGS( + llbitmap->blocks_per_page)); + int i; + + llbitmap->pctl = kmalloc_array(nr_pages, sizeof(void *), + GFP_KERNEL | __GFP_ZERO); + if (!llbitmap->pctl) + return -ENOMEM; + + size = round_up(size, cache_line_size()); + pctl = kmalloc_array(nr_pages, size, GFP_KERNEL | __GFP_ZERO); + if (!pctl) { + kfree(llbitmap->pctl); + return -ENOMEM; + } + + llbitmap->nr_pages = nr_pages; + + for (i = 0; i < nr_pages; i++, pctl = (void *)pctl + size) { + struct page *page = llbitmap_read_page(llbitmap, i); + + llbitmap->pctl[i] = pctl; + + if (IS_ERR(page)) { + llbitmap_free_pages(llbitmap); + return PTR_ERR(page); + } + + if (percpu_ref_init(&pctl->active, active_release, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { + __free_page(page); + llbitmap_free_pages(llbitmap); + return -ENOMEM; + } + + pctl->page = page; + pctl->state = page_address(page); + init_waitqueue_head(&pctl->wait); + 
} + + return 0; +} + +static void llbitmap_init_state(struct llbitmap *llbitmap) +{ + enum llbitmap_state state = BitUnwritten; + unsigned long i; + + if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) + state = BitClean; + + for (i = 0; i < llbitmap->chunks; i++) + llbitmap_write(llbitmap, state, i); +} + +/* The return value is only used from resync, where @start == @end. */ +static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap, + unsigned long start, + unsigned long end, + enum llbitmap_action action) +{ + struct mddev *mddev = llbitmap->mddev; + enum llbitmap_state state = BitNone; + bool level_456 = raid_is_456(llbitmap->mddev); + bool need_resync = false; + bool need_recovery = false; + + if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) + return BitNone; + + if (action == BitmapActionInit) { + llbitmap_init_state(llbitmap); + return BitNone; + } + + while (start <= end) { + enum llbitmap_state c = llbitmap_read(llbitmap, start); + + if (c < 0 || c >= BitStateCount) { + pr_err("%s: invalid bit %lu state %d action %d, forcing resync\n", + __func__, start, c, action); + state = BitNeedSync; + goto write_bitmap; + } + + if (c == BitNeedSync) + need_resync = !mddev->degraded; + + state = state_machine[c][action]; + +write_bitmap: + if (unlikely(mddev->degraded)) { + /* For degraded array, mark new data as need sync. */ + if (state == BitDirty && + action == BitmapActionStartwrite) + state = BitNeedSync; + /* + * For degraded array, resync dirty data as well, noted + * if array is still degraded after resync is done, all + * new data will still be dirty until array is clean. + */ + else if (c == BitDirty && + action == BitmapActionStartsync) + state = BitSyncing; + } else if (c == BitUnwritten && state == BitDirty && + action == BitmapActionStartwrite && level_456) { + /* Delay raid456 initial recovery to first write. 
*/ + state = BitNeedSync; + } + + if (state == BitNone) { + start++; + continue; + } + + llbitmap_write(llbitmap, state, start); + + if (state == BitNeedSync) + need_resync = !mddev->degraded; + else if (state == BitDirty && + !timer_pending(&llbitmap->pending_timer)) + mod_timer(&llbitmap->pending_timer, + jiffies + mddev->bitmap_info.daemon_sleep * HZ); + + start++; + } + + if (need_resync && level_456) + need_recovery = true; + + if (need_recovery) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + set_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } else if (need_resync) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + + return state; +} + +static void llbitmap_raise_barrier(struct llbitmap *llbitmap, int page_idx) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; + +retry: + if (likely(percpu_ref_tryget_live(&pctl->active))) { + WRITE_ONCE(pctl->expire, jiffies + llbitmap->barrier_idle * HZ); + return; + } + + wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active)); + goto retry; +} + +static void llbitmap_release_barrier(struct llbitmap *llbitmap, int page_idx) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; + + percpu_ref_put(&pctl->active); +} + +static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; + + percpu_ref_kill(&pctl->active); + + if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active), + llbitmap->mddev->bitmap_info.daemon_sleep * HZ)) + return -ETIMEDOUT; + + return 0; +} + +static void llbitmap_resume(struct llbitmap *llbitmap, int page_idx) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; + + pctl->expire = LONG_MAX; + percpu_ref_resurrect(&pctl->active); + wake_up(&pctl->wait); +} + +static int llbitmap_check_support(struct mddev *mddev) +{ + if 
(test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + pr_notice("md/llbitmap: %s: array with journal cannot have bitmap\n", + mdname(mddev)); + return -EBUSY; + } + + if (mddev->bitmap_info.space == 0) { + if (mddev->bitmap_info.default_space == 0) { + pr_notice("md/llbitmap: %s: no space for bitmap\n", + mdname(mddev)); + return -ENOSPC; + } + } + + if (!mddev->persistent) { + pr_notice("md/llbitmap: %s: array must be persistent\n", + mdname(mddev)); + return -EOPNOTSUPP; + } + + if (mddev->bitmap_info.file) { + pr_notice("md/llbitmap: %s: doesn't support bitmap file\n", + mdname(mddev)); + return -EOPNOTSUPP; + } + + if (mddev->bitmap_info.external) { + pr_notice("md/llbitmap: %s: doesn't support external metadata\n", + mdname(mddev)); + return -EOPNOTSUPP; + } + + if (mddev_is_dm(mddev)) { + pr_notice("md/llbitmap: %s: doesn't support dm-raid\n", + mdname(mddev)); + return -EOPNOTSUPP; + } + + return 0; +} + +static int llbitmap_init(struct llbitmap *llbitmap) +{ + struct mddev *mddev = llbitmap->mddev; + sector_t blocks = mddev->resync_max_sectors; + unsigned long chunksize = MIN_CHUNK_SIZE; + unsigned long chunks = DIV_ROUND_UP(blocks, chunksize); + unsigned long space = mddev->bitmap_info.space << SECTOR_SHIFT; + int ret; + + while (chunks > space) { + chunksize = chunksize << 1; + chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize); + } + + llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE; + llbitmap->chunkshift = ffz(~chunksize); + llbitmap->chunksize = chunksize; + llbitmap->chunks = chunks; + mddev->bitmap_info.daemon_sleep = DEFAULT_DAEMON_SLEEP; + + ret = llbitmap_cache_pages(llbitmap); + if (ret) + return ret; + + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, + BitmapActionInit); + /* flush initial llbitmap to disk */ + __llbitmap_flush(mddev); + + return 0; +} + +static int llbitmap_read_sb(struct llbitmap *llbitmap) +{ + struct mddev *mddev = llbitmap->mddev; + unsigned long daemon_sleep; + unsigned long chunksize; + unsigned long events; + struct 
page *sb_page; + bitmap_super_t *sb; + int ret = -EINVAL; + + if (!mddev->bitmap_info.offset) { + pr_err("md/llbitmap: %s: no super block found", mdname(mddev)); + return -EINVAL; + } + + sb_page = llbitmap_read_page(llbitmap, 0); + if (IS_ERR(sb_page)) { + pr_err("md/llbitmap: %s: read super block failed", + mdname(mddev)); + return -EIO; + } + + sb = kmap_local_page(sb_page); + if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) { + pr_err("md/llbitmap: %s: invalid super block magic number", + mdname(mddev)); + goto out_put_page; + } + + if (sb->version != cpu_to_le32(BITMAP_MAJOR_LOCKLESS)) { + pr_err("md/llbitmap: %s: invalid super block version", + mdname(mddev)); + goto out_put_page; + } + + if (memcmp(sb->uuid, mddev->uuid, 16)) { + pr_err("md/llbitmap: %s: bitmap superblock UUID mismatch\n", + mdname(mddev)); + goto out_put_page; + } + + if (mddev->bitmap_info.space == 0) { + int room = le32_to_cpu(sb->sectors_reserved); + + if (room) + mddev->bitmap_info.space = room; + else + mddev->bitmap_info.space = mddev->bitmap_info.default_space; + } + llbitmap->flags = le32_to_cpu(sb->state); + if (test_and_clear_bit(BITMAP_FIRST_USE, &llbitmap->flags)) { + ret = llbitmap_init(llbitmap); + goto out_put_page; + } + + chunksize = le32_to_cpu(sb->chunksize); + if (!is_power_of_2(chunksize)) { + pr_err("md/llbitmap: %s: chunksize not a power of 2", + mdname(mddev)); + goto out_put_page; + } + + if (chunksize < DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, + mddev->bitmap_info.space << SECTOR_SHIFT)) { + pr_err("md/llbitmap: %s: chunksize too small %lu < %llu / %lu", + mdname(mddev), chunksize, mddev->resync_max_sectors, + mddev->bitmap_info.space); + goto out_put_page; + } + + daemon_sleep = le32_to_cpu(sb->daemon_sleep); + if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) { + pr_err("md/llbitmap: %s: daemon sleep %lu period out of range", + mdname(mddev), daemon_sleep); + goto out_put_page; + } + + events = le64_to_cpu(sb->events); + if (events < 
mddev->events) { + pr_warn("md/llbitmap :%s: bitmap file is out of date (%lu < %llu) -- forcing full recovery", + mdname(mddev), events, mddev->events); + set_bit(BITMAP_STALE, &llbitmap->flags); + } + + sb->sync_size = cpu_to_le64(mddev->resync_max_sectors); + mddev->bitmap_info.chunksize = chunksize; + mddev->bitmap_info.daemon_sleep = daemon_sleep; + + llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE; + llbitmap->chunksize = chunksize; + llbitmap->chunks = DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, chunksize); + llbitmap->chunkshift = ffz(~chunksize); + ret = llbitmap_cache_pages(llbitmap); + +out_put_page: + __free_page(sb_page); + kunmap_local(sb); + return ret; +} + +static void llbitmap_pending_timer_fn(struct timer_list *pending_timer) +{ + struct llbitmap *llbitmap = + container_of(pending_timer, struct llbitmap, pending_timer); + + if (work_busy(&llbitmap->daemon_work)) { + pr_warn("md/llbitmap: %s daemon_work not finished in %lu seconds\n", + mdname(llbitmap->mddev), + llbitmap->mddev->bitmap_info.daemon_sleep); + set_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags); + return; + } + + queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work); +} + +static void md_llbitmap_daemon_fn(struct work_struct *work) +{ + struct llbitmap *llbitmap = + container_of(work, struct llbitmap, daemon_work); + unsigned long start; + unsigned long end; + bool restart; + int idx; + + if (llbitmap->mddev->degraded) + return; +retry: + start = 0; + end = min(llbitmap->chunks, PAGE_SIZE - BITMAP_DATA_OFFSET) - 1; + restart = false; + + for (idx = 0; idx < llbitmap->nr_pages; idx++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; + + if (idx > 0) { + start = end + 1; + end = min(end + PAGE_SIZE, llbitmap->chunks - 1); + } + + if (!test_bit(LLPageFlush, &pctl->flags) && + time_before(jiffies, pctl->expire)) { + restart = true; + continue; + } + + if (llbitmap_suspend_timeout(llbitmap, idx) < 0) { + pr_warn("md/llbitmap: %s: %s waiting for page %d timeout\n", + 
mdname(llbitmap->mddev), __func__, idx); + continue; + } + + llbitmap_state_machine(llbitmap, start, end, BitmapActionDaemon); + llbitmap_resume(llbitmap, idx); + } + + /* + * If the daemon took a long time to finish, retry to prevent missing + * clearing dirty bits. + */ + if (test_and_clear_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags)) + goto retry; + + /* If some page is dirty but not expired, setup timer again */ + if (restart) + mod_timer(&llbitmap->pending_timer, + jiffies + llbitmap->mddev->bitmap_info.daemon_sleep * HZ); +} + +static int llbitmap_create(struct mddev *mddev) +{ + struct llbitmap *llbitmap; + int ret; + + ret = llbitmap_check_support(mddev); + if (ret) + return ret; + + llbitmap = kzalloc(sizeof(*llbitmap), GFP_KERNEL); + if (!llbitmap) + return -ENOMEM; + + llbitmap->mddev = mddev; + llbitmap->io_size = bdev_logical_block_size(mddev->gendisk->part0); + llbitmap->blocks_per_page = PAGE_SIZE / llbitmap->io_size; + + timer_setup(&llbitmap->pending_timer, llbitmap_pending_timer_fn, 0); + INIT_WORK(&llbitmap->daemon_work, md_llbitmap_daemon_fn); + atomic_set(&llbitmap->behind_writes, 0); + init_waitqueue_head(&llbitmap->behind_wait); + + mutex_lock(&mddev->bitmap_info.mutex); + mddev->bitmap = llbitmap; + ret = llbitmap_read_sb(llbitmap); + mutex_unlock(&mddev->bitmap_info.mutex); + if (ret) { + kfree(llbitmap); + mddev->bitmap = NULL; + } + + return ret; +} + +static int llbitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long chunks; + + if (chunksize == 0) + chunksize = llbitmap->chunksize; + + /* If there is enough space, leave the chunksize unchanged. 
*/ + chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize); + while (chunks > mddev->bitmap_info.space << SECTOR_SHIFT) { + chunksize = chunksize << 1; + chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize); + } + + llbitmap->chunkshift = ffz(~chunksize); + llbitmap->chunksize = chunksize; + llbitmap->chunks = chunks; + + return 0; +} + +static int llbitmap_load(struct mddev *mddev) +{ + enum llbitmap_action action = BitmapActionReload; + struct llbitmap *llbitmap = mddev->bitmap; + + if (test_and_clear_bit(BITMAP_STALE, &llbitmap->flags)) + action = BitmapActionStale; + + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, action); + return 0; +} + +static void llbitmap_destroy(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + if (!llbitmap) + return; + + mutex_lock(&mddev->bitmap_info.mutex); + + timer_delete_sync(&llbitmap->pending_timer); + flush_workqueue(md_llbitmap_io_wq); + flush_workqueue(md_llbitmap_unplug_wq); + + mddev->bitmap = NULL; + llbitmap_free_pages(llbitmap); + kfree(llbitmap); + mutex_unlock(&mddev->bitmap_info.mutex); +} + +static void llbitmap_start_write(struct mddev *mddev, sector_t offset, + unsigned long sectors) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long start = offset >> llbitmap->chunkshift; + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + + llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite); + + while (page_start <= page_end) { + llbitmap_raise_barrier(llbitmap, page_start); + page_start++; + } +} + +static void llbitmap_end_write(struct mddev *mddev, sector_t offset, + unsigned long sectors) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long start = offset >> llbitmap->chunkshift; + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + int 
page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + + while (page_start <= page_end) { + llbitmap_release_barrier(llbitmap, page_start); + page_start++; + } +} + +static void llbitmap_start_discard(struct mddev *mddev, sector_t offset, + unsigned long sectors) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize); + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + + llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard); + + while (page_start <= page_end) { + llbitmap_raise_barrier(llbitmap, page_start); + page_start++; + } +} + +static void llbitmap_end_discard(struct mddev *mddev, sector_t offset, + unsigned long sectors) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize); + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + + while (page_start <= page_end) { + llbitmap_release_barrier(llbitmap, page_start); + page_start++; + } +} + +static void llbitmap_unplug_fn(struct work_struct *work) +{ + struct llbitmap_unplug_work *unplug_work = + container_of(work, struct llbitmap_unplug_work, work); + struct llbitmap *llbitmap = unplug_work->llbitmap; + struct blk_plug plug; + int i; + + blk_start_plug(&plug); + + for (i = 0; i < llbitmap->nr_pages; i++) { + if (!test_bit(LLPageDirty, &llbitmap->pctl[i]->flags) || + !test_and_clear_bit(LLPageDirty, &llbitmap->pctl[i]->flags)) + continue; + + llbitmap_write_page(llbitmap, i); + } + + blk_finish_plug(&plug); + md_super_wait(llbitmap->mddev); + complete(unplug_work->done); +} + +static bool llbitmap_dirty(struct llbitmap *llbitmap) +{ + int i; + + for (i = 0; i < llbitmap->nr_pages; i++) + if 
(test_bit(LLPageDirty, &llbitmap->pctl[i]->flags)) + return true; + + return false; +} + +static void llbitmap_unplug(struct mddev *mddev, bool sync) +{ + DECLARE_COMPLETION_ONSTACK(done); + struct llbitmap *llbitmap = mddev->bitmap; + struct llbitmap_unplug_work unplug_work = { + .llbitmap = llbitmap, + .done = &done, + }; + + if (!llbitmap_dirty(llbitmap)) + return; + + /* + * Issue new bitmap IO under submit_bio() context will deadlock: + * - the bio will wait for bitmap bio to be done, before it can be + * issued; + * - bitmap bio will be added to current->bio_list and wait for this + * bio to be issued; + */ + INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn); + queue_work(md_llbitmap_unplug_wq, &unplug_work.work); + wait_for_completion(&done); + destroy_work_on_stack(&unplug_work.work); +} + +/* + * Force to write all bitmap pages to disk, called when stopping the array, or + * every daemon_sleep seconds when sync_thread is running. + */ +static void __llbitmap_flush(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + struct blk_plug plug; + int i; + + blk_start_plug(&plug); + for (i = 0; i < llbitmap->nr_pages; i++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; + + /* mark all blocks as dirty */ + set_bit(LLPageDirty, &pctl->flags); + bitmap_fill(pctl->dirty, llbitmap->blocks_per_page); + llbitmap_write_page(llbitmap, i); + } + blk_finish_plug(&plug); + md_super_wait(llbitmap->mddev); +} + +static void llbitmap_flush(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + int i; + + for (i = 0; i < llbitmap->nr_pages; i++) + set_bit(LLPageFlush, &llbitmap->pctl[i]->flags); + + timer_delete_sync(&llbitmap->pending_timer); + queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work); + flush_work(&llbitmap->daemon_work); + + __llbitmap_flush(mddev); +} + +/* This is used for raid5 lazy initial recovery */ +static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset) +{ + struct llbitmap *llbitmap = 
mddev->bitmap; + unsigned long p = offset >> llbitmap->chunkshift; + enum llbitmap_state c = llbitmap_read(llbitmap, p); + + return c == BitClean || c == BitDirty; +} + +static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long p = offset >> llbitmap->chunkshift; + int blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); + enum llbitmap_state c = llbitmap_read(llbitmap, p); + + /* always skip unwritten blocks */ + if (c == BitUnwritten) + return blocks; + + /* For degraded array, don't skip */ + if (mddev->degraded) + return 0; + + /* For resync also skip clean/dirty blocks */ + if ((c == BitClean || c == BitDirty) && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + return blocks; + + return 0; +} + +static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks, bool degraded) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long p = offset >> llbitmap->chunkshift; + + /* + * Handle one bit at a time, this is much simpler. And it doesn't matter + * if md_do_sync() loop more times. 
+ */ + *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); + return llbitmap_state_machine(llbitmap, p, p, + BitmapActionStartsync) == BitSyncing; +} + +/* Something is wrong, sync_thread stop at @offset */ +static void llbitmap_end_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long p = offset >> llbitmap->chunkshift; + + *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); + llbitmap_state_machine(llbitmap, p, llbitmap->chunks - 1, + BitmapActionAbortsync); +} + +/* A full sync_thread is finished */ +static void llbitmap_close_sync(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + int i; + + for (i = 0; i < llbitmap->nr_pages; i++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; + + /* let daemon_fn clear dirty bits immediately */ + WRITE_ONCE(pctl->expire, jiffies); + } + + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, + BitmapActionEndsync); +} + +/* + * sync_thread have reached @sector, update metadata every daemon_sleep seconds, + * just in case sync_thread have to restart after power failure. 
+ */ +static void llbitmap_cond_end_sync(struct mddev *mddev, sector_t sector, + bool force) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + if (sector == 0) { + llbitmap->last_end_sync = jiffies; + return; + } + + if (time_before(jiffies, llbitmap->last_end_sync + + HZ * mddev->bitmap_info.daemon_sleep)) + return; + + wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); + + mddev->curr_resync_completed = sector; + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + llbitmap_state_machine(llbitmap, 0, sector >> llbitmap->chunkshift, + BitmapActionEndsync); + __llbitmap_flush(mddev); + + llbitmap->last_end_sync = jiffies; + sysfs_notify_dirent_safe(mddev->sysfs_completed); +} + +static bool llbitmap_enabled(void *data, bool flush) +{ + struct llbitmap *llbitmap = data; + + return llbitmap && !test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags); +} + +static void llbitmap_dirty_bits(struct mddev *mddev, unsigned long s, + unsigned long e) +{ + llbitmap_state_machine(mddev->bitmap, s, e, BitmapActionStartwrite); +} + +static void llbitmap_write_sb(struct llbitmap *llbitmap) +{ + int nr_blocks = DIV_ROUND_UP(BITMAP_DATA_OFFSET, llbitmap->io_size); + + bitmap_fill(llbitmap->pctl[0]->dirty, nr_blocks); + llbitmap_write_page(llbitmap, 0); + md_super_wait(llbitmap->mddev); +} + +static void llbitmap_update_sb(void *data) +{ + struct llbitmap *llbitmap = data; + struct mddev *mddev = llbitmap->mddev; + struct page *sb_page; + bitmap_super_t *sb; + + if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) + return; + + sb_page = llbitmap_read_page(llbitmap, 0); + if (IS_ERR(sb_page)) { + pr_err("%s: %s: read super block failed", __func__, + mdname(mddev)); + set_bit(BITMAP_WRITE_ERROR, &llbitmap->flags); + return; + } + + if (mddev->events < llbitmap->events_cleared) + llbitmap->events_cleared = mddev->events; + + sb = kmap_local_page(sb_page); + sb->events = cpu_to_le64(mddev->events); + sb->state = cpu_to_le32(llbitmap->flags); + sb->chunksize = 
cpu_to_le32(llbitmap->chunksize); + sb->sync_size = cpu_to_le64(mddev->resync_max_sectors); + sb->events_cleared = cpu_to_le64(llbitmap->events_cleared); + sb->sectors_reserved = cpu_to_le32(mddev->bitmap_info.space); + sb->daemon_sleep = cpu_to_le32(mddev->bitmap_info.daemon_sleep); + + kunmap_local(sb); + llbitmap_write_sb(llbitmap); +} + +static int llbitmap_get_stats(void *data, struct md_bitmap_stats *stats) +{ + struct llbitmap *llbitmap = data; + + memset(stats, 0, sizeof(*stats)); + + stats->missing_pages = 0; + stats->pages = llbitmap->nr_pages; + stats->file_pages = llbitmap->nr_pages; + + stats->behind_writes = atomic_read(&llbitmap->behind_writes); + stats->behind_wait = wq_has_sleeper(&llbitmap->behind_wait); + stats->events_cleared = llbitmap->events_cleared; + + return 0; +} + +/* just flag all pages as needing to be written */ +static void llbitmap_write_all(struct mddev *mddev) +{ + int i; + struct llbitmap *llbitmap = mddev->bitmap; + + for (i = 0; i < llbitmap->nr_pages; i++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; + + set_bit(LLPageDirty, &pctl->flags); + bitmap_fill(pctl->dirty, llbitmap->blocks_per_page); + } +} + +static void llbitmap_start_behind_write(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + atomic_inc(&llbitmap->behind_writes); +} + +static void llbitmap_end_behind_write(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + if (atomic_dec_and_test(&llbitmap->behind_writes)) + wake_up(&llbitmap->behind_wait); +} + +static void llbitmap_wait_behind_writes(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + if (!llbitmap) + return; + + wait_event(llbitmap->behind_wait, + atomic_read(&llbitmap->behind_writes) == 0); + +} + +static ssize_t bits_show(struct mddev *mddev, char *page) +{ + struct llbitmap *llbitmap; + int bits[BitStateCount] = {0}; + loff_t start = 0; + + mutex_lock(&mddev->bitmap_info.mutex); + llbitmap = mddev->bitmap; + if (!llbitmap || 
!llbitmap->pctl) { + mutex_unlock(&mddev->bitmap_info.mutex); + return sprintf(page, "no bitmap\n"); + } + + if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) { + mutex_unlock(&mddev->bitmap_info.mutex); + return sprintf(page, "bitmap io error\n"); + } + + while (start < llbitmap->chunks) { + enum llbitmap_state c = llbitmap_read(llbitmap, start); + + if (c < 0 || c >= BitStateCount) + pr_err("%s: invalid bit %llu state %d\n", + __func__, start, c); + else + bits[c]++; + start++; + } + + mutex_unlock(&mddev->bitmap_info.mutex); + return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n", + bits[BitUnwritten], bits[BitClean], bits[BitDirty], + bits[BitNeedSync], bits[BitSyncing]); +} + +static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits); + +static ssize_t metadata_show(struct mddev *mddev, char *page) +{ + struct llbitmap *llbitmap; + ssize_t ret; + + mutex_lock(&mddev->bitmap_info.mutex); + llbitmap = mddev->bitmap; + if (!llbitmap) { + mutex_unlock(&mddev->bitmap_info.mutex); + return sprintf(page, "no bitmap\n"); + } + + ret = sprintf(page, "chunksize %lu\nchunkshift %lu\nchunks %lu\noffset %llu\ndaemon_sleep %lu\n", + llbitmap->chunksize, llbitmap->chunkshift, + llbitmap->chunks, mddev->bitmap_info.offset, + llbitmap->mddev->bitmap_info.daemon_sleep); + mutex_unlock(&mddev->bitmap_info.mutex); + + return ret; +} + +static struct md_sysfs_entry llbitmap_metadata = __ATTR_RO(metadata); + +static ssize_t +daemon_sleep_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%lu\n", mddev->bitmap_info.daemon_sleep); +} + +static ssize_t +daemon_sleep_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned long timeout; + int rv = kstrtoul(buf, 10, &timeout); + + if (rv) + return rv; + + mddev->bitmap_info.daemon_sleep = timeout; + return len; +} + +static struct md_sysfs_entry llbitmap_daemon_sleep = __ATTR_RW(daemon_sleep); + +static ssize_t +barrier_idle_show(struct mddev *mddev, char *page) +{ + struct 
llbitmap *llbitmap = mddev->bitmap; + + return sprintf(page, "%lu\n", llbitmap->barrier_idle); +} + +static ssize_t +barrier_idle_store(struct mddev *mddev, const char *buf, size_t len) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long timeout; + int rv = kstrtoul(buf, 10, &timeout); + + if (rv) + return rv; + + llbitmap->barrier_idle = timeout; + return len; +} + +static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle); + +static struct attribute *md_llbitmap_attrs[] = { + &llbitmap_bits.attr, + &llbitmap_metadata.attr, + &llbitmap_daemon_sleep.attr, + &llbitmap_barrier_idle.attr, + NULL +}; + +static struct attribute_group md_llbitmap_group = { + .name = "llbitmap", + .attrs = md_llbitmap_attrs, +}; + +static struct bitmap_operations llbitmap_ops = { + .head = { + .type = MD_BITMAP, + .id = ID_LLBITMAP, + .name = "llbitmap", + }, + + .enabled = llbitmap_enabled, + .create = llbitmap_create, + .resize = llbitmap_resize, + .load = llbitmap_load, + .destroy = llbitmap_destroy, + + .start_write = llbitmap_start_write, + .end_write = llbitmap_end_write, + .start_discard = llbitmap_start_discard, + .end_discard = llbitmap_end_discard, + .unplug = llbitmap_unplug, + .flush = llbitmap_flush, + + .start_behind_write = llbitmap_start_behind_write, + .end_behind_write = llbitmap_end_behind_write, + .wait_behind_writes = llbitmap_wait_behind_writes, + + .blocks_synced = llbitmap_blocks_synced, + .skip_sync_blocks = llbitmap_skip_sync_blocks, + .start_sync = llbitmap_start_sync, + .end_sync = llbitmap_end_sync, + .close_sync = llbitmap_close_sync, + .cond_end_sync = llbitmap_cond_end_sync, + + .update_sb = llbitmap_update_sb, + .get_stats = llbitmap_get_stats, + .dirty_bits = llbitmap_dirty_bits, + .write_all = llbitmap_write_all, + + .group = &md_llbitmap_group, +}; + +int md_llbitmap_init(void) +{ + md_llbitmap_io_wq = alloc_workqueue("md_llbitmap_io", + WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + if (!md_llbitmap_io_wq) + return -ENOMEM; + + 
md_llbitmap_unplug_wq = alloc_workqueue("md_llbitmap_unplug", + WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + if (!md_llbitmap_unplug_wq) { + destroy_workqueue(md_llbitmap_io_wq); + md_llbitmap_io_wq = NULL; + return -ENOMEM; + } + + return register_md_submodule(&llbitmap_ops.head); +} + +void md_llbitmap_exit(void) +{ + destroy_workqueue(md_llbitmap_io_wq); + md_llbitmap_io_wq = NULL; + destroy_workqueue(md_llbitmap_unplug_wq); + md_llbitmap_unplug_wq = NULL; + unregister_md_submodule(&llbitmap_ops.head); +} diff --git a/drivers/md/md.c b/drivers/md/md.c index 046fe85c76fe..41c476b40c7a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -94,7 +94,6 @@ static struct workqueue_struct *md_wq; * workqueue whith reconfig_mutex grabbed. */ static struct workqueue_struct *md_misc_wq; -struct workqueue_struct *md_bitmap_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); @@ -339,6 +338,7 @@ static int start_readonly; * so all the races disappear. */ static bool create_on_open = true; +static bool legacy_async_del_gendisk = true; /* * We have a system wide 'event count' that is incremented @@ -637,6 +637,12 @@ static void __mddev_put(struct mddev *mddev) return; /* + * If array is freed by stopping array, MD_DELETED is set by + * do_md_stop(), MD_DELETED is still set here in case mddev is freed + * directly by closing a mddev that is created by create_on_open. + */ + set_bit(MD_DELETED, &mddev->flags); + /* * Call queue_work inside the spinlock so that flush_workqueue() after * mddev_find will succeed in waiting for the work to be done. 
*/ @@ -670,8 +676,64 @@ static void active_io_release(struct percpu_ref *ref) static void no_op(struct percpu_ref *r) {} +static bool mddev_set_bitmap_ops(struct mddev *mddev) +{ + struct bitmap_operations *old = mddev->bitmap_ops; + struct md_submodule_head *head; + + if (mddev->bitmap_id == ID_BITMAP_NONE || + (old && old->head.id == mddev->bitmap_id)) + return true; + + xa_lock(&md_submodule); + head = xa_load(&md_submodule, mddev->bitmap_id); + + if (!head) { + pr_warn("md: can't find bitmap id %d\n", mddev->bitmap_id); + goto err; + } + + if (head->type != MD_BITMAP) { + pr_warn("md: invalid bitmap id %d\n", mddev->bitmap_id); + goto err; + } + + mddev->bitmap_ops = (void *)head; + xa_unlock(&md_submodule); + + if (!mddev_is_dm(mddev) && mddev->bitmap_ops->group) { + if (sysfs_create_group(&mddev->kobj, mddev->bitmap_ops->group)) + pr_warn("md: cannot register extra bitmap attributes for %s\n", + mdname(mddev)); + else + /* + * Inform user with KOBJ_CHANGE about new bitmap + * attributes. 
+ */ + kobject_uevent(&mddev->kobj, KOBJ_CHANGE); + } + return true; + +err: + xa_unlock(&md_submodule); + return false; +} + +static void mddev_clear_bitmap_ops(struct mddev *mddev) +{ + if (!mddev_is_dm(mddev) && mddev->bitmap_ops && + mddev->bitmap_ops->group) + sysfs_remove_group(&mddev->kobj, mddev->bitmap_ops->group); + + mddev->bitmap_ops = NULL; +} + int mddev_init(struct mddev *mddev) { + if (!IS_ENABLED(CONFIG_MD_BITMAP)) + mddev->bitmap_id = ID_BITMAP_NONE; + else + mddev->bitmap_id = ID_BITMAP; if (percpu_ref_init(&mddev->active_io, active_io_release, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) @@ -706,7 +768,6 @@ int mddev_init(struct mddev *mddev) mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; - mddev_set_bitmap_ops(mddev); INIT_WORK(&mddev->sync_work, md_start_sync); INIT_WORK(&mddev->del_work, mddev_delayed_delete); @@ -871,15 +932,18 @@ void mddev_unlock(struct mddev *mddev) export_rdev(rdev, mddev); } - /* Call del_gendisk after release reconfig_mutex to avoid - * deadlock (e.g. call del_gendisk under the lock and an - * access to sysfs files waits the lock) - * And MD_DELETED is only used for md raid which is set in - * do_md_stop. dm raid only uses md_stop to stop. So dm raid - * doesn't need to check MD_DELETED when getting reconfig lock - */ - if (test_bit(MD_DELETED, &mddev->flags)) - del_gendisk(mddev->gendisk); + if (!legacy_async_del_gendisk) { + /* + * Call del_gendisk after release reconfig_mutex to avoid + * deadlock (e.g. call del_gendisk under the lock and an + * access to sysfs files waits the lock) + * And MD_DELETED is only used for md raid which is set in + * do_md_stop. dm raid only uses md_stop to stop. 
So dm raid + * doesn't need to check MD_DELETED when getting reconfig lock + */ + if (test_bit(MD_DELETED, &mddev->flags)) + del_gendisk(mddev->gendisk); + } } EXPORT_SYMBOL_GPL(mddev_unlock); @@ -1010,15 +1074,26 @@ static void super_written(struct bio *bio) wake_up(&mddev->sb_wait); } -void md_super_write(struct mddev *mddev, struct md_rdev *rdev, - sector_t sector, int size, struct page *page) +/** + * md_write_metadata - write metadata to underlying disk, including + * array superblock, badblocks, bitmap superblock and bitmap bits. + * @mddev: the array to write + * @rdev: the underlying disk to write + * @sector: the offset to @rdev + * @size: the length of the metadata + * @page: the metadata + * @offset: the offset to @page + * + * Write @size bytes of @page start from @offset, to @sector of @rdev, Increment + * mddev->pending_writes before returning, and decrement it on completion, + * waking up sb_wait. Caller must call md_super_wait() after issuing io to all + * rdev. If an error occurred, md_error() will be called, and the @rdev will be + * kicked out from @mddev. + */ +void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev, + sector_t sector, int size, struct page *page, + unsigned int offset) { - /* write first size bytes of page to sector of rdev - * Increment mddev->pending_writes before returning - * and decrement it on completion, waking up sb_wait - * if zero is reached. 
- * If an error occurred, call md_error - */ struct bio *bio; if (!page) @@ -1036,7 +1111,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, atomic_inc(&rdev->nr_pending); bio->bi_iter.bi_sector = sector; - __bio_add_page(bio, page, size, 0); + __bio_add_page(bio, page, size, offset); bio->bi_private = rdev; bio->bi_end_io = super_written; @@ -1346,6 +1421,9 @@ static u64 md_bitmap_events_cleared(struct mddev *mddev) struct md_bitmap_stats stats; int err; + if (!md_bitmap_enabled(mddev, false)) + return 0; + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); if (err) return 0; @@ -1409,13 +1487,13 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, stru mddev->layout = -1; if (sb->state & (1<<MD_SB_CLEAN)) - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; else { if (sb->events_hi == sb->cp_events_hi && sb->events_lo == sb->cp_events_lo) { - mddev->recovery_cp = sb->recovery_cp; + mddev->resync_offset = sb->recovery_cp; } else - mddev->recovery_cp = 0; + mddev->resync_offset = 0; } memcpy(mddev->uuid+0, &sb->set_uuid0, 4); @@ -1541,10 +1619,10 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) mddev->minor_version = sb->minor_version; if (mddev->in_sync) { - sb->recovery_cp = mddev->recovery_cp; + sb->recovery_cp = mddev->resync_offset; sb->cp_events_hi = (mddev->events>>32); sb->cp_events_lo = (u32)mddev->events; - if (mddev->recovery_cp == MaxSector) + if (mddev->resync_offset == MaxSector) sb->state = (1<< MD_SB_CLEAN); } else sb->recovery_cp = 0; @@ -1643,8 +1721,8 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) num_sectors = (sector_t)(2ULL << 32) - 2; do { - md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, - rdev->sb_page); + md_write_metadata(rdev->mddev, rdev, rdev->sb_start, + rdev->sb_size, rdev->sb_page, 0); } while (md_super_wait(rdev->mddev) < 0); return 
num_sectors; } @@ -1895,7 +1973,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc mddev->bitmap_info.default_space = (4096-1024) >> 9; mddev->reshape_backwards = 0; - mddev->recovery_cp = le64_to_cpu(sb->resync_offset); + mddev->resync_offset = le64_to_cpu(sb->resync_offset); memcpy(mddev->uuid, sb->set_uuid, 16); mddev->max_disks = (4096-256)/2; @@ -2081,7 +2159,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->utime = cpu_to_le64((__u64)mddev->utime); sb->events = cpu_to_le64(mddev->events); if (mddev->in_sync) - sb->resync_offset = cpu_to_le64(mddev->recovery_cp); + sb->resync_offset = cpu_to_le64(mddev->resync_offset); else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) sb->resync_offset = cpu_to_le64(MaxSector); else @@ -2292,8 +2370,8 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) sb->super_offset = cpu_to_le64(rdev->sb_start); sb->sb_csum = calc_sb_1_csum(sb); do { - md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, - rdev->sb_page); + md_write_metadata(rdev->mddev, rdev, rdev->sb_start, + rdev->sb_size, rdev->sb_page, 0); } while (md_super_wait(rdev->mddev) < 0); return num_sectors; @@ -2303,13 +2381,15 @@ static int super_1_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) { + struct mddev *mddev = rdev->mddev; + /* All necessary checks on new >= old have been done */ if (new_offset >= rdev->data_offset) return 1; /* with 1.0 metadata, there is no metadata to tread on * so we can always move back */ - if (rdev->mddev->minor_version == 0) + if (mddev->minor_version == 0) return 1; /* otherwise we must be sure not to step on @@ -2321,8 +2401,7 @@ super_1_allow_new_offset(struct md_rdev *rdev, if (rdev->sb_start + (32+4)*2 > new_offset) return 0; - if (!rdev->mddev->bitmap_info.file) { - struct mddev *mddev = rdev->mddev; + if (md_bitmap_registered(mddev) && !mddev->bitmap_info.file) { struct md_bitmap_stats stats; int err; @@ -2761,7 
+2840,7 @@ repeat: /* If this is just a dirty<->clean transition, and the array is clean * and 'events' is odd, we can roll back to the previous clean state */ if (nospares - && (mddev->in_sync && mddev->recovery_cp == MaxSector) + && (mddev->in_sync && mddev->resync_offset == MaxSector) && mddev->can_decrease_events && mddev->events != 1) { mddev->events--; @@ -2794,24 +2873,24 @@ repeat: mddev_add_trace_msg(mddev, "md md_update_sb"); rewrite: - mddev->bitmap_ops->update_sb(mddev->bitmap); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->update_sb(mddev->bitmap); rdev_for_each(rdev, mddev) { if (rdev->sb_loaded != 1) continue; /* no noise on spare devices */ if (!test_bit(Faulty, &rdev->flags)) { - md_super_write(mddev,rdev, - rdev->sb_start, rdev->sb_size, - rdev->sb_page); + md_write_metadata(mddev, rdev, rdev->sb_start, + rdev->sb_size, rdev->sb_page, 0); pr_debug("md: (write) %pg's sb offset: %llu\n", rdev->bdev, (unsigned long long)rdev->sb_start); rdev->sb_events = mddev->events; if (rdev->badblocks.size) { - md_super_write(mddev, rdev, - rdev->badblocks.sector, - rdev->badblocks.size << 9, - rdev->bb_page); + md_write_metadata(mddev, rdev, + rdev->badblocks.sector, + rdev->badblocks.size << 9, + rdev->bb_page, 0); rdev->badblocks.size = 0; } @@ -4140,6 +4219,86 @@ static struct md_sysfs_entry md_new_level = __ATTR(new_level, 0664, new_level_show, new_level_store); static ssize_t +bitmap_type_show(struct mddev *mddev, char *page) +{ + struct md_submodule_head *head; + unsigned long i; + ssize_t len = 0; + + if (mddev->bitmap_id == ID_BITMAP_NONE) + len += sprintf(page + len, "[none] "); + else + len += sprintf(page + len, "none "); + + xa_lock(&md_submodule); + xa_for_each(&md_submodule, i, head) { + if (head->type != MD_BITMAP) + continue; + + if (mddev->bitmap_id == head->id) + len += sprintf(page + len, "[%s] ", head->name); + else + len += sprintf(page + len, "%s ", head->name); + } + xa_unlock(&md_submodule); + + len += sprintf(page + len, 
"\n"); + return len; +} + +static ssize_t +bitmap_type_store(struct mddev *mddev, const char *buf, size_t len) +{ + struct md_submodule_head *head; + enum md_submodule_id id; + unsigned long i; + int err = 0; + + xa_lock(&md_submodule); + + if (mddev->bitmap_ops) { + err = -EBUSY; + goto out; + } + + if (cmd_match(buf, "none")) { + mddev->bitmap_id = ID_BITMAP_NONE; + goto out; + } + + xa_for_each(&md_submodule, i, head) { + if (head->type == MD_BITMAP && cmd_match(buf, head->name)) { + mddev->bitmap_id = head->id; + goto out; + } + } + + err = kstrtoint(buf, 10, &id); + if (err) + goto out; + + if (id == ID_BITMAP_NONE) { + mddev->bitmap_id = id; + goto out; + } + + head = xa_load(&md_submodule, id); + if (head && head->type == MD_BITMAP) { + mddev->bitmap_id = id; + goto out; + } + + err = -ENOENT; + +out: + xa_unlock(&md_submodule); + return err ? err : len; +} + +static struct md_sysfs_entry md_bitmap_type = +__ATTR(bitmap_type, 0664, bitmap_type_show, bitmap_type_store); + +static ssize_t layout_show(struct mddev *mddev, char *page) { /* just a number, not meaningful for all levels */ @@ -4297,9 +4456,9 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); static ssize_t resync_start_show(struct mddev *mddev, char *page) { - if (mddev->recovery_cp == MaxSector) + if (mddev->resync_offset == MaxSector) return sprintf(page, "none\n"); - return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); + return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_offset); } static ssize_t @@ -4325,7 +4484,7 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len) err = -EBUSY; if (!err) { - mddev->recovery_cp = n; + mddev->resync_offset = n; if (mddev->pers) set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); } @@ -4670,6 +4829,9 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) unsigned long chunk, end_chunk; int err; + if (!md_bitmap_enabled(mddev, false)) + return len; + err = mddev_lock(mddev); if 
(err) return err; @@ -4829,9 +4991,42 @@ out_unlock: static struct md_sysfs_entry md_metadata = __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); +static bool rdev_needs_recovery(struct md_rdev *rdev, sector_t sectors) +{ + return rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < sectors; +} + +static enum sync_action md_get_active_sync_action(struct mddev *mddev) +{ + struct md_rdev *rdev; + bool is_recover = false; + + if (mddev->resync_offset < MaxSector) + return ACTION_RESYNC; + + if (mddev->reshape_position != MaxSector) + return ACTION_RESHAPE; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (rdev_needs_recovery(rdev, MaxSector)) { + is_recover = true; + break; + } + } + rcu_read_unlock(); + + return is_recover ? ACTION_RECOVER : ACTION_IDLE; +} + enum sync_action md_sync_action(struct mddev *mddev) { unsigned long recovery = mddev->recovery; + enum sync_action active_action; /* * frozen has the highest priority, means running sync_thread will be @@ -4855,8 +5050,17 @@ enum sync_action md_sync_action(struct mddev *mddev) !test_bit(MD_RECOVERY_NEEDED, &recovery)) return ACTION_IDLE; - if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || - mddev->reshape_position != MaxSector) + /* + * Check if any sync operation (resync/recover/reshape) is + * currently active. This ensures that only one sync operation + * can run at a time. Returns the type of active operation, or + * ACTION_IDLE if none are active. 
+ */ + active_action = md_get_active_sync_action(mddev); + if (active_action != ACTION_IDLE) + return active_action; + + if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) return ACTION_RESHAPE; if (test_bit(MD_RECOVERY_RECOVER, &recovery)) @@ -5700,6 +5904,7 @@ __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, static struct attribute *md_default_attrs[] = { &md_level.attr, &md_new_level.attr, + &md_bitmap_type.attr, &md_layout.attr, &md_raid_disks.attr, &md_uuid.attr, @@ -5749,7 +5954,6 @@ static const struct attribute_group md_redundancy_group = { static const struct attribute_group *md_attr_groups[] = { &md_default_group, - &md_bitmap_group, NULL, }; @@ -5812,6 +6016,13 @@ static void md_kobj_release(struct kobject *ko) { struct mddev *mddev = container_of(ko, struct mddev, kobj); + if (legacy_async_del_gendisk) { + if (mddev->sysfs_state) + sysfs_put(mddev->sysfs_state); + if (mddev->sysfs_level) + sysfs_put(mddev->sysfs_level); + del_gendisk(mddev->gendisk); + } put_disk(mddev->gendisk); } @@ -6015,6 +6226,9 @@ static int md_alloc_and_put(dev_t dev, char *name) { struct mddev *mddev = md_alloc(dev, name); + if (legacy_async_del_gendisk) + pr_warn("md: async del_gendisk mode will be removed in future, please upgrade to mdadm-4.5+\n"); + if (IS_ERR(mddev)) return PTR_ERR(mddev); mddev_put(mddev); @@ -6071,6 +6285,26 @@ static void md_safemode_timeout(struct timer_list *t) static int start_dirty_degraded; +static int md_bitmap_create(struct mddev *mddev) +{ + if (mddev->bitmap_id == ID_BITMAP_NONE) + return -EINVAL; + + if (!mddev_set_bitmap_ops(mddev)) + return -ENOENT; + + return mddev->bitmap_ops->create(mddev); +} + +static void md_bitmap_destroy(struct mddev *mddev) +{ + if (!md_bitmap_registered(mddev)) + return; + + mddev->bitmap_ops->destroy(mddev); + mddev_clear_bitmap_ops(mddev); +} + int md_run(struct mddev *mddev) { int err; @@ -6237,7 +6471,7 @@ int md_run(struct mddev *mddev) } if (err == 0 && pers->sync_request && 
(mddev->bitmap_info.file || mddev->bitmap_info.offset)) { - err = mddev->bitmap_ops->create(mddev); + err = md_bitmap_create(mddev); if (err) pr_warn("%s: failed to create bitmap (%d)\n", mdname(mddev), err); @@ -6310,7 +6544,7 @@ bitmap_abort: pers->free(mddev, mddev->private); mddev->private = NULL; put_pers(pers); - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); abort: bioset_exit(&mddev->io_clone_set); exit_sync_set: @@ -6330,10 +6564,12 @@ int do_md_run(struct mddev *mddev) if (err) goto out; - err = mddev->bitmap_ops->load(mddev); - if (err) { - mddev->bitmap_ops->destroy(mddev); - goto out; + if (md_bitmap_registered(mddev)) { + err = mddev->bitmap_ops->load(mddev); + if (err) { + md_bitmap_destroy(mddev); + goto out; + } } if (mddev_is_clustered(mddev)) @@ -6417,7 +6653,7 @@ static void md_clean(struct mddev *mddev) mddev->external_size = 0; mddev->dev_sectors = 0; mddev->raid_disks = 0; - mddev->recovery_cp = 0; + mddev->resync_offset = 0; mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->reshape_position = MaxSector; @@ -6425,10 +6661,22 @@ static void md_clean(struct mddev *mddev) mddev->persistent = 0; mddev->level = LEVEL_NONE; mddev->clevel[0] = 0; - /* if UNTIL_STOP is set, it's cleared here */ - mddev->hold_active = 0; - /* Don't clear MD_CLOSING, or mddev can be opened again. */ - mddev->flags &= BIT_ULL_MASK(MD_CLOSING); + + /* + * For legacy_async_del_gendisk mode, it can stop the array in the + * middle of assembling it, then it still can access the array. So + * it needs to clear MD_CLOSING. If not legacy_async_del_gendisk, + * it can't open the array again after stopping it. So it doesn't + * clear MD_CLOSING. + */ + if (legacy_async_del_gendisk && mddev->hold_active) { + clear_bit(MD_CLOSING, &mddev->flags); + } else { + /* if UNTIL_STOP is set, it's cleared here */ + mddev->hold_active = 0; + /* Don't clear MD_CLOSING, or mddev can be opened again. 
*/ + mddev->flags &= BIT_ULL_MASK(MD_CLOSING); + } mddev->sb_flags = 0; mddev->ro = MD_RDWR; mddev->metadata_type[0] = 0; @@ -6472,7 +6720,8 @@ static void __md_stop_writes(struct mddev *mddev) mddev->pers->quiesce(mddev, 0); } - mddev->bitmap_ops->flush(mddev); + if (md_bitmap_enabled(mddev, true)) + mddev->bitmap_ops->flush(mddev); if (md_is_rdwr(mddev) && ((!mddev->in_sync && !mddev_is_clustered(mddev)) || @@ -6499,7 +6748,8 @@ EXPORT_SYMBOL_GPL(md_stop_writes); static void mddev_detach(struct mddev *mddev) { - mddev->bitmap_ops->wait_behind_writes(mddev); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->wait_behind_writes(mddev); if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); @@ -6515,7 +6765,7 @@ static void __md_stop(struct mddev *mddev) { struct md_personality *pers = mddev->pers; - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); mddev_detach(mddev); spin_lock(&mddev->lock); mddev->pers = NULL; @@ -6652,7 +6902,8 @@ static int do_md_stop(struct mddev *mddev, int mode) export_array(mddev); md_clean(mddev); - set_bit(MD_DELETED, &mddev->flags); + if (!legacy_async_del_gendisk) + set_bit(MD_DELETED, &mddev->flags); } md_new_event(); sysfs_notify_dirent_safe(mddev->sysfs_state); @@ -7232,6 +7483,9 @@ static int set_bitmap_file(struct mddev *mddev, int fd) { int err = 0; + if (!md_bitmap_registered(mddev)) + return -EINVAL; + if (mddev->pers) { if (!mddev->pers->quiesce || !mddev->thread) return -EBUSY; @@ -7288,16 +7542,16 @@ static int set_bitmap_file(struct mddev *mddev, int fd) err = 0; if (mddev->pers) { if (fd >= 0) { - err = mddev->bitmap_ops->create(mddev); + err = md_bitmap_create(mddev); if (!err) err = mddev->bitmap_ops->load(mddev); if (err) { - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); fd = -1; } } else if (fd < 0) { - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); } } @@ -7362,9 +7616,9 @@ int 
md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info) * openned */ if (info->state & (1<<MD_SB_CLEAN)) - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; else - mddev->recovery_cp = 0; + mddev->resync_offset = 0; mddev->persistent = ! info->not_persistent; mddev->external = 0; @@ -7604,12 +7858,12 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - rv = mddev->bitmap_ops->create(mddev); + rv = md_bitmap_create(mddev); if (!rv) rv = mddev->bitmap_ops->load(mddev); if (rv) - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); } else { struct md_bitmap_stats stats; @@ -7635,7 +7889,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) put_cluster_ops(mddev); mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); mddev->bitmap_info.offset = 0; } } @@ -7672,9 +7926,9 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev) * 4 sectors (with a BIG number of cylinders...). This drives * dosfs just mad... 
;-) */ -static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int md_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mddev *mddev = bdev->bd_disk->private_data; + struct mddev *mddev = disk->private_data; geo->heads = 2; geo->sectors = 4; @@ -8303,7 +8557,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) seq_printf(seq, "\tresync=REMOTE"); return 1; } - if (mddev->recovery_cp < MaxSector) { + if (mddev->resync_offset < MaxSector) { seq_printf(seq, "\tresync=PENDING"); return 1; } @@ -8416,6 +8670,9 @@ static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) unsigned long chunk_kb; int err; + if (!md_bitmap_enabled(mddev, false)) + return; + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); if (err) return; @@ -8798,18 +9055,24 @@ EXPORT_SYMBOL_GPL(md_submit_discard_bio); static void md_bitmap_start(struct mddev *mddev, struct md_io_clone *md_io_clone) { + md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ? + mddev->bitmap_ops->start_discard : + mddev->bitmap_ops->start_write; + if (mddev->pers->bitmap_sector) mddev->pers->bitmap_sector(mddev, &md_io_clone->offset, &md_io_clone->sectors); - mddev->bitmap_ops->start_write(mddev, md_io_clone->offset, - md_io_clone->sectors); + fn(mddev, md_io_clone->offset, md_io_clone->sectors); } static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone) { - mddev->bitmap_ops->end_write(mddev, md_io_clone->offset, - md_io_clone->sectors); + md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ? 
+ mddev->bitmap_ops->end_discard : + mddev->bitmap_ops->end_write; + + fn(mddev, md_io_clone->offset, md_io_clone->sectors); } static void md_end_clone_io(struct bio *bio) @@ -8818,7 +9081,7 @@ static void md_end_clone_io(struct bio *bio) struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; - if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap) + if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false)) md_bitmap_end(mddev, md_io_clone); if (bio->bi_status && !orig_bio->bi_status) @@ -8845,9 +9108,10 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio) if (blk_queue_io_stat(bdev->bd_disk->queue)) md_io_clone->start_time = bio_start_io_acct(*bio); - if (bio_data_dir(*bio) == WRITE && mddev->bitmap) { + if (bio_data_dir(*bio) == WRITE && md_bitmap_enabled(mddev, false)) { md_io_clone->offset = (*bio)->bi_iter.bi_sector; md_io_clone->sectors = bio_sectors(*bio); + md_io_clone->rw = op_stat_group(bio_op(*bio)); md_bitmap_start(mddev, md_io_clone); } @@ -8869,7 +9133,7 @@ void md_free_cloned_bio(struct bio *bio) struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; - if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap) + if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false)) md_bitmap_end(mddev, md_io_clone); if (bio->bi_status && !orig_bio->bi_status) @@ -8935,6 +9199,39 @@ static sector_t md_sync_max_sectors(struct mddev *mddev, } } +/* + * If lazy recovery is requested and all rdevs are in sync, select the rdev with + * the higest index to perfore recovery to build initial xor data, this is the + * same as old bitmap. 
+ */ +static bool mddev_select_lazy_recover_rdev(struct mddev *mddev) +{ + struct md_rdev *recover_rdev = NULL; + struct md_rdev *rdev; + bool ret = false; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (rdev->raid_disk < 0) + continue; + + if (test_bit(Faulty, &rdev->flags) || + !test_bit(In_sync, &rdev->flags)) + break; + + if (!recover_rdev || recover_rdev->raid_disk < rdev->raid_disk) + recover_rdev = rdev; + } + + if (recover_rdev) { + clear_bit(In_sync, &recover_rdev->flags); + ret = true; + } + + rcu_read_unlock(); + return ret; +} + static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) { sector_t start = 0; @@ -8946,7 +9243,7 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) return mddev->resync_min; case ACTION_RESYNC: if (!mddev->bitmap) - return mddev->recovery_cp; + return mddev->resync_offset; return 0; case ACTION_RESHAPE: /* @@ -8962,14 +9259,18 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) start = MaxSector; rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - !test_bit(Journal, &rdev->flags) && - !test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < start) + if (rdev_needs_recovery(rdev, start)) start = rdev->recovery_offset; rcu_read_unlock(); + /* + * If there are no spares, and raid456 lazy initial recover is + * requested. + */ + if (test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery) && + start == MaxSector && mddev_select_lazy_recover_rdev(mddev)) + start = 0; + /* If there is a bitmap, we need to make sure all * writes that started before we added a spare * complete before we start doing a recovery. 
@@ -8990,19 +9291,12 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) static bool sync_io_within_limit(struct mddev *mddev) { - int io_sectors; - /* * For raid456, sync IO is stripe(4k) per IO, for other levels, it's * RESYNC_PAGES(64k) per IO. */ - if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6) - io_sectors = 8; - else - io_sectors = 128; - return atomic_read(&mddev->recovery_active) < - io_sectors * sync_io_depth(mddev); + (raid_is_456(mddev) ? 8 : 128) * sync_io_depth(mddev); } #define SYNC_MARKS 10 @@ -9054,6 +9348,11 @@ void md_do_sync(struct md_thread *thread) } action = md_sync_action(mddev); + if (action == ACTION_FROZEN || action == ACTION_IDLE) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + goto skip; + } + desc = md_sync_action_name(action); mddev->last_sync_action = action; @@ -9184,8 +9483,8 @@ void md_do_sync(struct md_thread *thread) atomic_read(&mddev->recovery_active) == 0); mddev->curr_resync_completed = j; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && - j > mddev->recovery_cp) - mddev->recovery_cp = j; + j > mddev->resync_offset) + mddev->resync_offset = j; update_time = jiffies; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); sysfs_notify_dirent_safe(mddev->sysfs_completed); @@ -9207,6 +9506,12 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; + if (mddev->bitmap_ops && mddev->bitmap_ops->skip_sync_blocks) { + sectors = mddev->bitmap_ops->skip_sync_blocks(mddev, j); + if (sectors) + goto update; + } + sectors = mddev->pers->sync_request(mddev, j, max_sectors, &skipped); if (sectors == 0) { @@ -9222,6 +9527,7 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; +update: j += sectors; if (j > max_sectors) /* when skipping, extra large numbers can be returned. 
*/ @@ -9305,19 +9611,19 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync > MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { - if (mddev->curr_resync >= mddev->recovery_cp) { + if (mddev->curr_resync >= mddev->resync_offset) { pr_debug("md: checkpointing %s of %s.\n", desc, mdname(mddev)); if (test_bit(MD_RECOVERY_ERROR, &mddev->recovery)) - mddev->recovery_cp = + mddev->resync_offset = mddev->curr_resync_completed; else - mddev->recovery_cp = + mddev->resync_offset = mddev->curr_resync; } } else - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; @@ -9325,12 +9631,8 @@ void md_do_sync(struct md_thread *thread) test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - mddev->delta_disks >= 0 && - !test_bit(Journal, &rdev->flags) && - !test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < mddev->curr_resync) + if (mddev->delta_disks >= 0 && + rdev_needs_recovery(rdev, mddev->curr_resync)) rdev->recovery_offset = mddev->curr_resync; rcu_read_unlock(); } @@ -9421,6 +9723,12 @@ static bool rdev_is_spare(struct md_rdev *rdev) static bool rdev_addable(struct md_rdev *rdev) { + struct mddev *mddev; + + mddev = READ_ONCE(rdev->mddev); + if (!mddev) + return false; + /* rdev is already used, don't add it again. */ if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 || test_bit(Faulty, &rdev->flags)) @@ -9431,7 +9739,7 @@ static bool rdev_addable(struct md_rdev *rdev) return true; /* Allow to add if array is read-write. 
*/ - if (md_is_rdwr(rdev->mddev)) + if (md_is_rdwr(mddev)) return true; /* @@ -9529,14 +9837,16 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); return true; } /* Check if resync is in progress. */ - if (mddev->recovery_cp < MaxSector) { + if (mddev->resync_offset < MaxSector) { remove_spares(mddev, NULL); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); return true; } @@ -9546,7 +9856,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) * re-add. */ *spares = remove_and_add_spares(mddev, NULL); - if (*spares) { + if (*spares || test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery)) { clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); @@ -9604,7 +9914,7 @@ static void md_start_sync(struct work_struct *ws) * We are adding a device or devices to an array which has the bitmap * stored on all devices. So make sure all bitmap pages get written. */ - if (spares) + if (spares && md_bitmap_enabled(mddev, true)) mddev->bitmap_ops->write_all(mddev); name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? 
@@ -9692,7 +10002,7 @@ static void unregister_sync_thread(struct mddev *mddev) */ void md_check_recovery(struct mddev *mddev) { - if (mddev->bitmap) + if (md_bitmap_enabled(mddev, false) && mddev->bitmap_ops->daemon_work) mddev->bitmap_ops->daemon_work(mddev); if (signal_pending(current)) { @@ -9714,7 +10024,7 @@ void md_check_recovery(struct mddev *mddev) test_bit(MD_RECOVERY_DONE, &mddev->recovery) || (mddev->external == 0 && mddev->safemode == 1) || (mddev->safemode == 2 - && !mddev->in_sync && mddev->recovery_cp == MaxSector) + && !mddev->in_sync && mddev->resync_offset == MaxSector) )) return; @@ -9759,6 +10069,7 @@ void md_check_recovery(struct mddev *mddev) } clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); @@ -9771,8 +10082,8 @@ void md_check_recovery(struct mddev *mddev) * remove disk. */ rdev_for_each_safe(rdev, tmp, mddev) { - if (test_and_clear_bit(ClusterRemove, &rdev->flags) && - rdev->raid_disk < 0) + if (rdev->raid_disk < 0 && + test_and_clear_bit(ClusterRemove, &rdev->flags)) md_kick_rdev_from_array(rdev); } } @@ -9869,6 +10180,7 @@ void md_reap_sync_thread(struct mddev *mddev) clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); /* * We call mddev->cluster_ops->update_size here because sync_size could * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, @@ -10016,8 +10328,16 @@ static void md_geninit(void) static int __init md_init(void) { - int ret = -ENOMEM; + int ret = md_bitmap_init(); + if (ret) + return ret; + + ret = md_llbitmap_init(); + if (ret) + goto err_bitmap; + + ret = -ENOMEM; md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); if (!md_wq) goto err_wq; @@ -10026,11 +10346,6 @@ static int __init md_init(void) if 
(!md_misc_wq) goto err_misc_wq; - md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND, - 0); - if (!md_bitmap_wq) - goto err_bitmap_wq; - ret = __register_blkdev(MD_MAJOR, "md", md_probe); if (ret < 0) goto err_md; @@ -10049,12 +10364,13 @@ static int __init md_init(void) err_mdp: unregister_blkdev(MD_MAJOR, "md"); err_md: - destroy_workqueue(md_bitmap_wq); -err_bitmap_wq: destroy_workqueue(md_misc_wq); err_misc_wq: destroy_workqueue(md_wq); err_wq: + md_llbitmap_exit(); +err_bitmap: + md_bitmap_exit(); return ret; } @@ -10072,14 +10388,17 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); if (ret) pr_info("md-cluster: resize failed\n"); - else + else if (md_bitmap_enabled(mddev, false)) mddev->bitmap_ops->update_sb(mddev->bitmap); } /* Check for change of roles in the active devices */ rdev_for_each_safe(rdev2, tmp, mddev) { - if (test_bit(Faulty, &rdev2->flags)) + if (test_bit(Faulty, &rdev2->flags)) { + if (test_bit(ClusterRemove, &rdev2->flags)) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); continue; + } /* Check if the roles changed */ role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); @@ -10357,8 +10676,8 @@ static __exit void md_exit(void) spin_unlock(&all_mddevs_lock); destroy_workqueue(md_misc_wq); - destroy_workqueue(md_bitmap_wq); destroy_workqueue(md_wq); + md_bitmap_exit(); } subsys_initcall(md_init); @@ -10377,6 +10696,7 @@ module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); module_param(create_on_open, bool, S_IRUSR|S_IWUSR); +module_param(legacy_async_del_gendisk, bool, 0600); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MD RAID framework"); diff --git a/drivers/md/md.h b/drivers/md/md.h index 67b365621507..1979c2d4fe89 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -26,7 +26,7 @@ enum md_submodule_type 
{ MD_PERSONALITY = 0, MD_CLUSTER, - MD_BITMAP, /* TODO */ + MD_BITMAP, }; enum md_submodule_id { @@ -38,8 +38,9 @@ enum md_submodule_id { ID_RAID6 = 6, ID_RAID10 = 10, ID_CLUSTER, - ID_BITMAP, /* TODO */ - ID_LLBITMAP, /* TODO */ + ID_BITMAP, + ID_LLBITMAP, + ID_BITMAP_NONE, }; struct md_submodule_head { @@ -523,7 +524,7 @@ struct mddev { unsigned long normal_io_events; /* IO event timestamp */ atomic_t recovery_active; /* blocks scheduled, but not written */ wait_queue_head_t recovery_wait; - sector_t recovery_cp; + sector_t resync_offset; sector_t resync_min; /* user requested sync * starts here */ sector_t resync_max; /* resync should pause @@ -565,6 +566,7 @@ struct mddev { struct percpu_ref writes_pending; int sync_checkers; /* # of threads checking writes_pending */ + enum md_submodule_id bitmap_id; void *bitmap; /* the bitmap for the device */ struct bitmap_operations *bitmap_ops; struct { @@ -665,6 +667,8 @@ enum recovery_flags { MD_RECOVERY_RESHAPE, /* remote node is running resync thread */ MD_RESYNCING_REMOTE, + /* raid456 lazy initial recover */ + MD_RECOVERY_LAZY_RECOVER, }; enum md_ro_state { @@ -796,7 +800,6 @@ struct md_sysfs_entry { ssize_t (*show)(struct mddev *, char *); ssize_t (*store)(struct mddev *, const char *, size_t); }; -extern const struct attribute_group md_bitmap_group; static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name) { @@ -873,6 +876,7 @@ struct md_io_clone { unsigned long start_time; sector_t offset; unsigned long sectors; + enum stat_group rw; struct bio bio_clone; }; @@ -909,8 +913,9 @@ void md_account_bio(struct mddev *mddev, struct bio **bio); void md_free_cloned_bio(struct bio *bio); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); -extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, - sector_t sector, int size, struct page *page); +void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev, + sector_t sector, int size, struct 
page *page, + unsigned int offset); extern int md_super_wait(struct mddev *mddev); extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, blk_opf_t opf, bool metadata_op); @@ -1013,7 +1018,6 @@ struct mdu_array_info_s; struct mdu_disk_info_s; extern int mdp_major; -extern struct workqueue_struct *md_bitmap_wq; void md_autostart_arrays(int part); int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); @@ -1034,6 +1038,12 @@ static inline bool mddev_is_dm(struct mddev *mddev) return !mddev->gendisk; } +static inline bool raid_is_456(struct mddev *mddev) +{ + return mddev->level == ID_RAID4 || mddev->level == ID_RAID5 || + mddev->level == ID_RAID6; +} + static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio, sector_t sector) { diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index cbe2a9054cb9..e443e478645a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -382,6 +382,7 @@ static int raid0_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_hw_sectors = mddev->chunk_sectors; lim.max_write_zeroes_sectors = mddev->chunk_sectors; + lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * mddev->raid_disks; lim.chunk_sectors = mddev->chunk_sectors; @@ -463,21 +464,16 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) zone = find_zone(conf, &start); if (bio_end_sector(bio) > zone->zone_end) { - struct bio *split = bio_split(bio, - zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO, - &mddev->bio_set); - - if (IS_ERR(split)) { - bio->bi_status = errno_to_blk_status(PTR_ERR(split)); - bio_endio(bio); + bio = bio_submit_split_bioset(bio, + zone->zone_end - bio->bi_iter.bi_sector, + &mddev->bio_set); + if (!bio) return; - } - bio_chain(split, bio); - submit_bio_noacct(bio); - bio = split; + end = zone->zone_end; - 
} else + } else { end = bio_end_sector(bio); + } orig_end = end; if (zone != conf->strip_zone) @@ -612,17 +608,10 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) : sector_div(sector, chunk_sects)); if (sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, sectors, GFP_NOIO, + bio = bio_submit_split_bioset(bio, sectors, &mddev->bio_set); - - if (IS_ERR(split)) { - bio->bi_status = errno_to_blk_status(PTR_ERR(split)); - bio_endio(bio); + if (!bio) return true; - } - bio_chain(split, bio); - raid0_map_submit_bio(mddev, bio); - bio = split; } raid0_map_submit_bio(mddev, bio); @@ -674,7 +663,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev) mddev->raid_disks--; mddev->delta_disks = -1; /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); create_strip_zones(mddev, &priv_conf); @@ -717,7 +706,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev) mddev->raid_disks += mddev->delta_disks; mddev->degraded = 0; /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); create_strip_zones(mddev, &priv_conf); @@ -760,7 +749,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev) mddev->delta_disks = 1 - mddev->raid_disks; mddev->raid_disks = 1; /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); create_strip_zones(mddev, &priv_conf); diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index b8b3a9069701..521625756128 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -140,7 +140,7 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, * If bitmap is not enabled, it's safe to submit the io directly, and * 
this can get optimal performance. */ - if (!mddev->bitmap_ops->enabled(mddev)) { + if (!md_bitmap_enabled(mddev, true)) { raid1_submit_write(bio); return true; } @@ -283,7 +283,7 @@ static inline int raid1_check_read_range(struct md_rdev *rdev, static inline bool raid1_should_read_first(struct mddev *mddev, sector_t this_sector, int len) { - if ((mddev->recovery_cp < this_sector + len)) + if ((mddev->resync_offset < this_sector + len)) return true; if (mddev_is_clustered(mddev) && diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 64b8176907a9..592a40233004 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -127,10 +127,9 @@ static inline struct r1bio *get_resync_r1bio(struct bio *bio) return get_resync_pages(bio)->raid_bio; } -static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) +static void *r1bio_pool_alloc(gfp_t gfp_flags, struct r1conf *conf) { - struct pool_info *pi = data; - int size = offsetof(struct r1bio, bios[pi->raid_disks]); + int size = offsetof(struct r1bio, bios[conf->raid_disks * 2]); /* allocate a r1bio with room for raid_disks entries in the bios array */ return kzalloc(size, gfp_flags); @@ -145,18 +144,18 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) { - struct pool_info *pi = data; + struct r1conf *conf = data; struct r1bio *r1_bio; struct bio *bio; int need_pages; int j; struct resync_pages *rps; - r1_bio = r1bio_pool_alloc(gfp_flags, pi); + r1_bio = r1bio_pool_alloc(gfp_flags, conf); if (!r1_bio) return NULL; - rps = kmalloc_array(pi->raid_disks, sizeof(struct resync_pages), + rps = kmalloc_array(conf->raid_disks * 2, sizeof(struct resync_pages), gfp_flags); if (!rps) goto out_free_r1bio; @@ -164,11 +163,11 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) /* * Allocate bios : 1 for reading, n-1 for writing */ - for (j = pi->raid_disks ; j-- ; ) { + for (j = conf->raid_disks * 2; j-- ; ) { bio = bio_kmalloc(RESYNC_PAGES, 
gfp_flags); if (!bio) goto out_free_bio; - bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + bio_init_inline(bio, NULL, RESYNC_PAGES, 0); r1_bio->bios[j] = bio; } /* @@ -177,11 +176,11 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) * If this is a user-requested check/repair, allocate * RESYNC_PAGES for each bio. */ - if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) - need_pages = pi->raid_disks; + if (test_bit(MD_RECOVERY_REQUESTED, &conf->mddev->recovery)) + need_pages = conf->raid_disks * 2; else need_pages = 1; - for (j = 0; j < pi->raid_disks; j++) { + for (j = 0; j < conf->raid_disks * 2; j++) { struct resync_pages *rp = &rps[j]; bio = r1_bio->bios[j]; @@ -207,7 +206,7 @@ out_free_pages: resync_free_pages(&rps[j]); out_free_bio: - while (++j < pi->raid_disks) { + while (++j < conf->raid_disks * 2) { bio_uninit(r1_bio->bios[j]); kfree(r1_bio->bios[j]); } @@ -220,12 +219,12 @@ out_free_r1bio: static void r1buf_pool_free(void *__r1_bio, void *data) { - struct pool_info *pi = data; + struct r1conf *conf = data; int i; struct r1bio *r1bio = __r1_bio; struct resync_pages *rp = NULL; - for (i = pi->raid_disks; i--; ) { + for (i = conf->raid_disks * 2; i--; ) { rp = get_resync_pages(r1bio->bios[i]); resync_free_pages(rp); bio_uninit(r1bio->bios[i]); @@ -255,7 +254,7 @@ static void free_r1bio(struct r1bio *r1_bio) struct r1conf *conf = r1_bio->mddev->private; put_all_bios(conf, r1_bio); - mempool_free(r1_bio, &conf->r1bio_pool); + mempool_free(r1_bio, conf->r1bio_pool); } static void put_buf(struct r1bio *r1_bio) @@ -1226,7 +1225,7 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio, int i = 0; struct bio *behind_bio = NULL; - behind_bio = bio_alloc_bioset(NULL, vcnt, 0, GFP_NOIO, + behind_bio = bio_alloc_bioset(NULL, vcnt, bio->bi_opf, GFP_NOIO, &r1_bio->mddev->bio_set); /* discard op, we don't support writezero/writesame yet */ @@ -1305,9 +1304,8 @@ alloc_r1bio(struct mddev *mddev, struct bio *bio) struct r1conf *conf = 
mddev->private; struct r1bio *r1_bio; - r1_bio = mempool_alloc(&conf->r1bio_pool, GFP_NOIO); - /* Ensure no bio records IO_BLOCKED */ - memset(r1_bio->bios, 0, conf->raid_disks * sizeof(r1_bio->bios[0])); + r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); + memset(r1_bio, 0, offsetof(struct r1bio, bios[conf->raid_disks * 2])); init_r1bio(r1_bio, mddev, bio); return r1_bio; } @@ -1319,7 +1317,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, struct raid1_info *mirror; struct bio *read_bio; int max_sectors; - int rdisk, error; + int rdisk; bool r1bio_existed = !!r1_bio; /* @@ -1368,7 +1366,8 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, (unsigned long long)r1_bio->sector, mirror->rdev->bdev); - if (test_bit(WriteMostly, &mirror->rdev->flags)) { + if (test_bit(WriteMostly, &mirror->rdev->flags) && + md_bitmap_enabled(mddev, false)) { /* * Reading from a write-mostly device must take care not to * over-take any writes that are 'behind' @@ -1378,16 +1377,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, } if (max_sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, max_sectors, - gfp, &conf->bio_split); - - if (IS_ERR(split)) { - error = PTR_ERR(split); + bio = bio_submit_split_bioset(bio, max_sectors, + &conf->bio_split); + if (!bio) { + set_bit(R1BIO_Returned, &r1_bio->state); goto err_handle; } - bio_chain(split, bio); - submit_bio_noacct(bio); - bio = split; + r1_bio->master_bio = bio; r1_bio->sectors = max_sectors; } @@ -1415,8 +1411,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, err_handle: atomic_dec(&mirror->rdev->nr_pending); - bio->bi_status = errno_to_blk_status(error); - set_bit(R1BIO_Uptodate, &r1_bio->state); raid_end_bio_io(r1_bio); } @@ -1454,12 +1448,36 @@ retry: return true; } +static void raid1_start_write_behind(struct mddev *mddev, struct r1bio *r1_bio, + struct bio *bio) +{ + unsigned long max_write_behind = 
mddev->bitmap_info.max_write_behind; + struct md_bitmap_stats stats; + int err; + + /* behind write rely on bitmap, see bitmap_operations */ + if (!md_bitmap_enabled(mddev, false)) + return; + + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (err) + return; + + /* Don't do behind IO if reader is waiting, or there are too many. */ + if (!stats.behind_wait && stats.behind_writes < max_write_behind) + alloc_behind_master_bio(r1_bio, bio); + + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) + mddev->bitmap_ops->start_behind_write(mddev); + +} + static void raid1_write_request(struct mddev *mddev, struct bio *bio, int max_write_sectors) { struct r1conf *conf = mddev->private; struct r1bio *r1_bio; - int i, disks, k, error; + int i, disks, k; unsigned long flags; int first_clone; int max_sectors; @@ -1563,10 +1581,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, * complexity of supporting that is not worth * the benefit. */ - if (bio->bi_opf & REQ_ATOMIC) { - error = -EIO; + if (bio->bi_opf & REQ_ATOMIC) goto err_handle; - } good_sectors = first_bad - r1_bio->sector; if (good_sectors < max_sectors) @@ -1586,16 +1602,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, max_sectors = min_t(int, max_sectors, BIO_MAX_VECS * (PAGE_SIZE >> 9)); if (max_sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, max_sectors, - GFP_NOIO, &conf->bio_split); - - if (IS_ERR(split)) { - error = PTR_ERR(split); + bio = bio_submit_split_bioset(bio, max_sectors, + &conf->bio_split); + if (!bio) { + set_bit(R1BIO_Returned, &r1_bio->state); goto err_handle; } - bio_chain(split, bio); - submit_bio_noacct(bio); - bio = split; + r1_bio->master_bio = bio; r1_bio->sectors = max_sectors; } @@ -1614,22 +1627,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, continue; if (first_clone) { - unsigned long max_write_behind = - mddev->bitmap_info.max_write_behind; - struct md_bitmap_stats stats; - int err; 
- - /* do behind I/O ? - * Not if there are too many, or cannot - * allocate memory, or a reader on WriteMostly - * is waiting for behind writes to flush */ - err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); - if (!err && write_behind && !stats.behind_wait && - stats.behind_writes < max_write_behind) - alloc_behind_master_bio(r1_bio, bio); - - if (test_bit(R1BIO_BehindIO, &r1_bio->state)) - mddev->bitmap_ops->start_behind_write(mddev); + if (write_behind) + raid1_start_write_behind(mddev, r1_bio, bio); first_clone = 0; } @@ -1685,8 +1684,6 @@ err_handle: } } - bio->bi_status = errno_to_blk_status(error); - set_bit(R1BIO_Uptodate, &r1_bio->state); raid_end_bio_io(r1_bio); } @@ -2059,7 +2056,7 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio) /* make sure these bits don't get cleared. */ do { - mddev->bitmap_ops->end_sync(mddev, s, &sync_blocks); + md_bitmap_end_sync(mddev, s, &sync_blocks); s += sync_blocks; sectors_to_go -= sync_blocks; } while (sectors_to_go > 0); @@ -2747,7 +2744,7 @@ static int init_resync(struct r1conf *conf) BUG_ON(mempool_initialized(&conf->r1buf_pool)); return mempool_init(&conf->r1buf_pool, buffs, r1buf_pool_alloc, - r1buf_pool_free, conf->poolinfo); + r1buf_pool_free, conf); } static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf) @@ -2757,7 +2754,7 @@ static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf) struct bio *bio; int i; - for (i = conf->poolinfo->raid_disks; i--; ) { + for (i = conf->raid_disks * 2; i--; ) { bio = r1bio->bios[i]; rps = bio->bi_private; bio_reset(bio, NULL, 0); @@ -2806,12 +2803,13 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * We can find the current addess in mddev->curr_resync */ if (mddev->curr_resync < max_sector) /* aborted */ - mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync, - &sync_blocks); + md_bitmap_end_sync(mddev, mddev->curr_resync, + &sync_blocks); else /* completed sync */ conf->fullsync = 0; - 
mddev->bitmap_ops->close_sync(mddev); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->close_sync(mddev); close_sync(conf); if (mddev_is_clustered(mddev)) { @@ -2822,7 +2820,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, } if (mddev->bitmap == NULL && - mddev->recovery_cp == MaxSector && + mddev->resync_offset == MaxSector && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && conf->fullsync == 0) { *skipped = 1; @@ -2831,7 +2829,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, /* before building a request, check if we can skip these blocks.. * This call the bitmap_start_sync doesn't actually record anything */ - if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, true) && + if (!md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, true) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block, and probably several more */ *skipped = 1; @@ -2848,10 +2846,11 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, /* we are incrementing sector_nr below. 
To be safe, we check against * sector_nr + two times RESYNC_SECTORS */ - - mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, - mddev_is_clustered(mddev) && - (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, + mddev_is_clustered(mddev) && + (sector_nr + 2 * RESYNC_SECTORS > + conf->cluster_sync_high)); if (raise_barrier(conf, sector_nr)) return 0; @@ -3006,8 +3005,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (len == 0) break; if (sync_blocks == 0) { - if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, - &sync_blocks, still_degraded) && + if (!md_bitmap_start_sync(mddev, sector_nr, + &sync_blocks, still_degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) break; @@ -3085,6 +3084,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) int i; struct raid1_info *disk; struct md_rdev *rdev; + size_t r1bio_size; int err = -ENOMEM; conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL); @@ -3121,21 +3121,15 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (!conf->tmppage) goto abort; - conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL); - if (!conf->poolinfo) - goto abort; - conf->poolinfo->raid_disks = mddev->raid_disks * 2; - err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc, - rbio_pool_free, conf->poolinfo); - if (err) + r1bio_size = offsetof(struct r1bio, bios[mddev->raid_disks * 2]); + conf->r1bio_pool = mempool_create_kmalloc_pool(NR_RAID_BIOS, r1bio_size); + if (!conf->r1bio_pool) goto abort; err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0); if (err) goto abort; - conf->poolinfo->mddev = mddev; - err = -EINVAL; spin_lock_init(&conf->device_lock); conf->raid_disks = mddev->raid_disks; @@ -3198,10 +3192,9 @@ static struct r1conf *setup_conf(struct mddev *mddev) abort: if (conf) { - mempool_exit(&conf->r1bio_pool); + 
mempool_destroy(conf->r1bio_pool); kfree(conf->mirrors); safe_put_page(conf->tmppage); - kfree(conf->poolinfo); kfree(conf->nr_pending); kfree(conf->nr_waiting); kfree(conf->nr_queued); @@ -3219,6 +3212,7 @@ static int raid1_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; + lim.max_hw_wzeroes_unmap_sectors = 0; lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) @@ -3282,9 +3276,9 @@ static int raid1_run(struct mddev *mddev) } if (conf->raid_disks - mddev->degraded == 1) - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; - if (mddev->recovery_cp != MaxSector) + if (mddev->resync_offset != MaxSector) pr_info("md/raid1:%s: not clean -- starting background reconstruction\n", mdname(mddev)); pr_info("md/raid1:%s: active with %d out of %d mirrors\n", @@ -3311,10 +3305,9 @@ static void raid1_free(struct mddev *mddev, void *priv) { struct r1conf *conf = priv; - mempool_exit(&conf->r1bio_pool); + mempool_destroy(conf->r1bio_pool); kfree(conf->mirrors); safe_put_page(conf->tmppage); - kfree(conf->poolinfo); kfree(conf->nr_pending); kfree(conf->nr_waiting); kfree(conf->nr_queued); @@ -3333,20 +3326,22 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors) * worth it. 
*/ sector_t newsize = raid1_size(mddev, sectors, 0); - int ret; if (mddev->external_size && mddev->array_sectors > newsize) return -EINVAL; - ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); - if (ret) - return ret; + if (md_bitmap_enabled(mddev, false)) { + int ret = mddev->bitmap_ops->resize(mddev, newsize, 0); + + if (ret) + return ret; + } md_set_array_sectors(mddev, newsize); if (sectors > mddev->dev_sectors && - mddev->recovery_cp > mddev->dev_sectors) { - mddev->recovery_cp = mddev->dev_sectors; + mddev->resync_offset > mddev->dev_sectors) { + mddev->resync_offset = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } mddev->dev_sectors = sectors; @@ -3367,17 +3362,13 @@ static int raid1_reshape(struct mddev *mddev) * At the same time, we "pack" the devices so that all the missing * devices have the higher raid_disk numbers. */ - mempool_t newpool, oldpool; - struct pool_info *newpoolinfo; + mempool_t *newpool, *oldpool; + size_t new_r1bio_size; struct raid1_info *newmirrors; struct r1conf *conf = mddev->private; int cnt, raid_disks; unsigned long flags; int d, d2; - int ret; - - memset(&newpool, 0, sizeof(newpool)); - memset(&oldpool, 0, sizeof(oldpool)); /* Cannot change chunk_size, layout, or level */ if (mddev->chunk_sectors != mddev->new_chunk_sectors || @@ -3403,24 +3394,16 @@ static int raid1_reshape(struct mddev *mddev) return -EBUSY; } - newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL); - if (!newpoolinfo) + new_r1bio_size = offsetof(struct r1bio, bios[raid_disks * 2]); + newpool = mempool_create_kmalloc_pool(NR_RAID_BIOS, new_r1bio_size); + if (!newpool) { return -ENOMEM; - newpoolinfo->mddev = mddev; - newpoolinfo->raid_disks = raid_disks * 2; - - ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc, - rbio_pool_free, newpoolinfo); - if (ret) { - kfree(newpoolinfo); - return ret; } newmirrors = kzalloc(array3_size(sizeof(struct raid1_info), raid_disks, 2), GFP_KERNEL); if (!newmirrors) { - 
kfree(newpoolinfo); - mempool_exit(&newpool); + mempool_destroy(newpool); return -ENOMEM; } @@ -3429,7 +3412,6 @@ static int raid1_reshape(struct mddev *mddev) /* ok, everything is stopped */ oldpool = conf->r1bio_pool; conf->r1bio_pool = newpool; - init_waitqueue_head(&conf->r1bio_pool.wait); for (d = d2 = 0; d < conf->raid_disks; d++) { struct md_rdev *rdev = conf->mirrors[d].rdev; @@ -3446,8 +3428,6 @@ static int raid1_reshape(struct mddev *mddev) } kfree(conf->mirrors); conf->mirrors = newmirrors; - kfree(conf->poolinfo); - conf->poolinfo = newpoolinfo; spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded += (raid_disks - conf->raid_disks); @@ -3461,7 +3441,7 @@ static int raid1_reshape(struct mddev *mddev) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); - mempool_exit(&oldpool); + mempool_destroy(oldpool); return 0; } diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 33f318fcc268..2ebe35aaa534 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -49,22 +49,6 @@ struct raid1_info { sector_t seq_start; }; -/* - * memory pools need a pointer to the mddev, so they can force an unplug - * when memory is tight, and a count of the number of drives that the - * pool was allocated for, so they know how much to allocate and free. - * mddev->raid_disks cannot be used, as it can change while a pool is active - * These two datums are stored in a kmalloced struct. - * The 'raid_disks' here is twice the raid_disks in r1conf. - * This allows space for each 'real' device can have a replacement in the - * second half of the array. 
- */ - -struct pool_info { - struct mddev *mddev; - int raid_disks; -}; - struct r1conf { struct mddev *mddev; struct raid1_info *mirrors; /* twice 'raid_disks' to @@ -114,11 +98,7 @@ struct r1conf { */ int recovery_disabled; - /* poolinfo contains information about the content of the - * mempools - it changes when the array grows or shrinks - */ - struct pool_info *poolinfo; - mempool_t r1bio_pool; + mempool_t *r1bio_pool; mempool_t r1buf_pool; struct bio_set bio_split; @@ -198,7 +178,9 @@ enum r1bio_state { * any write was successful. Otherwise we call when * any write-behind write succeeds, otherwise we call * with failure when last write completes (and all failed). - * Record that bi_end_io was called with this flag... + * + * And for bio_split errors, record that bi_end_io was called + * with this flag... */ R1BIO_Returned, /* If a write for this request means we can clear some diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 95dc354a86a0..14dcd5142eb4 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -163,14 +163,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; - bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + bio_init_inline(bio, NULL, RESYNC_PAGES, 0); r10_bio->devs[j].bio = bio; if (!conf->have_replacement) continue; bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; - bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + bio_init_inline(bio, NULL, RESYNC_PAGES, 0); r10_bio->devs[j].repl_bio = bio; } /* @@ -322,10 +322,12 @@ static void raid_end_bio_io(struct r10bio *r10_bio) struct bio *bio = r10_bio->master_bio; struct r10conf *conf = r10_bio->mddev->private; - if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) - bio->bi_status = BLK_STS_IOERR; + if (!test_and_set_bit(R10BIO_Returned, &r10_bio->state)) { + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) + bio->bi_status = BLK_STS_IOERR; + 
bio_endio(bio); + } - bio_endio(bio); /* * Wake up any possible resync thread that waits for the device * to go idle. @@ -1154,7 +1156,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, int slot = r10_bio->read_slot; struct md_rdev *err_rdev = NULL; gfp_t gfp = GFP_NOIO; - int error; if (slot >= 0 && r10_bio->devs[slot].rdev) { /* @@ -1203,17 +1204,15 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, rdev->bdev, (unsigned long long)r10_bio->sector); if (max_sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, max_sectors, - gfp, &conf->bio_split); - if (IS_ERR(split)) { - error = PTR_ERR(split); - goto err_handle; - } - bio_chain(split, bio); allow_barrier(conf); - submit_bio_noacct(bio); + bio = bio_submit_split_bioset(bio, max_sectors, + &conf->bio_split); wait_barrier(conf, false); - bio = split; + if (!bio) { + set_bit(R10BIO_Returned, &r10_bio->state); + goto err_handle; + } + r10_bio->master_bio = bio; r10_bio->sectors = max_sectors; } @@ -1241,8 +1240,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, return; err_handle: atomic_dec(&rdev->nr_pending); - bio->bi_status = errno_to_blk_status(error); - set_bit(R10BIO_Uptodate, &r10_bio->state); raid_end_bio_io(r10_bio); } @@ -1351,7 +1348,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, int i, k; sector_t sectors; int max_sectors; - int error; if ((mddev_is_clustered(mddev) && mddev->cluster_ops->area_resyncing(mddev, WRITE, @@ -1465,10 +1461,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, * complexity of supporting that is not worth * the benefit. 
*/ - if (bio->bi_opf & REQ_ATOMIC) { - error = -EIO; + if (bio->bi_opf & REQ_ATOMIC) goto err_handle; - } good_sectors = first_bad - dev_sector; if (good_sectors < max_sectors) @@ -1489,17 +1483,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->sectors = max_sectors; if (r10_bio->sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, r10_bio->sectors, - GFP_NOIO, &conf->bio_split); - if (IS_ERR(split)) { - error = PTR_ERR(split); - goto err_handle; - } - bio_chain(split, bio); allow_barrier(conf); - submit_bio_noacct(bio); + bio = bio_submit_split_bioset(bio, r10_bio->sectors, + &conf->bio_split); wait_barrier(conf, false); - bio = split; + if (!bio) { + set_bit(R10BIO_Returned, &r10_bio->state); + goto err_handle; + } + r10_bio->master_bio = bio; } @@ -1531,8 +1523,6 @@ err_handle: } } - bio->bi_status = errno_to_blk_status(error); - set_bit(R10BIO_Uptodate, &r10_bio->state); raid_end_bio_io(r10_bio); } @@ -1679,7 +1669,9 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) bio_endio(bio); return 0; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); allow_barrier(conf); /* Resend the fist split part */ submit_bio_noacct(split); @@ -1694,7 +1686,9 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) bio_endio(bio); return 0; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); allow_barrier(conf); /* Resend the second split part */ submit_bio_noacct(bio); @@ -2117,7 +2111,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) int last = conf->geo.raid_disks - 1; struct raid10_info *p; - if (mddev->recovery_cp < MaxSector) + if (mddev->resync_offset < MaxSector) /* only hot-add to in-sync arrays, as recovery is * very different from resync */ @@ -3185,7 +3179,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * of a clean array, like RAID1 does. 
*/ if (mddev->bitmap == NULL && - mddev->recovery_cp == MaxSector && + mddev->resync_offset == MaxSector && mddev->reshape_position == MaxSector && !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && @@ -3221,15 +3215,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (mddev->curr_resync < max_sector) { /* aborted */ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - mddev->bitmap_ops->end_sync(mddev, - mddev->curr_resync, - &sync_blocks); + md_bitmap_end_sync(mddev, mddev->curr_resync, + &sync_blocks); else for (i = 0; i < conf->geo.raid_disks; i++) { sector_t sect = raid10_find_virt(conf, mddev->curr_resync, i); - mddev->bitmap_ops->end_sync(mddev, sect, - &sync_blocks); + md_bitmap_end_sync(mddev, sect, &sync_blocks); } } else { /* completed sync */ @@ -3249,7 +3241,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } conf->fullsync = 0; } - mddev->bitmap_ops->close_sync(mddev); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->close_sync(mddev); close_sync(conf); *skipped = 1; return sectors_skipped; @@ -3351,9 +3344,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * we only need to recover the block if it is set in * the bitmap */ - must_sync = mddev->bitmap_ops->start_sync(mddev, sect, - &sync_blocks, - true); + must_sync = md_bitmap_start_sync(mddev, sect, + &sync_blocks, true); if (sync_blocks < max_sync) max_sync = sync_blocks; if (!must_sync && @@ -3396,9 +3388,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } } - must_sync = mddev->bitmap_ops->start_sync(mddev, sect, - &sync_blocks, still_degraded); - + md_bitmap_start_sync(mddev, sect, &sync_blocks, + still_degraded); any_working = 0; for (j=0; j<conf->copies;j++) { int k; @@ -3570,13 +3561,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * safety reason, which ensures 
curr_resync_completed is * updated in bitmap_cond_end_sync. */ - mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); - if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, - &sync_blocks, - mddev->degraded) && + if (!md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, + mddev->degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block */ @@ -4008,6 +3999,7 @@ static int raid10_set_queue_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; + lim.max_hw_wzeroes_unmap_sectors = 0; lim.io_min = mddev->chunk_sectors << 9; lim.chunk_sectors = mddev->chunk_sectors; lim.io_opt = lim.io_min * raid10_nr_stripes(conf); @@ -4145,7 +4137,7 @@ static int raid10_run(struct mddev *mddev) disk->recovery_disabled = mddev->recovery_disabled - 1; } - if (mddev->recovery_cp != MaxSector) + if (mddev->resync_offset != MaxSector) pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n", mdname(mddev)); pr_info("md/raid10:%s: active with %d out of %d devices\n", @@ -4225,7 +4217,6 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) */ struct r10conf *conf = mddev->private; sector_t oldsize, size; - int ret; if (mddev->reshape_position != MaxSector) return -EBUSY; @@ -4239,14 +4230,17 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) mddev->array_sectors > size) return -EINVAL; - ret = mddev->bitmap_ops->resize(mddev, size, 0, false); - if (ret) - return ret; + if (md_bitmap_enabled(mddev, false)) { + int ret = mddev->bitmap_ops->resize(mddev, size, 0); + + if (ret) + return ret; + } md_set_array_sectors(mddev, size); if (sectors > mddev->dev_sectors && - mddev->recovery_cp > oldsize) { - mddev->recovery_cp = oldsize; + mddev->resync_offset > oldsize) { + 
mddev->resync_offset = oldsize; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } calc_sectors(conf, sectors); @@ -4275,7 +4269,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) mddev->delta_disks = mddev->raid_disks; mddev->raid_disks *= 2; /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; mddev->dev_sectors = size; conf = setup_conf(mddev); @@ -4507,8 +4501,9 @@ static int raid10_start_reshape(struct mddev *mddev) oldsize = raid10_size(mddev, 0, 0); newsize = raid10_size(mddev, 0, conf->geo.raid_disks); - if (!mddev_is_clustered(mddev)) { - ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); + if (!mddev_is_clustered(mddev) && + md_bitmap_enabled(mddev, false)) { + ret = mddev->bitmap_ops->resize(mddev, newsize, 0); if (ret) goto abort; else @@ -4530,13 +4525,14 @@ static int raid10_start_reshape(struct mddev *mddev) MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize)) goto out; - ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); + /* cluster can't be setup without bitmap */ + ret = mddev->bitmap_ops->resize(mddev, newsize, 0); if (ret) goto abort; ret = mddev->cluster_ops->resize_bitmaps(mddev, newsize, oldsize); if (ret) { - mddev->bitmap_ops->resize(mddev, oldsize, 0, false); + mddev->bitmap_ops->resize(mddev, oldsize, 0); goto abort; } } @@ -5087,8 +5083,8 @@ static void raid10_finish_reshape(struct mddev *mddev) return; if (mddev->delta_disks > 0) { - if (mddev->recovery_cp > mddev->resync_max_sectors) { - mddev->recovery_cp = mddev->resync_max_sectors; + if (mddev->resync_offset > mddev->resync_max_sectors) { + mddev->resync_offset = mddev->resync_max_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } mddev->resync_max_sectors = mddev->array_sectors; diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 3f16ad6904a9..da00a55f7a55 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -165,6 +165,8 @@ enum r10bio_state { 
* so that raid10d knows what to do with them. */ R10BIO_ReadError, +/* For bio_split errors, record that bi_end_io was called. */ + R10BIO_Returned, /* If a write for this request means we can clear some * known-bad-block records, we set this flag. */ diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index c0fb335311aa..56b234683ee6 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1163,7 +1163,7 @@ static int ppl_load_distributed(struct ppl_log *log) le64_to_cpu(pplhdr->generation)); /* attempt to recover from log if we are starting a dirty array */ - if (pplhdr && !mddev->pers && mddev->recovery_cp != MaxSector) + if (pplhdr && !mddev->pers && mddev->resync_offset != MaxSector) ret = ppl_recover(log, pplhdr, pplhdr_offset); /* write empty header if we are starting the array */ @@ -1422,14 +1422,14 @@ int ppl_init_log(struct r5conf *conf) if (ret) { goto err; - } else if (!mddev->pers && mddev->recovery_cp == 0 && + } else if (!mddev->pers && mddev->resync_offset == 0 && ppl_conf->recovered_entries > 0 && ppl_conf->mismatch_count == 0) { /* * If we are starting a dirty array and the recovery succeeds * without any issues, set the array as clean. 
*/ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); } else if (mddev->pers && ppl_conf->mismatch_count > 0) { /* no mismatch allowed when enabling PPL for a running array */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7ec61ee7b218..24b32a0c95b4 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3740,7 +3740,7 @@ static int want_replace(struct stripe_head *sh, int disk_idx) && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && (rdev->recovery_offset <= sh->sector - || rdev->mddev->recovery_cp <= sh->sector)) + || rdev->mddev->resync_offset <= sh->sector)) rv = 1; return rv; } @@ -3832,7 +3832,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, * is missing/faulty, then we need to read everything we can. */ if (!force_rcw && - sh->sector < sh->raid_conf->mddev->recovery_cp) + sh->sector < sh->raid_conf->mddev->resync_offset) /* reconstruct-write isn't being forced */ return 0; for (i = 0; i < s->failed && i < 2; i++) { @@ -4097,7 +4097,8 @@ static int handle_stripe_dirtying(struct r5conf *conf, int disks) { int rmw = 0, rcw = 0, i; - sector_t recovery_cp = conf->mddev->recovery_cp; + struct mddev *mddev = conf->mddev; + sector_t resync_offset = mddev->resync_offset; /* Check whether resync is now happening or should start. * If yes, then the array is dirty (after unclean shutdown or @@ -4107,15 +4108,21 @@ static int handle_stripe_dirtying(struct r5conf *conf, * generate correct data from the parity. 
*/ if (conf->rmw_level == PARITY_DISABLE_RMW || - (recovery_cp < MaxSector && sh->sector >= recovery_cp && + (resync_offset < MaxSector && sh->sector >= resync_offset && s->failed == 0)) { /* Calculate the real rcw later - for now make it * look like rcw is cheaper */ rcw = 1; rmw = 2; - pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", - conf->rmw_level, (unsigned long long)recovery_cp, + pr_debug("force RCW rmw_level=%u, resync_offset=%llu sh->sector=%llu\n", + conf->rmw_level, (unsigned long long)resync_offset, (unsigned long long)sh->sector); + } else if (mddev->bitmap_ops && mddev->bitmap_ops->blocks_synced && + !mddev->bitmap_ops->blocks_synced(mddev, sh->sector)) { + /* The initial recover is not done, must read everything */ + rcw = 1; rmw = 2; + pr_debug("force RCW by lazy recovery, sh->sector=%llu\n", + sh->sector); } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; @@ -4148,7 +4155,7 @@ static int handle_stripe_dirtying(struct r5conf *conf, set_bit(STRIPE_HANDLE, &sh->state); if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { /* prefer read-modify-write, but need to get some data */ - mddev_add_trace_msg(conf->mddev, "raid5 rmw %llu %d", + mddev_add_trace_msg(mddev, "raid5 rmw %llu %d", sh->sector, rmw); for (i = disks; i--; ) { @@ -4227,8 +4234,8 @@ static int handle_stripe_dirtying(struct r5conf *conf, set_bit(STRIPE_DELAYED, &sh->state); } } - if (rcw && !mddev_is_dm(conf->mddev)) - blk_add_trace_msg(conf->mddev->gendisk->queue, + if (rcw && !mddev_is_dm(mddev)) + blk_add_trace_msg(mddev->gendisk->queue, "raid5 rcw %llu %d %d %d", (unsigned long long)sh->sector, rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); @@ -4698,10 +4705,21 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) } } else if (test_bit(In_sync, &rdev->flags)) set_bit(R5_Insync, &dev->flags); - else if (sh->sector + 
RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset) - /* in sync if before recovery_offset */ - set_bit(R5_Insync, &dev->flags); - else if (test_bit(R5_UPTODATE, &dev->flags) && + else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= + rdev->recovery_offset) { + /* + * in sync if: + * - normal IO, or + * - resync IO that is not lazy recovery + * + * For lazy recovery, we have to mark the rdev without + * In_sync as failed, to build initial xor data. + */ + if (!test_bit(STRIPE_SYNCING, &sh->state) || + !test_bit(MD_RECOVERY_LAZY_RECOVER, + &conf->mddev->recovery)) + set_bit(R5_Insync, &dev->flags); + } else if (test_bit(R5_UPTODATE, &dev->flags) && test_bit(R5_Expanded, &dev->flags)) /* If we've reshaped into here, we assume it is Insync. * We will shortly update recovery_offset to make @@ -4770,14 +4788,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (test_bit(STRIPE_SYNCING, &sh->state)) { /* If there is a failed device being replaced, * we must be recovering. - * else if we are after recovery_cp, we must be syncing + * else if we are after resync_offset, we must be syncing * else if MD_RECOVERY_REQUESTED is set, we also are syncing. * else we can only be replacing * sync and recovery both need to read all devices, and so * use the same flag. 
*/ if (do_recovery || - sh->sector >= conf->mddev->recovery_cp || + sh->sector >= conf->mddev->resync_offset || test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) s->syncing = 1; else @@ -5468,17 +5486,17 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) { - struct bio *split; sector_t sector = raid_bio->bi_iter.bi_sector; unsigned chunk_sects = mddev->chunk_sectors; unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); if (sectors < bio_sectors(raid_bio)) { struct r5conf *conf = mddev->private; - split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split); - bio_chain(split, raid_bio); - submit_bio_noacct(raid_bio); - raid_bio = split; + + raid_bio = bio_submit_split_bioset(raid_bio, sectors, + &conf->bio_split); + if (!raid_bio) + return NULL; } if (!raid5_read_one_chunk(mddev, raid_bio)) @@ -6492,11 +6510,12 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n } if (mddev->curr_resync < max_sector) /* aborted */ - mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync, - &sync_blocks); + md_bitmap_end_sync(mddev, mddev->curr_resync, + &sync_blocks); else /* completed sync */ conf->fullsync = 0; - mddev->bitmap_ops->close_sync(mddev); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->close_sync(mddev); return 0; } @@ -6525,8 +6544,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n } if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && !conf->fullsync && - !mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, - true) && + !md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, true) && sync_blocks >= RAID5_STRIPE_SECTORS(conf)) { /* we can skip this block, and probably more */ do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf)); @@ -6535,7 +6553,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n return sync_blocks * 
RAID5_STRIPE_SECTORS(conf); } - mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false); sh = raid5_get_active_stripe(conf, NULL, sector_nr, R5_GAS_NOBLOCK); @@ -6557,9 +6576,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n still_degraded = true; } - mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, - still_degraded); - + md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, still_degraded); set_bit(STRIPE_SYNC_REQUESTED, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); @@ -6763,7 +6780,8 @@ static void raid5d(struct md_thread *thread) /* Now is a good time to flush some bitmap updates */ conf->seq_flush++; spin_unlock_irq(&conf->device_lock); - mddev->bitmap_ops->unplug(mddev, true); + if (md_bitmap_enabled(mddev, true)) + mddev->bitmap_ops->unplug(mddev, true); spin_lock_irq(&conf->device_lock); conf->seq_write = conf->seq_flush; activate_bit_delay(conf, conf->temp_inactive_list); @@ -7732,6 +7750,7 @@ static int raid5_set_limits(struct mddev *mddev) lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE; lim.discard_granularity = stripe; lim.max_write_zeroes_sectors = 0; + lim.max_hw_wzeroes_unmap_sectors = 0; mddev_stack_rdev_limits(mddev, &lim, 0); rdev_for_each(rdev, mddev) queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset, @@ -7780,7 +7799,7 @@ static int raid5_run(struct mddev *mddev) int first = 1; int ret = -EIO; - if (mddev->recovery_cp != MaxSector) + if (mddev->resync_offset != MaxSector) pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", mdname(mddev)); @@ -7921,7 +7940,7 @@ static int raid5_run(struct mddev *mddev) mdname(mddev)); mddev->ro = 1; set_disk_ro(mddev->gendisk, 1); - } else if (mddev->recovery_cp == MaxSector) + } else if (mddev->resync_offset == MaxSector) set_bit(MD_JOURNAL_CLEAN, &mddev->flags); } @@ -7988,7 +8007,7 @@ static int raid5_run(struct 
mddev *mddev) mddev->resync_max_sectors = mddev->dev_sectors; if (mddev->degraded > dirty_parity_disks && - mddev->recovery_cp != MaxSector) { + mddev->resync_offset != MaxSector) { if (test_bit(MD_HAS_PPL, &mddev->flags)) pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n", mdname(mddev)); @@ -8312,7 +8331,6 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) */ sector_t newsize; struct r5conf *conf = mddev->private; - int ret; if (raid5_has_log(conf) || raid5_has_ppl(conf)) return -EINVAL; @@ -8322,14 +8340,17 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) mddev->array_sectors > newsize) return -EINVAL; - ret = mddev->bitmap_ops->resize(mddev, sectors, 0, false); - if (ret) - return ret; + if (md_bitmap_enabled(mddev, false)) { + int ret = mddev->bitmap_ops->resize(mddev, sectors, 0); + + if (ret) + return ret; + } md_set_array_sectors(mddev, newsize); if (sectors > mddev->dev_sectors && - mddev->recovery_cp > mddev->dev_sectors) { - mddev->recovery_cp = mddev->dev_sectors; + mddev->resync_offset > mddev->dev_sectors) { + mddev->resync_offset = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } mddev->dev_sectors = sectors; @@ -8423,7 +8444,7 @@ static int raid5_start_reshape(struct mddev *mddev) return -EINVAL; /* raid5 can't handle concurrent reshape and recovery */ - if (mddev->recovery_cp < MaxSector) + if (mddev->resync_offset < MaxSector) return -EBUSY; for (i = 0; i < conf->raid_disks; i++) if (conf->disks[i].replacement) @@ -8648,7 +8669,7 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level) mddev->raid_disks += 1; mddev->delta_disks = 1; /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; return setup_conf(mddev); } |