diff options
Diffstat (limited to 'drivers/md')
63 files changed, 1345 insertions, 847 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 0b1870a09e1f..ddb37f6670de 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -139,7 +139,7 @@ config MD_RAID456 tristate "RAID-4/RAID-5/RAID-6 mode" depends on BLK_DEV_MD select RAID6_PQ - select LIBCRC32C + select CRC32 select ASYNC_MEMCPY select ASYNC_XOR select ASYNC_PQ @@ -267,6 +267,7 @@ config DM_CRYPT depends on BLK_DEV_DM depends on (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) depends on (TRUSTED_KEYS || TRUSTED_KEYS=n) + select CRC32 select CRYPTO select CRYPTO_CBC select CRYPTO_ESSIV diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index ef6abf33f926..45ca134cbf02 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -82,7 +82,7 @@ static void moving_init(struct moving_io *io) bio_init(bio, NULL, bio->bi_inline_vecs, DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0); bio_get(bio); - bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); bio->bi_iter.bi_size = KEY_SIZE(&io->w->key) << 9; bio->bi_private = &io->cl; diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index 68b02216033d..d39dec34b7a3 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -123,7 +123,7 @@ void bch_cache_accounting_destroy(struct cache_accounting *acc) kobject_put(&acc->day.kobj); atomic_set(&acc->closing, 1); - if (del_timer_sync(&acc->timer)) + if (timer_delete_sync(&acc->timer)) closure_return(&acc->cl); } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e42f1400cea9..813b38aec3e4 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -546,7 +546,7 @@ static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid) static struct uuid_entry *uuid_find_empty(struct cache_set *c) { - static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; + static const char zero_uuid[16] = { 0 }; return uuid_find(c, zero_uuid); } diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index c1d28e365910..453efbbdc8ee 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -334,7 +334,7 @@ static void dirty_init(struct keybuf_key *w) bio_init(bio, NULL, bio->bi_inline_vecs, DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0); if (!io->dc->writeback_percent) - bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9; bio->bi_private = w; diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index aab8240429b0..f0b5a6931161 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -68,6 +68,8 @@ #define LIST_DIRTY 1 #define LIST_SIZE 2 +#define SCAN_RESCHED_CYCLE 16 + /*--------------------------------------------------------------*/ /* @@ -2234,7 +2236,7 @@ int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t c } EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); -static bool forget_buffer(struct dm_bufio_client *c, sector_t block) +static void forget_buffer(struct dm_bufio_client *c, sector_t block) { struct dm_buffer *b; @@ -2249,8 +2251,6 @@ static bool forget_buffer(struct dm_bufio_client *c, sector_t block) cache_put_and_wake(c, b); } } - - return b ? true : false; } /* @@ -2426,7 +2426,12 @@ static void __scan(struct dm_bufio_client *c) atomic_long_dec(&c->need_shrink); freed++; - cond_resched(); + + if (unlikely(freed % SCAN_RESCHED_CYCLE == 0)) { + dm_bufio_unlock(c); + cond_resched(); + dm_bufio_lock(c); + } } } } diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 9cb797a561d6..a10d75a562db 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -406,6 +406,12 @@ struct cache { mempool_t migration_pool; struct bio_set bs; + + /* + * Cache_size entries. Set bits indicate blocks mapped beyond the + * target length, which are marked for invalidation. + */ + unsigned long *invalid_bitset; }; struct per_bio_data { @@ -1922,6 +1928,9 @@ static void __destroy(struct cache *cache) if (cache->discard_bitset) free_bitset(cache->discard_bitset); + if (cache->invalid_bitset) + free_bitset(cache->invalid_bitset); + if (cache->copier) dm_kcopyd_client_destroy(cache->copier); @@ -2510,6 +2519,13 @@ static int cache_create(struct cache_args *ca, struct cache **result) } clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); + cache->invalid_bitset = alloc_bitset(from_cblock(cache->cache_size)); + if (!cache->invalid_bitset) { + *error = "could not allocate bitset for invalid blocks"; + goto bad; + } + clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size)); + cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); if (IS_ERR(cache->copier)) { *error = "could not create kcopyd client"; @@ -2808,6 +2824,24 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, return policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); } +static int load_filtered_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, + bool dirty, uint32_t hint, bool hint_valid) +{ + struct cache *cache = context; + + if (from_oblock(oblock) >= from_oblock(cache->origin_blocks)) { + if (dirty) { + DMERR("%s: unable to shrink origin; cache block %u is dirty", + cache_device_name(cache), from_cblock(cblock)); + return -EFBIG; + } + set_bit(from_cblock(cblock), cache->invalid_bitset); + return 0; + } + + return load_mapping(context, oblock, cblock, dirty, hint, hint_valid); +} + /* * The discard block size in the on disk metadata is not * necessarily the same as we're currently using. So we have to @@ -2899,6 +2933,27 @@ static dm_cblock_t get_cache_dev_size(struct cache *cache) return to_cblock(size); } +static bool can_resume(struct cache *cache) +{ + /* + * Disallow retrying the resume operation for devices that failed the + * first resume attempt, as the failure leaves the policy object partially + * initialized. Retrying could trigger BUG_ON when loading cache mappings + * into the incomplete policy object. + */ + if (cache->sized && !cache->loaded_mappings) { + if (get_cache_mode(cache) != CM_WRITE) + DMERR("%s: unable to resume a failed-loaded cache, please check metadata.", + cache_device_name(cache)); + else + DMERR("%s: unable to resume cache due to missing proper cache table reload", + cache_device_name(cache)); + return false; + } + + return true; +} + static bool can_resize(struct cache *cache, dm_cblock_t new_size) { if (from_cblock(new_size) > from_cblock(cache->cache_size)) { @@ -2941,12 +2996,33 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) return 0; } +static int truncate_oblocks(struct cache *cache) +{ + uint32_t nr_blocks = from_cblock(cache->cache_size); + uint32_t i; + int r; + + for_each_set_bit(i, cache->invalid_bitset, nr_blocks) { + r = dm_cache_remove_mapping(cache->cmd, to_cblock(i)); + if (r) { + DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", + cache_device_name(cache)); + return r; + } + } + + return 0; +} + static int cache_preresume(struct dm_target *ti) { int r = 0; struct cache *cache = ti->private; dm_cblock_t csize = get_cache_dev_size(cache); + if (!can_resume(cache)) + return -EINVAL; + /* * Check to see if the cache has resized. */ @@ -2962,11 +3038,25 @@ static int cache_preresume(struct dm_target *ti) } if (!cache->loaded_mappings) { + /* + * The fast device could have been resized since the last + * failed preresume attempt. To be safe we start by a blank + * bitset for cache blocks. + */ + clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size)); + r = dm_cache_load_mappings(cache->cmd, cache->policy, - load_mapping, cache); + load_filtered_mapping, cache); if (r) { DMERR("%s: could not load cache mappings", cache_device_name(cache)); - metadata_operation_failed(cache, "dm_cache_load_mappings", r); + if (r != -EFBIG) + metadata_operation_failed(cache, "dm_cache_load_mappings", r); + return r; + } + + r = truncate_oblocks(cache); + if (r) { + metadata_operation_failed(cache, "dm_cache_remove_mapping", r); return r; } @@ -3426,7 +3516,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {2, 2, 0}, + .version = {2, 3, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 3637761f3585..f3a3f2ef6322 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -141,6 +141,7 @@ struct mapped_device { #ifdef CONFIG_BLK_DEV_ZONED unsigned int nr_zones; void *zone_revalidate_map; + struct task_struct *revalidate_map_task; #endif #ifdef CONFIG_IMA diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 78c975d7cd5f..9dfdb63220d7 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -17,6 +17,7 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> +#include <linux/crc32.h> #include <linux/mempool.h> #include <linux/slab.h> #include <linux/crypto.h> @@ -125,7 +126,6 @@ struct iv_lmk_private { #define TCW_WHITENING_SIZE 16 struct iv_tcw_private { - struct crypto_shash *crc32_tfm; u8 *iv_seed; u8 *whitening; }; @@ -607,10 +607,6 @@ static void crypt_iv_tcw_dtr(struct crypt_config *cc) tcw->iv_seed = NULL; kfree_sensitive(tcw->whitening); tcw->whitening = NULL; - - if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm)) - crypto_free_shash(tcw->crc32_tfm); - tcw->crc32_tfm = NULL; } static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti, @@ -628,13 +624,6 @@ static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti, return -EINVAL; } - tcw->crc32_tfm = crypto_alloc_shash("crc32", 0, - CRYPTO_ALG_ALLOCATES_MEMORY); - if (IS_ERR(tcw->crc32_tfm)) { - ti->error = "Error initializing CRC32 in TCW"; - return PTR_ERR(tcw->crc32_tfm); - } - tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL); tcw->whitening = kzalloc(TCW_WHITENING_SIZE, GFP_KERNEL); if (!tcw->iv_seed || !tcw->whitening) { @@ -668,36 +657,28 @@ static int crypt_iv_tcw_wipe(struct crypt_config *cc) return 0; } -static int crypt_iv_tcw_whitening(struct crypt_config *cc, - struct dm_crypt_request *dmreq, - u8 *data) +static void crypt_iv_tcw_whitening(struct crypt_config *cc, + struct dm_crypt_request *dmreq, u8 *data) { struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; __le64 sector = cpu_to_le64(dmreq->iv_sector); u8 buf[TCW_WHITENING_SIZE]; - SHASH_DESC_ON_STACK(desc, tcw->crc32_tfm); - int i, r; + int i; /* xor whitening with sector number */ crypto_xor_cpy(buf, tcw->whitening, (u8 *)§or, 8); crypto_xor_cpy(&buf[8], tcw->whitening + 8, (u8 *)§or, 8); /* calculate crc32 for every 32bit part and xor it */ - desc->tfm = tcw->crc32_tfm; - for (i = 0; i < 4; i++) { - r = crypto_shash_digest(desc, &buf[i * 4], 4, &buf[i * 4]); - if (r) - goto out; - } + for (i = 0; i < 4; i++) + put_unaligned_le32(crc32(0, &buf[i * 4], 4), &buf[i * 4]); crypto_xor(&buf[0], &buf[12], 4); crypto_xor(&buf[4], &buf[8], 4); /* apply whitening (8 bytes) to whole sector */ for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++) crypto_xor(data + i * 8, buf, 8); -out: memzero_explicit(buf, sizeof(buf)); - return r; } static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, @@ -707,13 +688,12 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; __le64 sector = cpu_to_le64(dmreq->iv_sector); u8 *src; - int r = 0; /* Remove whitening from ciphertext */ if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) { sg = crypt_get_sg_data(cc, dmreq->sg_in); src = kmap_local_page(sg_page(sg)); - r = crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset); + crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset); kunmap_local(src); } @@ -723,7 +703,7 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, crypto_xor_cpy(&iv[8], tcw->iv_seed + 8, (u8 *)§or, cc->iv_size - 8); - return r; + return 0; } static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, @@ -731,7 +711,6 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, { struct scatterlist *sg; u8 *dst; - int r; if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) return 0; @@ -739,10 +718,10 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, /* Apply whitening on ciphertext */ sg = crypt_get_sg_data(cc, dmreq->sg_out); dst = kmap_local_page(sg_page(sg)); - r = crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset); + crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset); kunmap_local(dst); - return r; + return 0; } static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv, @@ -1188,7 +1167,7 @@ static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio) tag_len = io->cc->tuple_size * (bio_sectors(bio) >> io->cc->sector_shift); - bip->bip_iter.bi_sector = io->cc->start + io->sector; + bip->bip_iter.bi_sector = bio->bi_iter.bi_sector; ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata), tag_len, offset_in_page(io->integrity_metadata)); @@ -1720,6 +1699,7 @@ retry: clone->bi_private = io; clone->bi_end_io = crypt_endio; clone->bi_ioprio = io->base_bio->bi_ioprio; + clone->bi_iter.bi_sector = cc->start + io->sector; remaining_size = size; @@ -1910,7 +1890,6 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) crypt_dec_pending(io); return 1; } - clone->bi_iter.bi_sector = cc->start + io->sector; crypt_convert_init(cc, &io->ctx, clone, clone, io->sector); io->saved_bi_iter = clone->bi_iter; dm_submit_bio_remap(io->base_bio, clone); @@ -1926,13 +1905,13 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) clone = bio_alloc_clone(cc->dev->bdev, io->base_bio, gfp, &cc->bs); if (!clone) return 1; + + clone->bi_iter.bi_sector = cc->start + io->sector; clone->bi_private = io; clone->bi_end_io = crypt_endio; crypt_inc_pending(io); - clone->bi_iter.bi_sector = cc->start + io->sector; - if (dm_crypt_integrity_io_alloc(io, clone)) { crypt_dec_pending(io); bio_put(clone); @@ -2040,8 +2019,6 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) /* crypt_convert should have filled the clone bio */ BUG_ON(io->ctx.iter_out.bi_size); - clone->bi_iter.bi_sector = cc->start + io->sector; - if ((likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) || test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)) { dm_submit_bio_remap(io->base_bio, clone); @@ -2098,7 +2075,7 @@ static void kcryptd_crypt_write_continue(struct work_struct *work) wait_for_completion(&ctx->restart); reinit_completion(&ctx->restart); - r = crypt_convert(cc, &io->ctx, true, false); + r = crypt_convert(cc, &io->ctx, false, false); if (r) io->error = r; crypt_finished = atomic_dec_and_test(&ctx->cc_pending); @@ -2196,7 +2173,7 @@ static void kcryptd_crypt_read_continue(struct work_struct *work) wait_for_completion(&io->ctx.restart); reinit_completion(&io->ctx.restart); - r = crypt_convert(cc, &io->ctx, true, false); + r = crypt_convert(cc, &io->ctx, false, false); if (r) io->error = r; @@ -2214,7 +2191,6 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) crypt_inc_pending(io); if (io->ctx.aead_recheck) { - io->ctx.cc_sector = io->sector + cc->iv_offset; r = crypt_convert(cc, &io->ctx, test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags), true); } else { diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 08f6387620c1..d4cf0ac2a7aa 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -369,6 +369,21 @@ static int delay_map(struct dm_target *ti, struct bio *bio) return delay_bio(dc, c, bio); } +#ifdef CONFIG_BLK_DEV_ZONED +static int delay_report_zones(struct dm_target *ti, + struct dm_report_zones_args *args, unsigned int nr_zones) +{ + struct delay_c *dc = ti->private; + struct delay_class *c = &dc->read; + + return dm_report_zones(c->dev->bdev, c->start, + c->start + dm_target_offset(ti, args->next_sector), + args, nr_zones); +} +#else +#define delay_report_zones NULL +#endif + #define DMEMIT_DELAY_CLASS(c) \ DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay) @@ -424,11 +439,12 @@ out: static struct target_type delay_target = { .name = "delay", .version = {1, 4, 0}, - .features = DM_TARGET_PASSES_INTEGRITY, + .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM, .module = THIS_MODULE, .ctr = delay_ctr, .dtr = delay_dtr, .map = delay_map, + .report_zones = delay_report_zones, .presuspend = delay_presuspend, .resume = delay_resume, .status = delay_status, diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index 18ae45dcbfb2..b19b0142a690 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -390,6 +390,12 @@ static int ebs_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } +static void ebs_postsuspend(struct dm_target *ti) +{ + struct ebs_c *ec = ti->private; + dm_bufio_client_reset(ec->bufio); +} + static void ebs_status(struct dm_target *ti, status_type_t type, unsigned int status_flags, char *result, unsigned int maxlen) { @@ -447,6 +453,7 @@ static struct target_type ebs_target = { .ctr = ebs_ctr, .dtr = ebs_dtr, .map = ebs_map, + .postsuspend = ebs_postsuspend, .status = ebs_status, .io_hints = ebs_io_hints, .prepare_ioctl = ebs_prepare_ioctl, diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 731467d4ed10..347881f323d5 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -47,14 +47,15 @@ enum feature_flag_bits { }; struct per_bio_data { - bool bio_submitted; + bool bio_can_corrupt; + struct bvec_iter saved_iter; }; static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, struct dm_target *ti) { - int r; - unsigned int argc; + int r = 0; + unsigned int argc = 0; const char *arg_name; static const struct dm_arg _args[] = { @@ -65,14 +66,13 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, {0, PROBABILITY_BASE, "Invalid random corrupt argument"}, }; - /* No feature arguments supplied. */ - if (!as->argc) - return 0; - - r = dm_read_arg_group(_args, as, &argc, &ti->error); - if (r) + if (as->argc && (r = dm_read_arg_group(_args, as, &argc, &ti->error))) return r; + /* No feature arguments supplied. */ + if (!argc) + goto error_all_io; + while (argc) { arg_name = dm_shift_arg(as); argc--; @@ -217,6 +217,7 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, if (!fc->corrupt_bio_byte && !test_bit(ERROR_READS, &fc->flags) && !test_bit(DROP_WRITES, &fc->flags) && !test_bit(ERROR_WRITES, &fc->flags) && !fc->random_read_corrupt && !fc->random_write_corrupt) { +error_all_io: set_bit(ERROR_WRITES, &fc->flags); set_bit(ERROR_READS, &fc->flags); } @@ -339,7 +340,8 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) } static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte, - unsigned char corrupt_bio_value) + unsigned char corrupt_bio_value, + struct bvec_iter start) { struct bvec_iter iter; struct bio_vec bvec; @@ -348,7 +350,7 @@ static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte, * Overwrite the Nth byte of the bio's data, on whichever page * it falls. */ - bio_for_each_segment(bvec, bio, iter) { + __bio_for_each_segment(bvec, bio, iter, start) { if (bio_iter_len(bio, iter) > corrupt_bio_byte) { unsigned char *segment = bvec_kmap_local(&bvec); segment[corrupt_bio_byte] = corrupt_bio_value; @@ -357,36 +359,31 @@ static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte, "(rw=%c bi_opf=%u bi_sector=%llu size=%u)\n", bio, corrupt_bio_value, corrupt_bio_byte, (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_opf, - (unsigned long long)bio->bi_iter.bi_sector, - bio->bi_iter.bi_size); + (unsigned long long)start.bi_sector, + start.bi_size); break; } corrupt_bio_byte -= bio_iter_len(bio, iter); } } -static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) +static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc, + struct bvec_iter start) { unsigned int corrupt_bio_byte = fc->corrupt_bio_byte - 1; - if (!bio_has_data(bio)) - return; - - corrupt_bio_common(bio, corrupt_bio_byte, fc->corrupt_bio_value); + corrupt_bio_common(bio, corrupt_bio_byte, fc->corrupt_bio_value, start); } -static void corrupt_bio_random(struct bio *bio) +static void corrupt_bio_random(struct bio *bio, struct bvec_iter start) { unsigned int corrupt_byte; unsigned char corrupt_value; - if (!bio_has_data(bio)) - return; - - corrupt_byte = get_random_u32() % bio->bi_iter.bi_size; + corrupt_byte = get_random_u32() % start.bi_size; corrupt_value = get_random_u8(); - corrupt_bio_common(bio, corrupt_byte, corrupt_value); + corrupt_bio_common(bio, corrupt_byte, corrupt_value, start); } static void clone_free(struct bio *clone) @@ -426,7 +423,7 @@ static struct bio *clone_bio(struct dm_target *ti, struct flakey_c *fc, struct b if (!clone) return NULL; - bio_init(clone, fc->dev->bdev, bio->bi_inline_vecs, nr_iovecs, bio->bi_opf); + bio_init(clone, fc->dev->bdev, clone->bi_inline_vecs, nr_iovecs, bio->bi_opf); clone->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector); clone->bi_private = bio; @@ -481,7 +478,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) unsigned int elapsed; struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); - pb->bio_submitted = false; + pb->bio_can_corrupt = false; if (op_is_zone_mgmt(bio_op(bio))) goto map_bio; @@ -490,10 +487,11 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) elapsed = (jiffies - fc->start_time) / HZ; if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) { bool corrupt_fixed, corrupt_random; - /* - * Flag this bio as submitted while down. - */ - pb->bio_submitted = true; + + if (bio_has_data(bio)) { + pb->bio_can_corrupt = true; + pb->saved_iter = bio->bi_iter; + } /* * Error reads if neither corrupt_bio_byte or drop_writes or error_writes are set. @@ -516,6 +514,8 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } + if (!pb->bio_can_corrupt) + goto map_bio; /* * Corrupt matching writes. */ @@ -535,9 +535,11 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) struct bio *clone = clone_bio(ti, fc, bio); if (clone) { if (corrupt_fixed) - corrupt_bio_data(clone, fc); + corrupt_bio_data(clone, fc, + clone->bi_iter); if (corrupt_random) - corrupt_bio_random(clone); + corrupt_bio_random(clone, + clone->bi_iter); submit_bio(clone); return DM_MAPIO_SUBMITTED; } @@ -559,21 +561,21 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, if (op_is_zone_mgmt(bio_op(bio))) return DM_ENDIO_DONE; - if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) { + if (!*error && pb->bio_can_corrupt && (bio_data_dir(bio) == READ)) { if (fc->corrupt_bio_byte) { if ((fc->corrupt_bio_rw == READ) && all_corrupt_bio_flags_match(bio, fc)) { /* * Corrupt successful matching READs while in down state. */ - corrupt_bio_data(bio, fc); + corrupt_bio_data(bio, fc, pb->saved_iter); } } if (fc->random_read_corrupt) { u64 rnd = get_random_u64(); u32 rem = do_div(rnd, PROBABILITY_BASE); if (rem < fc->random_read_corrupt) - corrupt_bio_random(bio); + corrupt_bio_random(bio, pb->saved_iter); } if (test_bit(ERROR_READS, &fc->flags)) { /* diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 555dc06b9422..cc3d3897ef42 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -21,6 +21,7 @@ #include <linux/reboot.h> #include <crypto/hash.h> #include <crypto/skcipher.h> +#include <crypto/utils.h> #include <linux/async_tx.h> #include <linux/dm-bufio.h> @@ -516,7 +517,7 @@ static int sb_mac(struct dm_integrity_c *ic, bool wr) dm_integrity_io_error(ic, "crypto_shash_digest", r); return r; } - if (memcmp(mac, actual_mac, mac_size)) { + if (crypto_memneq(mac, actual_mac, mac_size)) { dm_integrity_io_error(ic, "superblock mac", -EILSEQ); dm_audit_log_target(DM_MSG_PREFIX, "mac-superblock", ic->ti, 0); return -EILSEQ; @@ -859,7 +860,7 @@ static void rw_section_mac(struct dm_integrity_c *ic, unsigned int section, bool if (likely(wr)) memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR); else { - if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) { + if (crypto_memneq(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) { dm_integrity_io_error(ic, "journal mac", -EILSEQ); dm_audit_log_target(DM_MSG_PREFIX, "mac-journal", ic->ti, 0); } @@ -1401,10 +1402,9 @@ static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block, unsigned int *metadata_offset, unsigned int total_size, int op) { -#define MAY_BE_FILLER 1 -#define MAY_BE_HASH 2 unsigned int hash_offset = 0; - unsigned int may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0); + unsigned char mismatch_hash = 0; + unsigned char mismatch_filler = !ic->discard; do { unsigned char *data, *dp; @@ -1425,7 +1425,7 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se if (op == TAG_READ) { memcpy(tag, dp, to_copy); } else if (op == TAG_WRITE) { - if (memcmp(dp, tag, to_copy)) { + if (crypto_memneq(dp, tag, to_copy)) { memcpy(dp, tag, to_copy); dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy); } @@ -1433,29 +1433,30 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se /* e.g.: op == TAG_CMP */ if (likely(is_power_of_2(ic->tag_size))) { - if (unlikely(memcmp(dp, tag, to_copy))) - if (unlikely(!ic->discard) || - unlikely(memchr_inv(dp, DISCARD_FILLER, to_copy) != NULL)) { - goto thorough_test; - } + if (unlikely(crypto_memneq(dp, tag, to_copy))) + goto thorough_test; } else { unsigned int i, ts; thorough_test: ts = total_size; for (i = 0; i < to_copy; i++, ts--) { - if (unlikely(dp[i] != tag[i])) - may_be &= ~MAY_BE_HASH; - if (likely(dp[i] != DISCARD_FILLER)) - may_be &= ~MAY_BE_FILLER; + /* + * Warning: the control flow must not be + * dependent on match/mismatch of + * individual bytes. + */ + mismatch_hash |= dp[i] ^ tag[i]; + mismatch_filler |= dp[i] ^ DISCARD_FILLER; hash_offset++; if (unlikely(hash_offset == ic->tag_size)) { - if (unlikely(!may_be)) { + if (unlikely(mismatch_hash) && unlikely(mismatch_filler)) { dm_bufio_release(b); return ts; } hash_offset = 0; - may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0); + mismatch_hash = 0; + mismatch_filler = !ic->discard; } } } @@ -1476,8 +1477,6 @@ thorough_test: } while (unlikely(total_size)); return 0; -#undef MAY_BE_FILLER -#undef MAY_BE_HASH } struct flush_request { @@ -2076,7 +2075,7 @@ retry_kmap: char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); - if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { + if (unlikely(crypto_memneq(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx", logical_sector); dm_audit_log_bio(DM_MSG_PREFIX, "journal-checksum", @@ -2595,7 +2594,7 @@ static void dm_integrity_inline_recheck(struct work_struct *w) bio_put(outgoing_bio); integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, outgoing_data, digest); - if (unlikely(memcmp(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { + if (unlikely(crypto_memneq(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx", ic->dev->bdev, dio->bio_details.bi_iter.bi_sector); atomic64_inc(&ic->number_of_mismatches); @@ -2634,7 +2633,7 @@ static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status char *mem = bvec_kmap_local(&bv); //memset(mem, 0xff, ic->sectors_per_block << SECTOR_SHIFT); integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, digest); - if (unlikely(memcmp(digest, dio->integrity_payload + pos, + if (unlikely(crypto_memneq(digest, dio->integrity_payload + pos, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { kunmap_local(mem); dm_integrity_free_payload(dio); @@ -2708,7 +2707,7 @@ static void integrity_commit(struct work_struct *w) unsigned int i, j, n; struct bio *flushes; - del_timer(&ic->autocommit_timer); + timer_delete(&ic->autocommit_timer); if (ic->mode == 'I') return; @@ -2911,7 +2910,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned int write_start integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block), (char *)access_journal_data(ic, i, l), test_tag); - if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) { + if (unlikely(crypto_memneq(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) { dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ); dm_audit_log_target(DM_MSG_PREFIX, "integrity-replay-journal", ic->ti, 0); } @@ -3607,7 +3606,7 @@ static void dm_integrity_postsuspend(struct dm_target *ti) WARN_ON(unregister_reboot_notifier(&ic->reboot_notifier)); - del_timer_sync(&ic->autocommit_timer); + timer_delete_sync(&ic->autocommit_timer); if (ic->recalc_wq) drain_workqueue(ic->recalc_wq); @@ -3790,16 +3789,18 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: { - arg_count = 3; + arg_count = 1; /* buffer_sectors */ arg_count += !!ic->meta_dev; arg_count += ic->sectors_per_block != 1; arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)); arg_count += ic->reset_recalculate_flag; arg_count += ic->discard; - arg_count += ic->mode == 'J'; - arg_count += ic->mode == 'J'; - arg_count += ic->mode == 'B'; - arg_count += ic->mode == 'B'; + arg_count += ic->mode != 'I'; /* interleave_sectors */ + arg_count += ic->mode == 'J'; /* journal_sectors */ + arg_count += ic->mode == 'J'; /* journal_watermark */ + arg_count += ic->mode == 'J'; /* commit_time */ + arg_count += ic->mode == 'B'; /* sectors_per_bit */ + arg_count += ic->mode == 'B'; /* bitmap_flush_interval */ arg_count += !!ic->internal_hash_alg.alg_string; arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; @@ -3818,14 +3819,15 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" reset_recalculate"); if (ic->discard) DMEMIT(" allow_discards"); - DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); - DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); + if (ic->mode != 'I') + DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors); if (ic->mode == 'J') { __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100; watermark_percentage += ic->journal_entries / 2; do_div(watermark_percentage, ic->journal_entries); + DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); DMEMIT(" journal_watermark:%u", (unsigned int)watermark_percentage); DMEMIT(" commit_time:%u", ic->autocommit_msec); } @@ -4808,23 +4810,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv ti->error = "Cannot allocate bio set"; goto bad; } - r = bioset_integrity_create(&ic->recheck_bios, RECHECK_POOL_SIZE); - if (r) { - ti->error = "Cannot allocate bio integrity set"; - r = -ENOMEM; - goto bad; - } r = bioset_init(&ic->recalc_bios, 1, 0, BIOSET_NEED_BVECS); if (r) { ti->error = "Cannot allocate bio set"; goto bad; } - r = bioset_integrity_create(&ic->recalc_bios, 1); - if (r) { - ti->error = "Cannot allocate bio integrity set"; - r = -ENOMEM; - goto bad; - } } ic->metadata_wq = alloc_workqueue("dm-integrity-metadata", @@ -5081,16 +5071,19 @@ try_smaller_buffer: ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); if (!ic->recalc_bitmap) { + ti->error = "Could not allocate memory for bitmap"; r = -ENOMEM; goto bad; } ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); if (!ic->may_write_bitmap) { + ti->error = "Could not allocate memory for bitmap"; r = -ENOMEM; goto bad; } ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL); if (!ic->bbs) { + ti->error = "Could not allocate memory for bitmap"; r = -ENOMEM; goto bad; } @@ -5171,7 +5164,7 @@ static void dm_integrity_dtr(struct dm_target *ti) BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); BUG_ON(!list_empty(&ic->wait_list)); - if (ic->mode == 'B') + if (ic->mode == 'B' && ic->bitmap_flush_work.work.func) cancel_delayed_work_sync(&ic->bitmap_flush_work); if (ic->metadata_wq) destroy_workqueue(ic->metadata_wq); diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index d7a8e2f40db3..c37668790577 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -379,6 +379,7 @@ static void do_region(const blk_opf_t opf, unsigned int region, atomic_inc(&io->count); submit_bio(bio); + WARN_ON_ONCE(opf & REQ_ATOMIC && remaining); } while (remaining); } diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 49fb0f684193..66318aba4bdb 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -199,9 +199,10 @@ static size_t linear_dax_recovery_write(struct dm_target *ti, pgoff_t pgoff, static struct target_type linear_target = { .name = "linear", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT | - DM_TARGET_ZONED_HM | DM_TARGET_PASSES_CRYPTO, + DM_TARGET_ZONED_HM | DM_TARGET_PASSES_CRYPTO | + DM_TARGET_ATOMIC_WRITES, .report_zones = linear_report_zones, .module = THIS_MODULE, .ctr = linear_ctr, diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 637977acc3dc..6c98f4ae5ea9 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -815,7 +815,7 @@ static void enable_nopath_timeout(struct multipath *m) static void disable_nopath_timeout(struct multipath *m) { - del_timer_sync(&m->nopath_timer); + timer_delete_sync(&m->nopath_timer); } /* diff --git a/drivers/md/dm-ps-io-affinity.c b/drivers/md/dm-ps-io-affinity.c index 461ee6b2044d..716807e511ee 100644 --- a/drivers/md/dm-ps-io-affinity.c +++ b/drivers/md/dm-ps-io-affinity.c @@ -116,7 +116,7 @@ static int ioa_create(struct path_selector *ps, unsigned int argc, char **argv) if (!s) return -ENOMEM; - s->path_map = kzalloc(nr_cpu_ids * sizeof(struct path_info *), + s->path_map = kcalloc(nr_cpu_ids, sizeof(struct path_info *), GFP_KERNEL); if (!s->path_map) goto free_selector; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 1e0d3b9b75d6..6adc55fd90d3 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3196,7 +3196,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (reshape_sectors || rs_is_raid1(rs)) { /* * We can only prepare for a reshape here, because the - * raid set needs to run to provide the repective reshape + * raid set needs to run to provide the respective reshape * check functions via its MD personality instance. * * So do the reshape check after md_run() succeeded. diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 9511dae5b556..785af4816584 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -133,10 +133,9 @@ static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) spin_lock_irqsave(&ms->lock, flags); should_wake = !(bl->head); bio_list_add(bl, bio); - spin_unlock_irqrestore(&ms->lock, flags); - if (should_wake) wakeup_mirrord(ms); + spin_unlock_irqrestore(&ms->lock, flags); } static void dispatch_bios(void *context, struct bio_list *bio_list) @@ -646,9 +645,9 @@ static void write_callback(unsigned long error, void *context) if (!ms->failures.head) should_wake = 1; bio_list_add(&ms->failures, bio); - spin_unlock_irqrestore(&ms->lock, flags); if (should_wake) wakeup_mirrord(ms); + spin_unlock_irqrestore(&ms->lock, flags); } static void do_write(struct mirror_set *ms, struct bio *bio) @@ -656,7 +655,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio) unsigned int i; struct dm_io_region io[MAX_NR_MIRRORS], *dest = io; struct mirror *m; - blk_opf_t op_flags = bio->bi_opf & (REQ_FUA | REQ_PREFLUSH); + blk_opf_t op_flags = bio->bi_opf & (REQ_FUA | REQ_PREFLUSH | REQ_ATOMIC); struct dm_io_request io_req = { .bi_opf = REQ_OP_WRITE | op_flags, .mem.type = DM_IO_BIO, @@ -1182,7 +1181,7 @@ static void mirror_dtr(struct dm_target *ti) { struct mirror_set *ms = ti->private; - del_timer_sync(&ms->timer); + timer_delete_sync(&ms->timer); flush_workqueue(ms->kmirrord_wq); flush_work(&ms->trigger_event); dm_kcopyd_client_destroy(ms->kcopyd_client); @@ -1483,8 +1482,9 @@ static int mirror_iterate_devices(struct dm_target *ti, static struct target_type mirror_target = { .name = "mirror", - .version = {1, 14, 0}, + .version = {1, 15, 0}, .module = THIS_MODULE, + .features = DM_TARGET_ATOMIC_WRITES, .ctr = mirror_ctr, .dtr = mirror_dtr, .map = mirror_map, diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 499f8cc8a39f..e23076f7ece2 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -547,7 +547,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) md->tag_set->ops = &dm_mq_ops; md->tag_set->queue_depth = dm_get_blk_mq_queue_depth(); md->tag_set->numa_node = md->numa_node_id; - md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING; + md->tag_set->flags = BLK_MQ_F_STACKING; md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues(); md->tag_set->driver_data = md; diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 4112071de0be..a1b7535c508a 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -465,8 +465,9 @@ static void stripe_io_hints(struct dm_target *ti, static struct target_type stripe_target = { .name = "striped", - .version = {1, 6, 0}, - .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT, + .version = {1, 7, 0}, + .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT | + DM_TARGET_ATOMIC_WRITES | DM_TARGET_PASSES_CRYPTO, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index bd8b796ae683..dd074c8ecbba 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -431,6 +431,13 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, return 0; } + mutex_lock(&q->limits_lock); + /* + * BLK_FEAT_ATOMIC_WRITES is not inherited from the bottom device in + * blk_stack_limits(), so do it manually. + */ + limits->features |= (q->limits.features & BLK_FEAT_ATOMIC_WRITES); + if (blk_stack_limits(limits, &q->limits, get_start_sect(bdev) + start) < 0) DMWARN("%s: adding target device %pg caused an alignment inconsistency: " @@ -448,6 +455,7 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, */ if (!dm_target_has_integrity(ti->type)) queue_limits_stack_integrity_bdev(limits, bdev); + mutex_unlock(&q->limits_lock); return 0; } @@ -523,8 +531,9 @@ static char **realloc_argv(unsigned int *size, char **old_argv) gfp = GFP_NOIO; } argv = kmalloc_array(new_size, sizeof(*argv), gfp); - if (argv && old_argv) { - memcpy(argv, old_argv, *size * sizeof(*argv)); + if (argv) { + if (old_argv) + memcpy(argv, old_argv, *size * sizeof(*argv)); *size = new_size; } @@ -697,6 +706,10 @@ int dm_table_add_target(struct dm_table *t, const char *type, DMERR("%s: zero-length target", dm_device_name(t->md)); return -EINVAL; } + if (start + len < start || start + len > LLONG_MAX >> SECTOR_SHIFT) { + DMERR("%s: too large device", dm_device_name(t->md)); + return -EINVAL; + } ti->type = dm_get_target_type(type); if (!ti->type) { @@ -1045,7 +1058,6 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * unsigned int min_pool_size = 0, pool_size; struct dm_md_mempools *pools; unsigned int bioset_flags = 0; - bool mempool_needs_integrity = t->integrity_supported; if (unlikely(type == DM_TYPE_NONE)) { DMERR("no table type is set, can't allocate mempools"); @@ -1070,8 +1082,6 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * per_io_data_size = max(per_io_data_size, ti->per_io_data_size); min_pool_size = max(min_pool_size, ti->num_flush_bios); - - mempool_needs_integrity |= ti->mempool_needs_integrity; } pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); front_pad = roundup(per_io_data_size, @@ -1081,15 +1091,9 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; if (bioset_init(&pools->io_bs, pool_size, io_front_pad, bioset_flags)) goto out_free_pools; - if (mempool_needs_integrity && - bioset_integrity_create(&pools->io_bs, pool_size)) - goto out_free_pools; init_bs: if (bioset_init(&pools->bs, pool_size, front_pad, 0)) goto out_free_pools; - if (mempool_needs_integrity && - bioset_integrity_create(&pools->bs, pool_size)) - goto out_free_pools; t->mempools = pools; return 0; @@ -1177,7 +1181,7 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile, t = dm_get_live_table(md, &srcu_idx); if (!t) - return 0; + goto put_live_table; for (unsigned int i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); @@ -1188,6 +1192,7 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile, (void *)key); } +put_live_table: dm_put_live_table(md, srcu_idx); return 0; } @@ -1250,6 +1255,7 @@ static int dm_table_construct_crypto_profile(struct dm_table *t) profile->max_dun_bytes_supported = UINT_MAX; memset(profile->modes_supported, 0xFF, sizeof(profile->modes_supported)); + profile->key_types_supported = ~0; for (i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); @@ -1492,6 +1498,18 @@ bool dm_table_has_no_data_devices(struct dm_table *t) return true; } +bool dm_table_is_wildcard(struct dm_table *t) +{ + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (!dm_target_is_wildcard(ti->type)) + return false; + } + + return true; +} + static int device_not_zoned(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1723,8 +1741,12 @@ static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev * sector_t start, sector_t len, void *data) { struct request_queue *q = bdev_get_queue(dev->bdev); + int b; - return !q->limits.max_write_zeroes_sectors; + mutex_lock(&q->limits_lock); + b = !q->limits.max_write_zeroes_sectors; + mutex_unlock(&q->limits_lock); + return b; } static bool dm_table_supports_write_zeroes(struct dm_table *t) @@ -1806,10 +1828,50 @@ static bool dm_table_supports_secure_erase(struct dm_table *t) return true; } +static int device_not_atomic_write_capable(struct dm_target *ti, + struct dm_dev *dev, sector_t start, + sector_t len, void *data) +{ + return !bdev_can_atomic_write(dev->bdev); +} + +static bool dm_table_supports_atomic_writes(struct dm_table *t) +{ + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (!dm_target_supports_atomic_writes(ti->type)) + return false; + + if (!ti->type->iterate_devices) + return false; + + if (ti->type->iterate_devices(ti, + device_not_atomic_write_capable, NULL)) { + return false; + } + } + return true; +} + +bool dm_table_supports_size_change(struct dm_table *t, sector_t old_size, + sector_t new_size) +{ + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && dm_has_zone_plugs(t->md) && + old_size != new_size) { + DMWARN("%s: device has zone write plug resources. " + "Cannot change size", + dm_device_name(t->md)); + return false; + } + return true; +} + int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { int r; + struct queue_limits old_limits; if (!dm_table_supports_nowait(t)) limits->features &= ~BLK_FEAT_NOWAIT; @@ -1836,25 +1898,30 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (dm_table_supports_flush(t)) limits->features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; - if (dm_table_supports_dax(t, device_not_dax_capable)) { + if (dm_table_supports_dax(t, device_not_dax_capable)) limits->features |= BLK_FEAT_DAX; - if (dm_table_supports_dax(t, device_not_dax_synchronous_capable)) - set_dax_synchronous(t->md->dax_dev); - } else + else limits->features &= ~BLK_FEAT_DAX; - if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) - dax_write_cache(t->md->dax_dev, true); - /* For a zoned table, setup the zone related queue attributes. */ - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && - (limits->features & BLK_FEAT_ZONED)) { - r = dm_set_zones_restrictions(t, q, limits); - if (r) - return r; + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { + if (limits->features & BLK_FEAT_ZONED) { + r = dm_set_zones_restrictions(t, q, limits); + if (r) + return r; + } else if (dm_has_zone_plugs(t->md)) { + DMWARN("%s: device has zone write plug resources. " + "Cannot switch to non-zoned table.", + dm_device_name(t->md)); + return -EINVAL; + } } - r = queue_limits_set(q, limits); + if (dm_table_supports_atomic_writes(t)) + limits->features |= BLK_FEAT_ATOMIC_WRITES; + + old_limits = queue_limits_start_update(q); + r = queue_limits_commit_update(q, limits); if (r) return r; @@ -1865,10 +1932,21 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && (limits->features & BLK_FEAT_ZONED)) { r = dm_revalidate_zones(t, q); - if (r) + if (r) { + queue_limits_set(q, &old_limits); return r; + } } + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) + dm_finalize_zone_settings(t, limits); + + if (dm_table_supports_dax(t, device_not_dax_synchronous_capable)) + set_dax_synchronous(t->md->dax_dev); + + if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) + dax_write_cache(t->md->dax_dev, true); + dm_update_crypto_profile(q, t); return 0; } diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c index 89cb7942ec5c..baf683cabb1b 100644 --- a/drivers/md/dm-vdo/block-map.c +++ b/drivers/md/dm-vdo/block-map.c @@ -451,7 +451,7 @@ static struct page_info * __must_check find_page(struct vdo_page_cache *cache, * select_lru_page() - Determine which page is least recently used. * * Picks the least recently used from among the non-busy entries at the front of each of the lru - * ring. Since whenever we mark a page busy we also put it to the end of the ring it is unlikely + * list. Since whenever we mark a page busy we also put it to the end of the list it is unlikely * that the entries at the front are busy unless the queue is very short, but not impossible. * * Return: A pointer to the info structure for a relevant page, or NULL if no such page can be @@ -1544,7 +1544,7 @@ static void write_page_if_not_dirtied(struct vdo_waiter *waiter, void *context) static void return_to_pool(struct block_map_zone *zone, struct pooled_vio *vio) { - return_vio_to_pool(zone->vio_pool, vio); + return_vio_to_pool(vio); check_for_drain_complete(zone); } @@ -1837,7 +1837,7 @@ static void finish_block_map_page_load(struct vdo_completion *completion) if (!vdo_copy_valid_page(vio->data, nonce, pbn, page)) vdo_format_block_map_page(page, nonce, pbn, false); - return_vio_to_pool(zone->vio_pool, pooled); + return_vio_to_pool(pooled); /* Release our claim to the load and wake any waiters */ release_page_lock(data_vio, "load"); @@ -1851,10 +1851,9 @@ static void handle_io_error(struct vdo_completion *completion) struct vio *vio = as_vio(completion); struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio); struct data_vio *data_vio = completion->parent; - struct block_map_zone *zone = pooled->context; vio_record_metadata_io_error(vio); - return_vio_to_pool(zone->vio_pool, pooled); + return_vio_to_pool(pooled); abort_load(data_vio, result); } @@ -2499,7 +2498,7 @@ static void finish_cursor(struct cursor *cursor) struct cursors *cursors = cursor->parent; struct vdo_completion *completion = cursors->completion; - return_vio_to_pool(cursors->pool, vdo_forget(cursor->vio)); + return_vio_to_pool(vdo_forget(cursor->vio)); if (--cursors->active_roots > 0) return; @@ -2746,7 +2745,7 @@ static int __must_check initialize_block_map_zone(struct block_map *map, if (result != VDO_SUCCESS) return result; - result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE, + result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE, 1, zone->thread_id, VIO_TYPE_BLOCK_MAP_INTERIOR, VIO_PRIORITY_METADATA, zone, &zone->vio_pool); if (result != VDO_SUCCESS) diff --git a/drivers/md/dm-vdo/constants.h b/drivers/md/dm-vdo/constants.h index a8c4d6e24b38..2a8b03779f87 100644 --- a/drivers/md/dm-vdo/constants.h +++ b/drivers/md/dm-vdo/constants.h @@ -44,9 +44,6 @@ enum { /* The default size of each slab journal, in blocks */ DEFAULT_VDO_SLAB_JOURNAL_SIZE = 224, - /* Unit test minimum */ - MINIMUM_VDO_SLAB_JOURNAL_BLOCKS = 2, - /* * The initial size of lbn_operations and pbn_operations, which is based upon the expected * maximum number of outstanding VIOs. This value was chosen to make it highly unlikely diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index 3f3d29af1be4..3c58b941e067 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -226,7 +226,7 @@ struct hash_lock { * A list containing the data VIOs sharing this lock, all having the same record name and * data block contents, linked by their hash_lock_node fields. */ - struct list_head duplicate_ring; + struct list_head duplicate_vios; /* The number of data_vios sharing this lock instance */ data_vio_count_t reference_count; @@ -343,7 +343,7 @@ static void return_hash_lock_to_pool(struct hash_zone *zone, struct hash_lock *l { memset(lock, 0, sizeof(*lock)); INIT_LIST_HEAD(&lock->pool_node); - INIT_LIST_HEAD(&lock->duplicate_ring); + INIT_LIST_HEAD(&lock->duplicate_vios); vdo_waitq_init(&lock->waiters); list_add_tail(&lock->pool_node, &zone->lock_pool); } @@ -441,7 +441,7 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock) VDO_ASSERT_LOG_ONLY(data_vio->hash_zone != NULL, "must have a hash zone when holding a hash lock"); VDO_ASSERT_LOG_ONLY(!list_empty(&data_vio->hash_lock_entry), - "must be on a hash lock ring when holding a hash lock"); + "must be on a hash lock list when holding a hash lock"); VDO_ASSERT_LOG_ONLY(old_lock->reference_count > 0, "hash lock reference must be counted"); @@ -464,10 +464,10 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock) if (new_lock != NULL) { /* - * Keep all data_vios sharing the lock on a ring since they can complete in any + * Keep all data_vios sharing the lock on a list since they can complete in any * order and we'll always need a pointer to one to compare data. */ - list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_ring); + list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_vios); new_lock->reference_count += 1; if (new_lock->max_references < new_lock->reference_count) new_lock->max_references = new_lock->reference_count; @@ -1789,10 +1789,10 @@ static bool is_hash_collision(struct hash_lock *lock, struct data_vio *candidate struct hash_zone *zone; bool collides; - if (list_empty(&lock->duplicate_ring)) + if (list_empty(&lock->duplicate_vios)) return false; - lock_holder = list_first_entry(&lock->duplicate_ring, struct data_vio, + lock_holder = list_first_entry(&lock->duplicate_vios, struct data_vio, hash_lock_entry); zone = candidate->hash_zone; collides = !blocks_equal(lock_holder->vio.data, candidate->vio.data); @@ -1815,7 +1815,7 @@ static inline int assert_hash_lock_preconditions(const struct data_vio *data_vio return result; result = VDO_ASSERT(list_empty(&data_vio->hash_lock_entry), - "must not already be a member of a hash lock ring"); + "must not already be a member of a hash lock list"); if (result != VDO_SUCCESS) return result; @@ -1942,8 +1942,8 @@ void vdo_release_hash_lock(struct data_vio *data_vio) "returned hash lock must not be in use with state %s", get_hash_lock_state_name(lock->state)); VDO_ASSERT_LOG_ONLY(list_empty(&lock->pool_node), - "hash lock returned to zone must not be in a pool ring"); - VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_ring), + "hash lock returned to zone must not be in a pool list"); + VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_vios), "hash lock returned to zone must not reference DataVIOs"); return_hash_lock_to_pool(zone, lock); @@ -2261,7 +2261,7 @@ static void check_for_drain_complete(struct hash_zone *zone) if ((atomic_read(&zone->timer_state) == DEDUPE_QUERY_TIMER_IDLE) || change_timer_state(zone, DEDUPE_QUERY_TIMER_RUNNING, DEDUPE_QUERY_TIMER_IDLE)) { - del_timer_sync(&zone->timer); + timer_delete_sync(&zone->timer); } else { /* * There is an in flight time-out, which must get processed before we can continue. diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c index 100e92f8f866..b7cc0f41caca 100644 --- a/drivers/md/dm-vdo/encodings.c +++ b/drivers/md/dm-vdo/encodings.c @@ -711,24 +711,11 @@ int vdo_configure_slab(block_count_t slab_size, block_count_t slab_journal_block ref_blocks = vdo_get_saved_reference_count_size(slab_size - slab_journal_blocks); meta_blocks = (ref_blocks + slab_journal_blocks); - /* Make sure test code hasn't configured slabs to be too small. */ + /* Make sure configured slabs are not too small. */ if (meta_blocks >= slab_size) return VDO_BAD_CONFIGURATION; - /* - * If the slab size is very small, assume this must be a unit test and override the number - * of data blocks to be a power of two (wasting blocks in the slab). Many tests need their - * data_blocks fields to be the exact capacity of the configured volume, and that used to - * fall out since they use a power of two for the number of data blocks, the slab size was - * a power of two, and every block in a slab was a data block. - * - * TODO: Try to figure out some way of structuring testParameters and unit tests so this - * hack isn't needed without having to edit several unit tests every time the metadata size - * changes by one block. - */ data_blocks = slab_size - meta_blocks; - if ((slab_size < 1024) && !is_power_of_2(data_blocks)) - data_blocks = ((block_count_t) 1 << ilog2(data_blocks)); /* * Configure the slab journal thresholds. The flush threshold is 168 of 224 blocks in @@ -1221,11 +1208,6 @@ int vdo_validate_config(const struct vdo_config *config, if (result != VDO_SUCCESS) return result; - result = VDO_ASSERT(config->slab_journal_blocks >= MINIMUM_VDO_SLAB_JOURNAL_BLOCKS, - "slab journal size meets minimum size"); - if (result != VDO_SUCCESS) - return result; - result = VDO_ASSERT(config->slab_journal_blocks <= config->slab_size, "slab journal size is within expected bound"); if (result != VDO_SUCCESS) diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c index af8fab83b0f3..61edf2b72427 100644 --- a/drivers/md/dm-vdo/indexer/index-layout.c +++ b/drivers/md/dm-vdo/indexer/index-layout.c @@ -54,7 +54,6 @@ * Each save also has a unique nonce. */ -#define MAGIC_SIZE 32 #define NONCE_INFO_SIZE 32 #define MAX_SAVES 2 @@ -98,9 +97,11 @@ enum region_type { #define SUPER_VERSION_CURRENT 3 #define SUPER_VERSION_MAXIMUM 7 -static const u8 LAYOUT_MAGIC[MAGIC_SIZE] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*"; +static const u8 LAYOUT_MAGIC[] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*"; static const u64 REGION_MAGIC = 0x416c6252676e3031; /* 'AlbRgn01' */ +#define MAGIC_SIZE (sizeof(LAYOUT_MAGIC) - 1) + struct region_header { u64 magic; u64 region_blocks; diff --git a/drivers/md/dm-vdo/indexer/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c index aee0914d604a..aa575a24e0b2 100644 --- a/drivers/md/dm-vdo/indexer/index-session.c +++ b/drivers/md/dm-vdo/indexer/index-session.c @@ -100,7 +100,6 @@ static int get_index_session(struct uds_index_session *index_session) int uds_launch_request(struct uds_request *request) { - size_t internal_size; int result; if (request->callback == NULL) { @@ -121,10 +120,7 @@ int uds_launch_request(struct uds_request *request) } /* Reset all internal fields before processing. */ - internal_size = - sizeof(struct uds_request) - offsetof(struct uds_request, zone_number); - // FIXME should be using struct_group for this instead - memset((char *) request + sizeof(*request) - internal_size, 0, internal_size); + memset(&request->internal, 0, sizeof(request->internal)); result = get_index_session(request->session); if (result != UDS_SUCCESS) diff --git a/drivers/md/dm-vdo/indexer/indexer.h b/drivers/md/dm-vdo/indexer/indexer.h index 183a94eb7e92..7c1fc4577f5b 100644 --- a/drivers/md/dm-vdo/indexer/indexer.h +++ b/drivers/md/dm-vdo/indexer/indexer.h @@ -8,6 +8,7 @@ #include <linux/mutex.h> #include <linux/sched.h> +#include <linux/stddef.h> #include <linux/types.h> #include <linux/wait.h> @@ -73,7 +74,7 @@ enum uds_request_type { /* Remove any mapping for a name. */ UDS_DELETE, -}; +} __packed; enum uds_open_index_type { /* Create a new index. */ @@ -226,7 +227,7 @@ struct uds_zone_message { enum uds_zone_message_type type; /* The virtual chapter number to which the message applies */ u64 virtual_chapter; -}; +} __packed; struct uds_index_session; struct uds_index; @@ -253,34 +254,32 @@ struct uds_request { /* The existing data associated with the request name, if any */ struct uds_record_data old_metadata; - /* Either UDS_SUCCESS or an error code for the request */ - int status; /* True if the record name had an existing entry in the index */ bool found; + /* Either UDS_SUCCESS or an error code for the request */ + int status; - /* - * The remaining fields are used internally and should not be altered by clients. The index - * relies on zone_number being the first field in this section. - */ - - /* The number of the zone which will process this request*/ - unsigned int zone_number; - /* A link for adding a request to a lock-free queue */ - struct funnel_queue_entry queue_link; - /* A link for adding a request to a standard linked list */ - struct uds_request *next_request; - /* A pointer to the index processing this request */ - struct uds_index *index; - /* Control message for coordinating between zones */ - struct uds_zone_message zone_message; - /* If true, process request immediately by waking the worker thread */ - bool unbatched; - /* If true, continue this request before processing newer requests */ - bool requeued; - /* The virtual chapter containing the record name, if known */ - u64 virtual_chapter; - /* The region of the index containing the record name */ - enum uds_index_region location; + /* The remaining fields are used internally and should not be altered by clients. */ + struct_group(internal, + /* The virtual chapter containing the record name, if known */ + u64 virtual_chapter; + /* The region of the index containing the record name */ + enum uds_index_region location; + /* If true, process request immediately by waking the worker thread */ + bool unbatched; + /* If true, continue this request before processing newer requests */ + bool requeued; + /* Control message for coordinating between zones */ + struct uds_zone_message zone_message; + /* The number of the zone which will process this request*/ + unsigned int zone_number; + /* A link for adding a request to a lock-free queue */ + struct funnel_queue_entry queue_link; + /* A link for adding a request to a standard linked list */ + struct uds_request *next_request; + /* A pointer to the index processing this request */ + struct uds_index *index; + ); }; /* A session is required for most index operations. */ diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c index 421e5436c32c..11d47770b54d 100644 --- a/drivers/md/dm-vdo/io-submitter.c +++ b/drivers/md/dm-vdo/io-submitter.c @@ -327,6 +327,7 @@ void vdo_submit_data_vio(struct data_vio *data_vio) * @error_handler: the handler for submission or I/O errors (may be NULL) * @operation: the type of I/O to perform * @data: the buffer to read or write (may be NULL) + * @size: the I/O amount in bytes * * The vio is enqueued on a vdo bio queue so that bio submission (which may block) does not block * other vdo threads. @@ -338,7 +339,7 @@ void vdo_submit_data_vio(struct data_vio *data_vio) */ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, bio_end_io_t callback, vdo_action_fn error_handler, - blk_opf_t operation, char *data) + blk_opf_t operation, char *data, int size) { int result; struct vdo_completion *completion = &vio->completion; @@ -349,7 +350,8 @@ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, vdo_reset_completion(completion); completion->error_handler = error_handler; - result = vio_reset_bio(vio, data, callback, operation | REQ_META, physical); + result = vio_reset_bio_with_size(vio, data, size, callback, operation | REQ_META, + physical); if (result != VDO_SUCCESS) { continue_vio(vio, result); return; diff --git a/drivers/md/dm-vdo/io-submitter.h b/drivers/md/dm-vdo/io-submitter.h index 80748699496f..3088f11055fd 100644 --- a/drivers/md/dm-vdo/io-submitter.h +++ b/drivers/md/dm-vdo/io-submitter.h @@ -8,6 +8,7 @@ #include <linux/bio.h> +#include "constants.h" #include "types.h" struct io_submitter; @@ -26,14 +27,25 @@ void vdo_submit_data_vio(struct data_vio *data_vio); void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, bio_end_io_t callback, vdo_action_fn error_handler, - blk_opf_t operation, char *data); + blk_opf_t operation, char *data, int size); static inline void vdo_submit_metadata_vio(struct vio *vio, physical_block_number_t physical, bio_end_io_t callback, vdo_action_fn error_handler, blk_opf_t operation) { __submit_metadata_vio(vio, physical, callback, error_handler, - operation, vio->data); + operation, vio->data, vio->block_count * VDO_BLOCK_SIZE); +} + +static inline void vdo_submit_metadata_vio_with_size(struct vio *vio, + physical_block_number_t physical, + bio_end_io_t callback, + vdo_action_fn error_handler, + blk_opf_t operation, + int size) +{ + __submit_metadata_vio(vio, physical, callback, error_handler, + operation, vio->data, size); } static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback, @@ -41,7 +53,7 @@ static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback, { /* FIXME: Can we just use REQ_OP_FLUSH? */ __submit_metadata_vio(vio, 0, callback, error_handler, - REQ_OP_WRITE | REQ_PREFLUSH, NULL); + REQ_OP_WRITE | REQ_PREFLUSH, NULL, 0); } #endif /* VDO_IO_SUBMITTER_H */ diff --git a/drivers/md/dm-vdo/packer.h b/drivers/md/dm-vdo/packer.h index 0f3be44710b5..8c8d6892582d 100644 --- a/drivers/md/dm-vdo/packer.h +++ b/drivers/md/dm-vdo/packer.h @@ -46,7 +46,7 @@ struct compressed_block { /* * Each packer_bin holds an incomplete batch of data_vios that only partially fill a compressed - * block. The bins are kept in a ring sorted by the amount of unused space so the first bin with + * block. The bins are kept in a list sorted by the amount of unused space so the first bin with * enough space to hold a newly-compressed data_vio can easily be found. When the bin fills up or * is flushed, the first uncanceled data_vio in the bin is selected to be the agent for that bin. * Upon entering the packer, each data_vio already has its compressed data in the first slot of the diff --git a/drivers/md/dm-vdo/priority-table.c b/drivers/md/dm-vdo/priority-table.c index 42d3d8d0e4b5..9bae8256ba4e 100644 --- a/drivers/md/dm-vdo/priority-table.c +++ b/drivers/md/dm-vdo/priority-table.c @@ -199,7 +199,7 @@ void vdo_priority_table_remove(struct priority_table *table, struct list_head *e /* * Remove the entry from the bucket list, remembering a pointer to another entry in the - * ring. + * list. */ next_entry = entry->next; list_del_init(entry); diff --git a/drivers/md/dm-vdo/recovery-journal.h b/drivers/md/dm-vdo/recovery-journal.h index 899071173015..25e7ec6d19f6 100644 --- a/drivers/md/dm-vdo/recovery-journal.h +++ b/drivers/md/dm-vdo/recovery-journal.h @@ -43,9 +43,9 @@ * has a vio which is used to commit that block to disk. The vio's data is the on-disk * representation of the journal block. In addition each in-memory block has a buffer which is used * to accumulate entries while a partial commit of the block is in progress. In-memory blocks are - * kept on two rings. Free blocks live on the 'free_tail_blocks' ring. When a block becomes active - * (see below) it is moved to the 'active_tail_blocks' ring. When a block is fully committed, it is - * moved back to the 'free_tail_blocks' ring. + * kept on two lists. Free blocks live on the 'free_tail_blocks' list. When a block becomes active + * (see below) it is moved to the 'active_tail_blocks' list. When a block is fully committed, it is + * moved back to the 'free_tail_blocks' list. * * When entries are added to the journal, they are added to the active in-memory block, as * indicated by the 'active_block' field. If the caller wishes to wait for the entry to be diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index 8f0a35c63af6..f3d80ff7bef5 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -139,7 +139,7 @@ static bool is_slab_journal_blank(const struct vdo_slab *slab) } /** - * mark_slab_journal_dirty() - Put a slab journal on the dirty ring of its allocator in the correct + * mark_slab_journal_dirty() - Put a slab journal on the dirty list of its allocator in the correct * order. * @journal: The journal to be marked dirty. * @lock: The recovery journal lock held by the slab journal. @@ -414,8 +414,7 @@ static void complete_reaping(struct vdo_completion *completion) { struct slab_journal *journal = completion->parent; - return_vio_to_pool(journal->slab->allocator->vio_pool, - vio_as_pooled_vio(as_vio(vdo_forget(completion)))); + return_vio_to_pool(vio_as_pooled_vio(as_vio(completion))); finish_reaping(journal); reap_slab_journal(journal); } @@ -698,7 +697,7 @@ static void complete_write(struct vdo_completion *completion) sequence_number_t committed = get_committing_sequence_number(pooled); list_del_init(&pooled->list_entry); - return_vio_to_pool(journal->slab->allocator->vio_pool, vdo_forget(pooled)); + return_vio_to_pool(pooled); if (result != VDO_SUCCESS) { vio_record_metadata_io_error(as_vio(completion)); @@ -822,7 +821,7 @@ static void commit_tail(struct slab_journal *journal) /* * Since we are about to commit the tail block, this journal no longer needs to be on the - * ring of journals which the recovery journal might ask to commit. + * list of journals which the recovery journal might ask to commit. */ mark_slab_journal_clean(journal); @@ -1076,7 +1075,7 @@ static void finish_reference_block_write(struct vdo_completion *completion) /* Release the slab journal lock. */ adjust_slab_journal_block_reference(&slab->journal, block->slab_journal_lock_to_release, -1); - return_vio_to_pool(slab->allocator->vio_pool, pooled); + return_vio_to_pool(pooled); /* * We can't clear the is_writing flag earlier as releasing the slab journal lock may cause @@ -1170,8 +1169,8 @@ static void handle_io_error(struct vdo_completion *completion) struct vdo_slab *slab = ((struct reference_block *) completion->parent)->slab; vio_record_metadata_io_error(vio); - return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio)); - slab->active_count--; + return_vio_to_pool(vio_as_pooled_vio(vio)); + slab->active_count -= vio->io_size / VDO_BLOCK_SIZE; vdo_enter_read_only_mode(slab->allocator->depot->vdo, result); check_if_slab_drained(slab); } @@ -1372,7 +1371,7 @@ static unsigned int calculate_slab_priority(struct vdo_slab *slab) static void prioritize_slab(struct vdo_slab *slab) { VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), - "a slab must not already be on a ring when prioritizing"); + "a slab must not already be on a list when prioritizing"); slab->priority = calculate_slab_priority(slab); vdo_priority_table_enqueue(slab->allocator->prioritized_slabs, slab->priority, &slab->allocq_entry); @@ -2165,28 +2164,95 @@ static void dirty_all_reference_blocks(struct vdo_slab *slab) dirty_block(&slab->reference_blocks[i]); } +static inline bool journal_points_equal(struct journal_point first, + struct journal_point second) +{ + return ((first.sequence_number == second.sequence_number) && + (first.entry_count == second.entry_count)); +} + /** - * clear_provisional_references() - Clear the provisional reference counts from a reference block. - * @block: The block to clear. + * match_bytes() - Check an 8-byte word for bytes matching the value specified + * @input: A word to examine the bytes of + * @match: The byte value sought + * + * Return: 1 in each byte when the corresponding input byte matched, 0 otherwise */ -static void clear_provisional_references(struct reference_block *block) +static inline u64 match_bytes(u64 input, u8 match) { - vdo_refcount_t *counters = get_reference_counters_for_block(block); - block_count_t j; + u64 temp = input ^ (match * 0x0101010101010101ULL); + /* top bit of each byte is set iff top bit of temp byte is clear; rest are 0 */ + u64 test_top_bits = ~temp & 0x8080808080808080ULL; + /* top bit of each byte is set iff low 7 bits of temp byte are clear; rest are useless */ + u64 test_low_bits = 0x8080808080808080ULL - (temp & 0x7f7f7f7f7f7f7f7fULL); + /* return 1 when both tests indicate temp byte is 0 */ + return (test_top_bits & test_low_bits) >> 7; +} + +/** + * count_valid_references() - Process a newly loaded refcount array + * @counters: the array of counters from a metadata block + * + * Scan a 8-byte-aligned array of counters, fixing up any "provisional" values that weren't + * cleaned up at shutdown, changing them internally to "empty". + * + * Return: the number of blocks that are referenced (counters not "empty") + */ +static unsigned int count_valid_references(vdo_refcount_t *counters) +{ + u64 *words = (u64 *)counters; + /* It's easier to count occurrences of a specific byte than its absences. */ + unsigned int empty_count = 0; + /* For speed, we process 8 bytes at once. */ + unsigned int words_left = COUNTS_PER_BLOCK / sizeof(u64); + + /* + * Sanity check assumptions used for optimizing this code: Counters are bytes. The counter + * array is a multiple of the word size. + */ + BUILD_BUG_ON(sizeof(vdo_refcount_t) != 1); + BUILD_BUG_ON((COUNTS_PER_BLOCK % sizeof(u64)) != 0); + + while (words_left > 0) { + /* + * This is used effectively as 8 byte-size counters. Byte 0 counts how many words + * had the target value found in byte 0, etc. We just have to avoid overflow. + */ + u64 split_count = 0; + /* + * The counter "% 255" trick used below to fold split_count into empty_count + * imposes a limit of 254 bytes examined each iteration of the outer loop. We + * process a word at a time, so that limit gets rounded down to 31 u64 words. + */ + const unsigned int max_words_per_iteration = 254 / sizeof(u64); + unsigned int iter_words_left = min_t(unsigned int, words_left, + max_words_per_iteration); + + words_left -= iter_words_left; + + while (iter_words_left--) { + u64 word = *words; + u64 temp; + + /* First, if we have any provisional refcount values, clear them. */ + temp = match_bytes(word, PROVISIONAL_REFERENCE_COUNT); + if (temp) { + /* + * 'temp' has 0x01 bytes where 'word' has PROVISIONAL; this xor + * will alter just those bytes, changing PROVISIONAL to EMPTY. + */ + word ^= temp * (PROVISIONAL_REFERENCE_COUNT ^ EMPTY_REFERENCE_COUNT); + *words = word; + } - for (j = 0; j < COUNTS_PER_BLOCK; j++) { - if (counters[j] == PROVISIONAL_REFERENCE_COUNT) { - counters[j] = EMPTY_REFERENCE_COUNT; - block->allocated_count--; + /* Now count the EMPTY_REFERENCE_COUNT bytes, updating the 8 counters. */ + split_count += match_bytes(word, EMPTY_REFERENCE_COUNT); + words++; } + empty_count += split_count % 255; } -} -static inline bool journal_points_equal(struct journal_point first, - struct journal_point second) -{ - return ((first.sequence_number == second.sequence_number) && - (first.entry_count == second.entry_count)); + return COUNTS_PER_BLOCK - empty_count; } /** @@ -2197,7 +2263,6 @@ static inline bool journal_points_equal(struct journal_point first, static void unpack_reference_block(struct packed_reference_block *packed, struct reference_block *block) { - block_count_t index; sector_count_t i; struct vdo_slab *slab = block->slab; vdo_refcount_t *counters = get_reference_counters_for_block(block); @@ -2223,11 +2288,7 @@ static void unpack_reference_block(struct packed_reference_block *packed, } } - block->allocated_count = 0; - for (index = 0; index < COUNTS_PER_BLOCK; index++) { - if (counters[index] != EMPTY_REFERENCE_COUNT) - block->allocated_count++; - } + block->allocated_count = count_valid_references(counters); } /** @@ -2240,13 +2301,19 @@ static void finish_reference_block_load(struct vdo_completion *completion) struct pooled_vio *pooled = vio_as_pooled_vio(vio); struct reference_block *block = completion->parent; struct vdo_slab *slab = block->slab; + unsigned int block_count = vio->io_size / VDO_BLOCK_SIZE; + unsigned int i; + char *data = vio->data; - unpack_reference_block((struct packed_reference_block *) vio->data, block); - return_vio_to_pool(slab->allocator->vio_pool, pooled); - slab->active_count--; - clear_provisional_references(block); + for (i = 0; i < block_count; i++, block++, data += VDO_BLOCK_SIZE) { + struct packed_reference_block *packed = (struct packed_reference_block *) data; + + unpack_reference_block(packed, block); + slab->free_blocks -= block->allocated_count; + } + return_vio_to_pool(pooled); + slab->active_count -= block_count; - slab->free_blocks -= block->allocated_count; check_if_slab_drained(slab); } @@ -2260,23 +2327,25 @@ static void load_reference_block_endio(struct bio *bio) } /** - * load_reference_block() - After a block waiter has gotten a VIO from the VIO pool, load the - * block. - * @waiter: The waiter of the block to load. + * load_reference_block_group() - After a block waiter has gotten a VIO from the VIO pool, load + * a set of blocks. + * @waiter: The waiter of the first block to load. * @context: The VIO returned by the pool. */ -static void load_reference_block(struct vdo_waiter *waiter, void *context) +static void load_reference_block_group(struct vdo_waiter *waiter, void *context) { struct pooled_vio *pooled = context; struct vio *vio = &pooled->vio; struct reference_block *block = container_of(waiter, struct reference_block, waiter); - size_t block_offset = (block - block->slab->reference_blocks); + u32 block_offset = block - block->slab->reference_blocks; + u32 max_block_count = block->slab->reference_block_count - block_offset; + u32 block_count = min_t(int, vio->block_count, max_block_count); vio->completion.parent = block; - vdo_submit_metadata_vio(vio, block->slab->ref_counts_origin + block_offset, - load_reference_block_endio, handle_io_error, - REQ_OP_READ); + vdo_submit_metadata_vio_with_size(vio, block->slab->ref_counts_origin + block_offset, + load_reference_block_endio, handle_io_error, + REQ_OP_READ, block_count * VDO_BLOCK_SIZE); } /** @@ -2286,14 +2355,21 @@ static void load_reference_block(struct vdo_waiter *waiter, void *context) static void load_reference_blocks(struct vdo_slab *slab) { block_count_t i; + u64 blocks_per_vio = slab->allocator->refcount_blocks_per_big_vio; + struct vio_pool *pool = slab->allocator->refcount_big_vio_pool; + + if (!pool) { + pool = slab->allocator->vio_pool; + blocks_per_vio = 1; + } slab->free_blocks = slab->block_count; slab->active_count = slab->reference_block_count; - for (i = 0; i < slab->reference_block_count; i++) { + for (i = 0; i < slab->reference_block_count; i += blocks_per_vio) { struct vdo_waiter *waiter = &slab->reference_blocks[i].waiter; - waiter->callback = load_reference_block; - acquire_vio_from_pool(slab->allocator->vio_pool, waiter); + waiter->callback = load_reference_block_group; + acquire_vio_from_pool(pool, waiter); } } @@ -2429,7 +2505,7 @@ static void finish_loading_journal(struct vdo_completion *completion) initialize_journal_state(journal); } - return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio)); + return_vio_to_pool(vio_as_pooled_vio(vio)); vdo_finish_loading_with_result(&slab->state, allocate_counters_if_clean(slab)); } @@ -2449,7 +2525,7 @@ static void handle_load_error(struct vdo_completion *completion) struct vio *vio = as_vio(completion); vio_record_metadata_io_error(vio); - return_vio_to_pool(journal->slab->allocator->vio_pool, vio_as_pooled_vio(vio)); + return_vio_to_pool(vio_as_pooled_vio(vio)); vdo_finish_loading_with_result(&journal->slab->state, result); } @@ -2547,7 +2623,7 @@ static void queue_slab(struct vdo_slab *slab) int result; VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), - "a requeued slab must not already be on a ring"); + "a requeued slab must not already be on a list"); if (vdo_is_read_only(allocator->depot->vdo)) return; @@ -2700,6 +2776,7 @@ static void finish_scrubbing(struct slab_scrubber *scrubber, int result) vdo_log_info("VDO commencing normal operation"); else if (prior_state == VDO_RECOVERING) vdo_log_info("Exiting recovery mode"); + free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool)); } /* @@ -3281,7 +3358,7 @@ int vdo_release_block_reference(struct block_allocator *allocator, * This is a min_heap callback function orders slab_status structures using the 'is_clean' field as * the primary key and the 'emptiness' field as the secondary key. * - * Slabs need to be pushed onto the rings in the same order they are to be popped off. Popping + * Slabs need to be pushed onto the lists in the same order they are to be popped off. Popping * should always get the most empty first, so pushing should be from most empty to least empty. * Thus, the ordering is reversed from the usual sense since min_heap returns smaller elements * before larger ones. @@ -3983,6 +4060,7 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot, struct vdo *vdo = depot->vdo; block_count_t max_free_blocks = depot->slab_config.data_blocks; unsigned int max_priority = (2 + ilog2(max_free_blocks)); + u32 reference_block_count, refcount_reads_needed, refcount_blocks_per_vio; *allocator = (struct block_allocator) { .depot = depot, @@ -4000,12 +4078,24 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot, return result; vdo_initialize_completion(&allocator->completion, vdo, VDO_BLOCK_ALLOCATOR_COMPLETION); - result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, allocator->thread_id, + result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, 1, allocator->thread_id, VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, allocator, &allocator->vio_pool); if (result != VDO_SUCCESS) return result; + /* Initialize the refcount-reading vio pool. */ + reference_block_count = vdo_get_saved_reference_count_size(depot->slab_config.slab_blocks); + refcount_reads_needed = DIV_ROUND_UP(reference_block_count, MAX_BLOCKS_PER_VIO); + refcount_blocks_per_vio = DIV_ROUND_UP(reference_block_count, refcount_reads_needed); + allocator->refcount_blocks_per_big_vio = refcount_blocks_per_vio; + result = make_vio_pool(vdo, BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE, + allocator->refcount_blocks_per_big_vio, allocator->thread_id, + VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, + NULL, &allocator->refcount_big_vio_pool); + if (result != VDO_SUCCESS) + return result; + result = initialize_slab_scrubber(allocator); if (result != VDO_SUCCESS) return result; @@ -4223,6 +4313,7 @@ void vdo_free_slab_depot(struct slab_depot *depot) uninitialize_allocator_summary(allocator); uninitialize_scrubber_vio(&allocator->scrubber); free_vio_pool(vdo_forget(allocator->vio_pool)); + free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool)); vdo_free_priority_table(vdo_forget(allocator->prioritized_slabs)); } diff --git a/drivers/md/dm-vdo/slab-depot.h b/drivers/md/dm-vdo/slab-depot.h index f234853501ca..fadc0c9d4dc4 100644 --- a/drivers/md/dm-vdo/slab-depot.h +++ b/drivers/md/dm-vdo/slab-depot.h @@ -45,6 +45,13 @@ enum { /* The number of vios in the vio pool is proportional to the throughput of the VDO. */ BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128, + + /* + * The number of vios in the vio pool used for loading reference count data. A slab's + * refcounts is capped at ~8MB, and we process one at a time in a zone, so 9 should be + * plenty. + */ + BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE = 9, }; /* @@ -248,7 +255,7 @@ struct vdo_slab { /* A list of the dirty blocks waiting to be written out */ struct vdo_wait_queue dirty_blocks; - /* The number of blocks which are currently writing */ + /* The number of blocks which are currently reading or writing */ size_t active_count; /* A waiter object for updating the slab summary */ @@ -425,6 +432,10 @@ struct block_allocator { /* The vio pool for reading and writing block allocator metadata */ struct vio_pool *vio_pool; + /* The vio pool for large initial reads of ref count areas */ + struct vio_pool *refcount_big_vio_pool; + /* How many ref count blocks are read per vio at initial load */ + u32 refcount_blocks_per_big_vio; /* The dm_kcopyd client for erasing slab journals */ struct dm_kcopyd_client *eraser; /* Iterator over the slabs to be erased */ diff --git a/drivers/md/dm-vdo/types.h b/drivers/md/dm-vdo/types.h index dbe892b10f26..cdf36e7d7702 100644 --- a/drivers/md/dm-vdo/types.h +++ b/drivers/md/dm-vdo/types.h @@ -376,6 +376,9 @@ struct vio { /* The size of this vio in blocks */ unsigned int block_count; + /* The amount of data to be read or written, in bytes */ + unsigned int io_size; + /* The data being read or written. */ char *data; diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c index a7e32baab4af..80b608674022 100644 --- a/drivers/md/dm-vdo/vdo.c +++ b/drivers/md/dm-vdo/vdo.c @@ -31,9 +31,7 @@ #include <linux/completion.h> #include <linux/device-mapper.h> -#include <linux/kernel.h> #include <linux/lz4.h> -#include <linux/module.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/types.h> @@ -142,12 +140,6 @@ static void finish_vdo_request_queue(void *ptr) vdo_unregister_allocating_thread(); } -#ifdef MODULE -#define MODULE_NAME THIS_MODULE->name -#else -#define MODULE_NAME "dm-vdo" -#endif /* MODULE */ - static const struct vdo_work_queue_type default_queue_type = { .start = start_vdo_request_queue, .finish = finish_vdo_request_queue, @@ -559,8 +551,7 @@ int vdo_make(unsigned int instance, struct device_config *config, char **reason, *vdo_ptr = vdo; snprintf(vdo->thread_name_prefix, sizeof(vdo->thread_name_prefix), - "%s%u", MODULE_NAME, instance); - BUG_ON(vdo->thread_name_prefix[0] == '\0'); + "vdo%u", instance); result = vdo_allocate(vdo->thread_config.thread_count, struct vdo_thread, __func__, &vdo->threads); if (result != VDO_SUCCESS) { diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index e710f3c5a972..e7f4153e55e3 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -188,14 +188,23 @@ void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callb /* * Prepares the bio to perform IO with the specified buffer. May only be used on a VDO-allocated - * bio, as it assumes the bio wraps a 4k buffer that is 4k aligned, but there does not have to be a - * vio associated with the bio. + * bio, as it assumes the bio wraps a 4k-multiple buffer that is 4k aligned, but there does not + * have to be a vio associated with the bio. */ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, blk_opf_t bi_opf, physical_block_number_t pbn) { - int bvec_count, offset, len, i; + return vio_reset_bio_with_size(vio, data, vio->block_count * VDO_BLOCK_SIZE, + callback, bi_opf, pbn); +} + +int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback, + blk_opf_t bi_opf, physical_block_number_t pbn) +{ + int bvec_count, offset, i; struct bio *bio = vio->bio; + int vio_size = vio->block_count * VDO_BLOCK_SIZE; + int remaining; bio_reset(bio, bio->bi_bdev, bi_opf); vdo_set_bio_properties(bio, vio, callback, bi_opf, pbn); @@ -205,22 +214,21 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, bio->bi_ioprio = 0; bio->bi_io_vec = bio->bi_inline_vecs; bio->bi_max_vecs = vio->block_count + 1; - len = VDO_BLOCK_SIZE * vio->block_count; + if (VDO_ASSERT(size <= vio_size, "specified size %d is not greater than allocated %d", + size, vio_size) != VDO_SUCCESS) + size = vio_size; + vio->io_size = size; offset = offset_in_page(data); - bvec_count = DIV_ROUND_UP(offset + len, PAGE_SIZE); + bvec_count = DIV_ROUND_UP(offset + size, PAGE_SIZE); + remaining = size; - /* - * If we knew that data was always on one page, or contiguous pages, we wouldn't need the - * loop. But if we're using vmalloc, it's not impossible that the data is in different - * pages that can't be merged in bio_add_page... - */ - for (i = 0; (i < bvec_count) && (len > 0); i++) { + for (i = 0; (i < bvec_count) && (remaining > 0); i++) { struct page *page; int bytes_added; int bytes = PAGE_SIZE - offset; - if (bytes > len) - bytes = len; + if (bytes > remaining) + bytes = remaining; page = is_vmalloc_addr(data) ? vmalloc_to_page(data) : virt_to_page(data); bytes_added = bio_add_page(bio, page, bytes, offset); @@ -232,7 +240,7 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, } data += bytes; - len -= bytes; + remaining -= bytes; offset = 0; } @@ -301,6 +309,7 @@ void vio_record_metadata_io_error(struct vio *vio) * make_vio_pool() - Create a new vio pool. * @vdo: The vdo. * @pool_size: The number of vios in the pool. + * @block_count: The number of 4k blocks per vio. * @thread_id: The ID of the thread using this pool. * @vio_type: The type of vios in the pool. * @priority: The priority with which vios from the pool should be enqueued. @@ -309,13 +318,14 @@ void vio_record_metadata_io_error(struct vio *vio) * * Return: A success or error code. */ -int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, +int make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, thread_id_t thread_id, enum vio_type vio_type, enum vio_priority priority, void *context, struct vio_pool **pool_ptr) { struct vio_pool *pool; char *ptr; int result; + size_t per_vio_size = VDO_BLOCK_SIZE * block_count; result = vdo_allocate_extended(struct vio_pool, pool_size, struct pooled_vio, __func__, &pool); @@ -326,7 +336,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, INIT_LIST_HEAD(&pool->available); INIT_LIST_HEAD(&pool->busy); - result = vdo_allocate(pool_size * VDO_BLOCK_SIZE, char, + result = vdo_allocate(pool_size * per_vio_size, char, "VIO pool buffer", &pool->buffer); if (result != VDO_SUCCESS) { free_vio_pool(pool); @@ -334,10 +344,10 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, } ptr = pool->buffer; - for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += VDO_BLOCK_SIZE) { + for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += per_vio_size) { struct pooled_vio *pooled = &pool->vios[pool->size]; - result = allocate_vio_components(vdo, vio_type, priority, NULL, 1, ptr, + result = allocate_vio_components(vdo, vio_type, priority, NULL, block_count, ptr, &pooled->vio); if (result != VDO_SUCCESS) { free_vio_pool(pool); @@ -345,6 +355,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, } pooled->context = context; + pooled->pool = pool; list_add_tail(&pooled->pool_entry, &pool->available); } @@ -419,12 +430,13 @@ void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter) } /** - * return_vio_to_pool() - Return a vio to the pool - * @pool: The vio pool. + * return_vio_to_pool() - Return a vio to its pool * @vio: The pooled vio to return. */ -void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio) +void return_vio_to_pool(struct pooled_vio *vio) { + struct vio_pool *pool = vio->pool; + VDO_ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()), "vio pool entry returned on same thread as it was acquired"); diff --git a/drivers/md/dm-vdo/vio.h b/drivers/md/dm-vdo/vio.h index 3490e9f59b04..4bfcb21901f1 100644 --- a/drivers/md/dm-vdo/vio.h +++ b/drivers/md/dm-vdo/vio.h @@ -30,6 +30,8 @@ struct pooled_vio { void *context; /* The list entry used by the pool */ struct list_head pool_entry; + /* The pool this vio is allocated from */ + struct vio_pool *pool; }; /** @@ -123,6 +125,8 @@ void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callb int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, blk_opf_t bi_opf, physical_block_number_t pbn); +int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback, + blk_opf_t bi_opf, physical_block_number_t pbn); void update_vio_error_stats(struct vio *vio, const char *format, ...) __printf(2, 3); @@ -188,12 +192,13 @@ static inline struct pooled_vio *vio_as_pooled_vio(struct vio *vio) struct vio_pool; -int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, - enum vio_type vio_type, enum vio_priority priority, - void *context, struct vio_pool **pool_ptr); +int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, + thread_id_t thread_id, enum vio_type vio_type, + enum vio_priority priority, void *context, + struct vio_pool **pool_ptr); void free_vio_pool(struct vio_pool *pool); bool __must_check is_vio_pool_busy(struct vio_pool *pool); void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter); -void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio); +void return_vio_to_pool(struct pooled_vio *vio); #endif /* VIO_H */ diff --git a/drivers/md/dm-vdo/wait-queue.c b/drivers/md/dm-vdo/wait-queue.c index 6e1e739277ef..f81ed0cee2bf 100644 --- a/drivers/md/dm-vdo/wait-queue.c +++ b/drivers/md/dm-vdo/wait-queue.c @@ -34,7 +34,7 @@ void vdo_waitq_enqueue_waiter(struct vdo_wait_queue *waitq, struct vdo_waiter *w waitq->last_waiter->next_waiter = waiter; } - /* In both cases, the waiter we added to the ring becomes the last waiter. */ + /* In both cases, the waiter we added to the list becomes the last waiter. */ waitq->last_waiter = waiter; waitq->length += 1; } diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index e61855da6461..631a887b487c 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -122,7 +122,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); par = fec_read_parity(v, rsb, block_offset, &offset, - par_buf_offset, &buf, bio_prio(bio)); + par_buf_offset, &buf, bio->bi_ioprio); if (IS_ERR(par)) return PTR_ERR(par); @@ -164,7 +164,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, dm_bufio_release(buf); par = fec_read_parity(v, rsb, block_offset, &offset, - par_buf_offset, &buf, bio_prio(bio)); + par_buf_offset, &buf, bio->bi_ioprio); if (IS_ERR(par)) return PTR_ERR(par); } @@ -254,7 +254,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, bufio = v->bufio; } - bbuf = dm_bufio_read_with_ioprio(bufio, block, &buf, bio_prio(bio)); + bbuf = dm_bufio_read_with_ioprio(bufio, block, &buf, bio->bi_ioprio); if (IS_ERR(bbuf)) { DMWARN_LIMIT("%s: FEC %llu: read failed (%llu): %ld", v->data_dev->name, @@ -593,6 +593,10 @@ int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, (*argc)--; if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV)) { + if (v->fec->dev) { + ti->error = "FEC device already specified"; + return -EINVAL; + } r = dm_get_device(ti, arg_value, BLK_OPEN_READ, &v->fec->dev); if (r) { ti->error = "FEC device lookup failed"; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 47d595f6a76e..ed49bcbd224f 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -30,6 +30,7 @@ #define DM_VERITY_ENV_VAR_NAME "DM_VERITY_ERR_BLOCK_NR" #define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 +#define DM_VERITY_USE_BH_DEFAULT_BYTES 8192 #define DM_VERITY_MAX_CORRUPTED_ERRS 100 @@ -49,6 +50,15 @@ static unsigned int dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, 0644); +static unsigned int dm_verity_use_bh_bytes[4] = { + DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_NONE + DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_RT + DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_BE + 0 // IOPRIO_CLASS_IDLE +}; + +module_param_array_named(use_bh_bytes, dm_verity_use_bh_bytes, uint, NULL, 0644); + static DEFINE_STATIC_KEY_FALSE(use_bh_wq_enabled); /* Is at least one dm-verity instance using ahash_tfm instead of shash_tfm? */ @@ -311,7 +321,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { data = dm_bufio_get(v->bufio, hash_block, &buf); - if (data == NULL) { + if (IS_ERR_OR_NULL(data)) { /* * In tasklet and the hash was not in the bufio cache. * Return early and resume execution from a work-queue @@ -321,11 +331,27 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, } } else { data = dm_bufio_read_with_ioprio(v->bufio, hash_block, - &buf, bio_prio(bio)); + &buf, bio->bi_ioprio); } - if (IS_ERR(data)) - return PTR_ERR(data); + if (IS_ERR(data)) { + if (skip_unverified) + return 1; + r = PTR_ERR(data); + data = dm_bufio_new(v->bufio, hash_block, &buf); + if (IS_ERR(data)) + return r; + if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA, + hash_block, data) == 0) { + aux = dm_bufio_get_aux_data(buf); + aux->hash_verified = 1; + goto release_ok; + } else { + dm_bufio_release(buf); + dm_bufio_forget(v->bufio, hash_block); + return r; + } + } aux = dm_bufio_get_aux_data(buf); @@ -366,6 +392,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, } } +release_ok: data += offset; memcpy(want_digest, data, v->digest_size); r = 0; @@ -652,9 +679,17 @@ static void verity_bh_work(struct work_struct *w) verity_finish_io(io, errno_to_blk_status(err)); } +static inline bool verity_use_bh(unsigned int bytes, unsigned short ioprio) +{ + return ioprio <= IOPRIO_CLASS_IDLE && + bytes <= READ_ONCE(dm_verity_use_bh_bytes[ioprio]); +} + static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; + unsigned short ioprio = IOPRIO_PRIO_CLASS(bio->bi_ioprio); + unsigned int bytes = io->n_blocks << io->v->data_dev_block_bits; if (bio->bi_status && (!verity_fec_is_enabled(io->v) || @@ -664,9 +699,14 @@ static void verity_end_io(struct bio *bio) return; } - if (static_branch_unlikely(&use_bh_wq_enabled) && io->v->use_bh_wq) { - INIT_WORK(&io->bh_work, verity_bh_work); - queue_work(system_bh_wq, &io->bh_work); + if (static_branch_unlikely(&use_bh_wq_enabled) && io->v->use_bh_wq && + verity_use_bh(bytes, ioprio)) { + if (in_hardirq() || irqs_disabled()) { + INIT_WORK(&io->bh_work, verity_bh_work); + queue_work(system_bh_wq, &io->bh_work); + } else { + verity_bh_work(&io->bh_work); + } } else { INIT_WORK(&io->work, verity_work); queue_work(io->v->verify_wq, &io->work); @@ -789,13 +829,20 @@ static int verity_map(struct dm_target *ti, struct bio *bio) verity_fec_init_io(io); - verity_submit_prefetch(v, io, bio_prio(bio)); + verity_submit_prefetch(v, io, bio->bi_ioprio); submit_bio_noacct(bio); return DM_MAPIO_SUBMITTED; } +static void verity_postsuspend(struct dm_target *ti) +{ + struct dm_verity *v = ti->private; + flush_workqueue(v->verify_wq); + dm_bufio_client_reset(v->bufio); +} + /* * Status: V (valid) or C (corruption found) */ @@ -1073,6 +1120,9 @@ static int verity_alloc_most_once(struct dm_verity *v) { struct dm_target *ti = v->ti; + if (v->validated_blocks) + return 0; + /* the bitset can only handle INT_MAX blocks */ if (v->data_blocks > INT_MAX) { ti->error = "device too large to use check_at_most_once"; @@ -1096,6 +1146,9 @@ static int verity_alloc_zero_digest(struct dm_verity *v) struct dm_verity_io *io; u8 *zero_data; + if (v->zero_digest) + return 0; + v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL); if (!v->zero_digest) @@ -1530,7 +1583,7 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - /* Root hash signature is a optional parameter*/ + /* Root hash signature is an optional parameter */ r = verity_verify_root_hash(root_hash_digest_to_validate, strlen(root_hash_digest_to_validate), verify_args.sig, @@ -1761,11 +1814,12 @@ static struct target_type verity_target = { .name = "verity", /* Note: the LSMs depend on the singleton and immutable features */ .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE, - .version = {1, 10, 0}, + .version = {1, 11, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, .map = verity_map, + .postsuspend = verity_postsuspend, .status = verity_status, .prepare_ioctl = verity_prepare_ioctl, .iterate_devices = verity_iterate_devices, diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c index a9e2c6c0a33c..d5261a0e4232 100644 --- a/drivers/md/dm-verity-verify-sig.c +++ b/drivers/md/dm-verity-verify-sig.c @@ -71,9 +71,14 @@ int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, const char *arg_name) { struct dm_target *ti = v->ti; - int ret = 0; + int ret; const char *sig_key = NULL; + if (v->signature_key_desc) { + ti->error = DM_VERITY_VERIFY_ERR("root_hash_sig_key_desc already specified"); + return -EINVAL; + } + if (!*argc) { ti->error = DM_VERITY_VERIFY_ERR("Signature key not specified"); return -EINVAL; @@ -83,14 +88,18 @@ int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, (*argc)--; ret = verity_verify_get_sig_from_key(sig_key, sig_opts); - if (ret < 0) + if (ret < 0) { ti->error = DM_VERITY_VERIFY_ERR("Invalid key specified"); + return ret; + } v->signature_key_desc = kstrdup(sig_key, GFP_KERNEL); - if (!v->signature_key_desc) + if (!v->signature_key_desc) { + ti->error = DM_VERITY_VERIFY_ERR("Could not allocate memory for signature key"); return -ENOMEM; + } - return ret; + return 0; } /* diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 7ce8847b3404..d6a04a57472d 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -797,7 +797,7 @@ static void writecache_flush(struct dm_writecache *wc) bool need_flush_after_free; wc->uncommitted_blocks = 0; - del_timer(&wc->autocommit_timer); + timer_delete(&wc->autocommit_timer); if (list_empty(&wc->lru)) return; @@ -927,8 +927,8 @@ static void writecache_suspend(struct dm_target *ti) struct dm_writecache *wc = ti->private; bool flush_on_suspend; - del_timer_sync(&wc->autocommit_timer); - del_timer_sync(&wc->max_age_timer); + timer_delete_sync(&wc->autocommit_timer); + timer_delete_sync(&wc->max_age_timer); wc_lock(wc); writecache_flush(wc); diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 20edd3fabbab..4af78111d0b4 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -56,24 +56,31 @@ int dm_blk_report_zones(struct gendisk *disk, sector_t sector, { struct mapped_device *md = disk->private_data; struct dm_table *map; - int srcu_idx, ret; + struct dm_table *zone_revalidate_map = md->zone_revalidate_map; + int srcu_idx, ret = -EIO; + bool put_table = false; - if (!md->zone_revalidate_map) { - /* Regular user context */ + if (!zone_revalidate_map || md->revalidate_map_task != current) { + /* + * Regular user context or + * Zone revalidation during __bind() is in progress, but this + * call is from a different process + */ if (dm_suspended_md(md)) return -EAGAIN; map = dm_get_live_table(md, &srcu_idx); - if (!map) - return -EIO; + put_table = true; } else { /* Zone revalidation during __bind() */ - map = md->zone_revalidate_map; + map = zone_revalidate_map; } - ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data); + if (map) + ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, + data); - if (!md->zone_revalidate_map) + if (put_table) dm_put_live_table(md, srcu_idx); return ret; @@ -153,33 +160,36 @@ int dm_revalidate_zones(struct dm_table *t, struct request_queue *q) { struct mapped_device *md = t->md; struct gendisk *disk = md->disk; + unsigned int nr_zones = disk->nr_zones; int ret; if (!get_capacity(disk)) return 0; - /* Revalidate only if something changed. */ - if (!disk->nr_zones || disk->nr_zones != md->nr_zones) { - DMINFO("%s using %s zone append", - disk->disk_name, - queue_emulates_zone_append(q) ? "emulated" : "native"); - md->nr_zones = 0; - } - - if (md->nr_zones) + /* + * Do not revalidate if zone write plug resources have already + * been allocated. + */ + if (dm_has_zone_plugs(md)) return 0; + DMINFO("%s using %s zone append", disk->disk_name, + queue_emulates_zone_append(q) ? "emulated" : "native"); + /* * Our table is not live yet. So the call to dm_get_live_table() * in dm_blk_report_zones() will fail. Set a temporary pointer to * our table for dm_blk_report_zones() to use directly. */ md->zone_revalidate_map = t; + md->revalidate_map_task = current; ret = blk_revalidate_disk_zones(disk); + md->revalidate_map_task = NULL; md->zone_revalidate_map = NULL; if (ret) { DMERR("Revalidate zones failed %d", ret); + disk->nr_zones = nr_zones; return ret; } @@ -340,12 +350,8 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, * mapped device queue as needing zone append emulation. */ WARN_ON_ONCE(queue_is_mq(q)); - if (dm_table_supports_zone_append(t)) { - clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - } else { - set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + if (!dm_table_supports_zone_append(t)) lim->max_hw_zone_append_sectors = 0; - } /* * Determine the max open and max active zone limits for the mapped @@ -380,15 +386,28 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, lim->max_open_zones = 0; lim->max_active_zones = 0; lim->max_hw_zone_append_sectors = 0; + lim->max_zone_append_sectors = 0; lim->zone_write_granularity = 0; lim->chunk_sectors = 0; lim->features &= ~BLK_FEAT_ZONED; - clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - md->nr_zones = 0; - disk->nr_zones = 0; return 0; } + if (get_capacity(disk) && dm_has_zone_plugs(t->md)) { + if (q->limits.chunk_sectors != lim->chunk_sectors) { + DMWARN("%s: device has zone write plug resources. " + "Cannot change zone size", + disk->disk_name); + return -EINVAL; + } + if (lim->max_hw_zone_append_sectors != 0 && + !dm_table_is_wildcard(t)) { + DMWARN("%s: device has zone write plug resources. " + "New table must emulate zone append", + disk->disk_name); + return -EINVAL; + } + } /* * Warn once (when the capacity is not yet set) if the mapped device is * partially using zone resources of the target devices as that leads to @@ -408,6 +427,23 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, return 0; } +void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim) +{ + struct mapped_device *md = t->md; + + if (lim->features & BLK_FEAT_ZONED) { + if (dm_table_supports_zone_append(t)) + clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + else + set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + } else { + clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + md->nr_zones = 0; + md->disk->nr_zones = 0; + } +} + + /* * IO completion callback called from clone_endio(). */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 12ecf07a3841..240f6dab8dda 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1479,12 +1479,12 @@ static void setup_split_accounting(struct clone_info *ci, unsigned int len) static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, struct dm_target *ti, unsigned int num_bios, - unsigned *len, gfp_t gfp_flag) + unsigned *len) { struct bio *bio; - int try = (gfp_flag & GFP_NOWAIT) ? 0 : 1; + int try; - for (; try < 2; try++) { + for (try = 0; try < 2; try++) { int bio_nr; if (try && num_bios > 1) @@ -1508,8 +1508,7 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, } static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, - unsigned int num_bios, unsigned int *len, - gfp_t gfp_flag) + unsigned int num_bios, unsigned int *len) { struct bio_list blist = BIO_EMPTY_LIST; struct bio *clone; @@ -1526,7 +1525,7 @@ static unsigned int __send_duplicate_bios(struct clone_info *ci, struct dm_targe * Using alloc_multiple_bios(), even if num_bios is 1, to consistently * support allocating using GFP_NOWAIT with GFP_NOIO fallback. */ - alloc_multiple_bios(&blist, ci, ti, num_bios, len, gfp_flag); + alloc_multiple_bios(&blist, ci, ti, num_bios, len); while ((clone = bio_list_pop(&blist))) { if (num_bios > 1) dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO); @@ -1541,14 +1540,18 @@ static void __send_empty_flush(struct clone_info *ci) { struct dm_table *t = ci->map; struct bio flush_bio; + blk_opf_t opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + + if ((ci->io->orig_bio->bi_opf & (REQ_IDLE | REQ_SYNC)) == + (REQ_IDLE | REQ_SYNC)) + opf |= REQ_IDLE; /* * Use an on-stack bio for this, it's safe since we don't * need to reference it after submit. It's just used as * the basis for the clone(s). */ - bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, - REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC); + bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, opf); ci->bio = &flush_bio; ci->sector_count = 0; @@ -1564,7 +1567,7 @@ static void __send_empty_flush(struct clone_info *ci) atomic_add(ti->num_flush_bios, &ci->io->io_count); bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, - NULL, GFP_NOWAIT); + NULL); atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count); } } else { @@ -1612,7 +1615,7 @@ static void __send_abnormal_io(struct clone_info *ci, struct dm_target *ti, __max_io_len(ti, ci->sector, max_granularity, max_sectors)); atomic_add(num_bios, &ci->io->io_count); - bios = __send_duplicate_bios(ci, ti, num_bios, &len, GFP_NOIO); + bios = __send_duplicate_bios(ci, ti, num_bios, &len); /* * alloc_io() takes one extra reference for submission, so the * reference won't reach 0 without the following (+1) subtraction @@ -1746,6 +1749,9 @@ static blk_status_t __split_and_process_bio(struct clone_info *ci) ci->submit_as_polled = !!(ci->bio->bi_opf & REQ_POLLED); len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); + if (ci->bio->bi_opf & REQ_ATOMIC && len != ci->sector_count) + return BLK_STS_IOERR; + setup_split_accounting(ci, len); if (unlikely(ci->bio->bi_opf & REQ_NOWAIT)) { @@ -1849,7 +1855,7 @@ static blk_status_t __send_zone_reset_all_emulated(struct clone_info *ci, * not go crazy with the clone allocation. */ alloc_multiple_bios(&blist, ci, ti, min(nr_reset, 32), - NULL, GFP_NOIO); + NULL); } /* Get a clone and change it to a regular reset operation. */ @@ -1881,7 +1887,7 @@ static void __send_zone_reset_all_native(struct clone_info *ci, unsigned int bios; atomic_add(1, &ci->io->io_count); - bios = __send_duplicate_bios(ci, ti, 1, NULL, GFP_NOIO); + bios = __send_duplicate_bios(ci, ti, 1, NULL); atomic_sub(1 - bios, &ci->io->io_count); ci->sector_count = 0; @@ -1969,6 +1975,15 @@ static void dm_split_and_process_bio(struct mapped_device *md, /* Only support nowait for normal IO */ if (unlikely(bio->bi_opf & REQ_NOWAIT) && !is_abnormal) { + /* + * Don't support NOWAIT for FLUSH because it may allocate + * multiple bios and there's no easy way how to undo the + * allocations. + */ + if (bio->bi_opf & REQ_PREFLUSH) { + bio_wouldblock_error(bio); + return; + } io = alloc_io(md, bio, GFP_NOWAIT); if (unlikely(!io)) { /* Unable to do anything without dm_io. */ @@ -2406,21 +2421,35 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, struct queue_limits *limits) { struct dm_table *old_map; - sector_t size; + sector_t size, old_size; int ret; lockdep_assert_held(&md->suspend_lock); size = dm_table_get_size(t); + old_size = dm_get_size(md); + + if (!dm_table_supports_size_change(t, old_size, size)) { + old_map = ERR_PTR(-EINVAL); + goto out; + } + + set_capacity(md->disk, size); + + ret = dm_table_set_restrictions(t, md->queue, limits); + if (ret) { + set_capacity(md->disk, old_size); + old_map = ERR_PTR(ret); + goto out; + } + /* * Wipe any geometry if the size of the table changed. */ - if (size != dm_get_size(md)) + if (size != old_size) memset(&md->geometry, 0, sizeof(md->geometry)); - set_capacity(md->disk, size); - dm_table_event_callback(t, event_callback, md); if (dm_table_request_based(t)) { @@ -2438,10 +2467,10 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, * requests in the queue may refer to bio from the old bioset, * so you must walk through the queue to unprep. */ - if (!md->mempools) { + if (!md->mempools) md->mempools = t->mempools; - t->mempools = NULL; - } + else + dm_free_md_mempools(t->mempools); } else { /* * The md may already have mempools that need changing. @@ -2450,14 +2479,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, */ dm_free_md_mempools(md->mempools); md->mempools = t->mempools; - t->mempools = NULL; - } - - ret = dm_table_set_restrictions(t, md->queue, limits); - if (ret) { - old_map = ERR_PTR(ret); - goto out; } + t->mempools = NULL; old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); rcu_assign_pointer(md->map, (void *)t); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a0a8ff119815..245f52b59215 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -58,6 +58,7 @@ void dm_table_event_callback(struct dm_table *t, void (*fn)(void *), void *context); struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); bool dm_table_has_no_data_devices(struct dm_table *table); +bool dm_table_is_wildcard(struct dm_table *t); int dm_calculate_queue_limits(struct dm_table *table, struct queue_limits *limits); int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, @@ -72,6 +73,8 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); struct dm_target *dm_table_get_immutable_target(struct dm_table *t); struct dm_target *dm_table_get_wildcard_target(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); +bool dm_table_supports_size_change(struct dm_table *t, sector_t old_size, + sector_t new_size); void dm_lock_md_type(struct mapped_device *md); void dm_unlock_md_type(struct mapped_device *md); @@ -102,6 +105,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *lim); int dm_revalidate_zones(struct dm_table *t, struct request_queue *q); +void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim); void dm_zone_endio(struct dm_io *io, struct bio *clone); #ifdef CONFIG_BLK_DEV_ZONED int dm_blk_report_zones(struct gendisk *disk, sector_t sector, @@ -110,12 +114,14 @@ bool dm_is_zone_write(struct mapped_device *md, struct bio *bio); int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t, sector_t sector, unsigned int nr_zones, unsigned long *need_reset); +#define dm_has_zone_plugs(md) ((md)->disk->zone_wplugs_hash != NULL) #else #define dm_blk_report_zones NULL static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) { return false; } +#define dm_has_zone_plugs(md) false #endif /* diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 2e3087556adb..37b08f26c62f 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -29,8 +29,10 @@ #include <linux/buffer_head.h> #include <linux/seq_file.h> #include <trace/events/block.h> + #include "md.h" #include "md-bitmap.h" +#include "md-cluster.h" #define BITMAP_MAJOR_LO 3 /* version 4 insists the bitmap is in little-endian order @@ -426,8 +428,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, struct block_device *bdev; struct mddev *mddev = bitmap->mddev; struct bitmap_storage *store = &bitmap->storage; - unsigned int bitmap_limit = (bitmap->storage.file_pages - pg_index) << - PAGE_SHIFT; + unsigned long num_pages = bitmap->storage.file_pages; + unsigned int bitmap_limit = (num_pages - pg_index % num_pages) << PAGE_SHIFT; loff_t sboff, offset = mddev->bitmap_info.offset; sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE; unsigned int size = PAGE_SIZE; @@ -436,7 +438,7 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; /* we compare length (page numbers), not page offset. */ - if ((pg_index - store->sb_index) == store->file_pages - 1) { + if ((pg_index - store->sb_index) == num_pages - 1) { unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1); if (last_page_size == 0) @@ -682,7 +684,7 @@ static void bitmap_update_sb(void *data) return; if (!bitmap->storage.sb_page) /* no superblock */ return; - sb = kmap_atomic(bitmap->storage.sb_page); + sb = kmap_local_page(bitmap->storage.sb_page); sb->events = cpu_to_le64(bitmap->mddev->events); if (bitmap->mddev->events < bitmap->events_cleared) /* rocking back to read-only */ @@ -702,7 +704,7 @@ static void bitmap_update_sb(void *data) sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes); sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> bitmap_info.space); - kunmap_atomic(sb); + kunmap_local(sb); if (bitmap->storage.file) write_file_page(bitmap, bitmap->storage.sb_page, 1); @@ -717,7 +719,7 @@ static void bitmap_print_sb(struct bitmap *bitmap) if (!bitmap || !bitmap->storage.sb_page) return; - sb = kmap_atomic(bitmap->storage.sb_page); + sb = kmap_local_page(bitmap->storage.sb_page); pr_debug("%s: bitmap file superblock:\n", bmname(bitmap)); pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic)); pr_debug(" version: %u\n", le32_to_cpu(sb->version)); @@ -736,7 +738,7 @@ static void bitmap_print_sb(struct bitmap *bitmap) pr_debug(" sync size: %llu KB\n", (unsigned long long)le64_to_cpu(sb->sync_size)/2); pr_debug("max write behind: %u\n", le32_to_cpu(sb->write_behind)); - kunmap_atomic(sb); + kunmap_local(sb); } /* @@ -760,7 +762,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap) return -ENOMEM; bitmap->storage.sb_index = 0; - sb = kmap_atomic(bitmap->storage.sb_page); + sb = kmap_local_page(bitmap->storage.sb_page); sb->magic = cpu_to_le32(BITMAP_MAGIC); sb->version = cpu_to_le32(BITMAP_MAJOR_HI); @@ -768,7 +770,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap) chunksize = bitmap->mddev->bitmap_info.chunksize; BUG_ON(!chunksize); if (!is_power_of_2(chunksize)) { - kunmap_atomic(sb); + kunmap_local(sb); pr_warn("bitmap chunksize not a power of 2\n"); return -EINVAL; } @@ -803,7 +805,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap) sb->events_cleared = cpu_to_le64(bitmap->mddev->events); bitmap->mddev->bitmap_info.nodes = 0; - kunmap_atomic(sb); + kunmap_local(sb); return 0; } @@ -865,7 +867,7 @@ re_read: return err; err = -EINVAL; - sb = kmap_atomic(sb_page); + sb = kmap_local_page(sb_page); chunksize = le32_to_cpu(sb->chunksize); daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; @@ -932,7 +934,7 @@ re_read: err = 0; out: - kunmap_atomic(sb); + kunmap_local(sb); if (err == 0 && nodes && (bitmap->cluster_slot < 0)) { /* Assigning chunksize is required for "re_read" */ bitmap->mddev->bitmap_info.chunksize = chunksize; @@ -942,7 +944,7 @@ out: bmname(bitmap), err); goto out_no_sb; } - bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev); + bitmap->cluster_slot = bitmap->mddev->cluster_ops->slot_number(bitmap->mddev); goto re_read; } @@ -1161,12 +1163,12 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) bit = file_page_offset(&bitmap->storage, chunk); /* set the bit */ - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) set_bit(bit, kaddr); else set_bit_le(bit, kaddr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); pr_debug("set file bit %lu page %lu\n", bit, index); /* record page number so it gets flushed to disk when unplug occurs */ set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY); @@ -1190,12 +1192,12 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) if (!page) return; bit = file_page_offset(&bitmap->storage, chunk); - paddr = kmap_atomic(page); + paddr = kmap_local_page(page); if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) clear_bit(bit, paddr); else clear_bit_le(bit, paddr); - kunmap_atomic(paddr); + kunmap_local(paddr); if (!test_page_attr(bitmap, index - node_offset, BITMAP_PAGE_NEEDWRITE)) { set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_PENDING); bitmap->allclean = 0; @@ -1214,12 +1216,12 @@ static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) if (!page) return -EINVAL; bit = file_page_offset(&bitmap->storage, chunk); - paddr = kmap_atomic(page); + paddr = kmap_local_page(page); if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) set = test_bit(bit, paddr); else set = test_bit_le(bit, paddr); - kunmap_atomic(paddr); + kunmap_local(paddr); return set; } @@ -1388,9 +1390,9 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) * If the bitmap is out of date, dirty the whole page * and write it out */ - paddr = kmap_atomic(page); + paddr = kmap_local_page(page); memset(paddr + offset, 0xff, PAGE_SIZE - offset); - kunmap_atomic(paddr); + kunmap_local(paddr); filemap_write_page(bitmap, i, true); if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) { @@ -1406,12 +1408,12 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) void *paddr; bool was_set; - paddr = kmap_atomic(page); + paddr = kmap_local_page(page); if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) was_set = test_bit(bit, paddr); else was_set = test_bit_le(bit, paddr); - kunmap_atomic(paddr); + kunmap_local(paddr); if (was_set) { /* if the disk bit is set, set the memory bit */ @@ -1546,10 +1548,10 @@ static void bitmap_daemon_work(struct mddev *mddev) bitmap_super_t *sb; bitmap->need_sync = 0; if (bitmap->storage.filemap) { - sb = kmap_atomic(bitmap->storage.sb_page); + sb = kmap_local_page(bitmap->storage.sb_page); sb->events_cleared = cpu_to_le64(bitmap->events_cleared); - kunmap_atomic(sb); + kunmap_local(sb); set_page_attr(bitmap, 0, BITMAP_PAGE_NEEDWRITE); } @@ -2021,7 +2023,7 @@ static void md_bitmap_free(void *data) sysfs_put(bitmap->sysfs_can_clear); if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info && - bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev)) + bitmap->cluster_slot == bitmap->mddev->cluster_ops->slot_number(bitmap->mddev)) md_cluster_stop(bitmap->mddev); /* Shouldn't be needed - but just in case.... */ @@ -2229,7 +2231,7 @@ static int bitmap_load(struct mddev *mddev) mddev_create_serial_pool(mddev, rdev); if (mddev_is_clustered(mddev)) - md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); + mddev->cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); /* Clear out old bitmap info first: Either there is none, or we * are resuming after someone else has possibly changed things, @@ -2355,9 +2357,8 @@ static int bitmap_get_stats(void *data, struct md_bitmap_stats *stats) if (!bitmap) return -ENOENT; - if (bitmap->mddev->bitmap_info.external) - return -ENOENT; - if (!bitmap->storage.sb_page) /* no superblock */ + if (!bitmap->mddev->bitmap_info.external && + !bitmap->storage.sb_page) return -EINVAL; sb = kmap_local_page(bitmap->storage.sb_page); stats->sync_size = le64_to_cpu(sb->sync_size); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 6595f89becdb..94221d964d4f 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1166,7 +1166,7 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz struct dlm_lock_resource *bm_lockres; char str[64]; - if (i == md_cluster_ops->slot_number(mddev)) + if (i == slot_number(mddev)) continue; bitmap = mddev->bitmap_ops->get_from_slot(mddev, i); @@ -1216,7 +1216,7 @@ out: */ static int cluster_check_sync_size(struct mddev *mddev) { - int current_slot = md_cluster_ops->slot_number(mddev); + int current_slot = slot_number(mddev); int node_num = mddev->bitmap_info.nodes; struct dlm_lock_resource *bm_lockres; struct md_bitmap_stats stats; @@ -1612,7 +1612,14 @@ out: return err; } -static const struct md_cluster_operations cluster_ops = { +static struct md_cluster_operations cluster_ops = { + .head = { + .type = MD_CLUSTER, + .id = ID_CLUSTER, + .name = "cluster", + .owner = THIS_MODULE, + }, + .join = join, .leave = leave, .slot_number = slot_number, @@ -1642,13 +1649,12 @@ static int __init cluster_init(void) { pr_warn("md-cluster: support raid1 and raid10 (limited support)\n"); pr_info("Registering Cluster MD functions\n"); - register_md_cluster_operations(&cluster_ops, THIS_MODULE); - return 0; + return register_md_submodule(&cluster_ops.head); } static void cluster_exit(void) { - unregister_md_cluster_operations(); + unregister_md_submodule(&cluster_ops.head); } module_init(cluster_init); diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index 470bf18ffde5..8fb06d853173 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -10,6 +10,8 @@ struct mddev; struct md_rdev; struct md_cluster_operations { + struct md_submodule_head head; + int (*join)(struct mddev *mddev, int nodes); int (*leave)(struct mddev *mddev); int (*slot_number)(struct mddev *mddev); @@ -35,4 +37,8 @@ struct md_cluster_operations { void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors); }; +extern int md_setup_cluster(struct mddev *mddev, int nodes); +extern void md_cluster_stop(struct mddev *mddev); +extern void md_reload_sb(struct mddev *mddev, int raid_disk); + #endif /* _MD_CLUSTER_H */ diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 369aed044b40..5d9b08115375 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -5,7 +5,6 @@ */ #include <linux/blkdev.h> -#include <linux/raid/md_u.h> #include <linux/seq_file.h> #include <linux/module.h> #include <linux/slab.h> @@ -320,9 +319,13 @@ static void linear_quiesce(struct mddev *mddev, int state) } static struct md_personality linear_personality = { - .name = "linear", - .level = LEVEL_LINEAR, - .owner = THIS_MODULE, + .head = { + .type = MD_PERSONALITY, + .id = ID_LINEAR, + .name = "linear", + .owner = THIS_MODULE, + }, + .make_request = linear_make_request, .run = linear_run, .free = linear_free, @@ -335,12 +338,12 @@ static struct md_personality linear_personality = { static int __init linear_init(void) { - return register_md_personality(&linear_personality); + return register_md_submodule(&linear_personality.head); } static void linear_exit(void) { - unregister_md_personality(&linear_personality); + unregister_md_submodule(&linear_personality.head); } module_init(linear_init); diff --git a/drivers/md/md.c b/drivers/md/md.c index 465ca2af1e6e..9daa78c5fe33 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -79,16 +79,10 @@ static const char *action_name[NR_SYNC_ACTIONS] = { [ACTION_IDLE] = "idle", }; -/* pers_list is a list of registered personalities protected by pers_lock. */ -static LIST_HEAD(pers_list); -static DEFINE_SPINLOCK(pers_lock); +static DEFINE_XARRAY(md_submodule); static const struct kobj_type md_ktype; -const struct md_cluster_operations *md_cluster_ops; -EXPORT_SYMBOL(md_cluster_ops); -static struct module *md_cluster_mod; - static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; @@ -294,7 +288,7 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) static struct ctl_table_header *raid_table_header; -static struct ctl_table raid_table[] = { +static const struct ctl_table raid_table[] = { { .procname = "speed_limit_min", .data = &sysctl_speed_limit_min, @@ -629,6 +623,12 @@ static void __mddev_put(struct mddev *mddev) queue_work(md_misc_wq, &mddev->del_work); } +static void mddev_put_locked(struct mddev *mddev) +{ + if (atomic_dec_and_test(&mddev->active)) + __mddev_put(mddev); +} + void mddev_put(struct mddev *mddev) { if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) @@ -888,16 +888,40 @@ struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) } EXPORT_SYMBOL_GPL(md_find_rdev_rcu); -static struct md_personality *find_pers(int level, char *clevel) +static struct md_personality *get_pers(int level, char *clevel) { - struct md_personality *pers; - list_for_each_entry(pers, &pers_list, list) { - if (level != LEVEL_NONE && pers->level == level) - return pers; - if (strcmp(pers->name, clevel)==0) - return pers; + struct md_personality *ret = NULL; + struct md_submodule_head *head; + unsigned long i; + + xa_lock(&md_submodule); + xa_for_each(&md_submodule, i, head) { + if (head->type != MD_PERSONALITY) + continue; + if ((level != LEVEL_NONE && head->id == level) || + !strcmp(head->name, clevel)) { + if (try_module_get(head->owner)) + ret = (void *)head; + break; + } } - return NULL; + xa_unlock(&md_submodule); + + if (!ret) { + if (level != LEVEL_NONE) + pr_warn("md: personality for level %d is not loaded!\n", + level); + else + pr_warn("md: personality for level %s is not loaded!\n", + clevel); + } + + return ret; +} + +static void put_pers(struct md_personality *pers) +{ + module_put(pers->head.owner); } /* return the offset of the super block in 512byte sectors */ @@ -1180,7 +1204,7 @@ int md_check_no_bitmap(struct mddev *mddev) if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) return 0; pr_warn("%s: bitmaps are not supported for %s\n", - mdname(mddev), mddev->pers->name); + mdname(mddev), mddev->pers->head.name); return 1; } EXPORT_SYMBOL(md_check_no_bitmap); @@ -1748,7 +1772,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ count <<= sb->bblog_shift; if (bb + 1 == 0) break; - if (badblocks_set(&rdev->badblocks, sector, count, 1)) + if (!badblocks_set(&rdev->badblocks, sector, count, 1)) return -EINVAL; } } else if (sb->bblog_offset != 0) @@ -2359,19 +2383,6 @@ int md_integrity_register(struct mddev *mddev) return 0; /* shouldn't register */ pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); - if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || - (mddev->level != 1 && mddev->level != 10 && - bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) { - /* - * No need to handle the failure of bioset_integrity_create, - * because the function is called by md_run() -> pers->run(), - * md_run calls bioset_exit -> bioset_integrity_free in case - * of failure case. - */ - pr_err("md: failed to create integrity pool for %s\n", - mdname(mddev)); - return -EINVAL; - } return 0; } EXPORT_SYMBOL(md_integrity_register); @@ -2639,11 +2650,11 @@ repeat: force_change = 1; if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) nospares = 1; - ret = md_cluster_ops->metadata_update_start(mddev); + ret = mddev->cluster_ops->metadata_update_start(mddev); /* Has someone else has updated the sb */ if (!does_sb_need_changing(mddev)) { if (ret == 0) - md_cluster_ops->metadata_update_cancel(mddev); + mddev->cluster_ops->metadata_update_cancel(mddev); bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)); @@ -2783,7 +2794,7 @@ rewrite: /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ if (mddev_is_clustered(mddev) && ret == 0) - md_cluster_ops->metadata_update_finish(mddev); + mddev->cluster_ops->metadata_update_finish(mddev); if (mddev->in_sync != sync_req || !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), @@ -2942,7 +2953,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) else { err = 0; if (mddev_is_clustered(mddev)) - err = md_cluster_ops->remove_disk(mddev, rdev); + err = mddev->cluster_ops->remove_disk(mddev, rdev); if (err == 0) { md_kick_rdev_from_array(rdev); @@ -3052,7 +3063,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) * by this node eventually */ if (!mddev_is_clustered(rdev->mddev) || - (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) { + (err = mddev->cluster_ops->gather_bitmaps(rdev)) == 0) { clear_bit(Faulty, &rdev->flags); err = add_bound_rdev(rdev); } @@ -3860,7 +3871,7 @@ level_show(struct mddev *mddev, char *page) spin_lock(&mddev->lock); p = mddev->pers; if (p) - ret = sprintf(page, "%s\n", p->name); + ret = sprintf(page, "%s\n", p->head.name); else if (mddev->clevel[0]) ret = sprintf(page, "%s\n", mddev->clevel); else if (mddev->level != LEVEL_NONE) @@ -3917,7 +3928,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) rv = -EINVAL; if (!mddev->pers->quiesce) { pr_warn("md: %s: %s does not support online personality change\n", - mdname(mddev), mddev->pers->name); + mdname(mddev), mddev->pers->head.name); goto out_unlock; } @@ -3931,24 +3942,20 @@ level_store(struct mddev *mddev, const char *buf, size_t len) if (request_module("md-%s", clevel) != 0) request_module("md-level-%s", clevel); - spin_lock(&pers_lock); - pers = find_pers(level, clevel); - if (!pers || !try_module_get(pers->owner)) { - spin_unlock(&pers_lock); - pr_warn("md: personality %s not loaded\n", clevel); + pers = get_pers(level, clevel); + if (!pers) { rv = -EINVAL; goto out_unlock; } - spin_unlock(&pers_lock); if (pers == mddev->pers) { /* Nothing to do! */ - module_put(pers->owner); + put_pers(pers); rv = len; goto out_unlock; } if (!pers->takeover) { - module_put(pers->owner); + put_pers(pers); pr_warn("md: %s: %s does not support personality takeover\n", mdname(mddev), clevel); rv = -EINVAL; @@ -3969,7 +3976,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->raid_disks -= mddev->delta_disks; mddev->delta_disks = 0; mddev->reshape_backwards = 0; - module_put(pers->owner); + put_pers(pers); pr_warn("md: %s: %s would not accept array\n", mdname(mddev), clevel); rv = PTR_ERR(priv); @@ -3984,7 +3991,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) oldpriv = mddev->private; mddev->pers = pers; mddev->private = priv; - strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); mddev->level = mddev->new_level; mddev->layout = mddev->new_layout; mddev->chunk_sectors = mddev->new_chunk_sectors; @@ -4026,7 +4033,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->to_remove = &md_redundancy_group; } - module_put(oldpers->owner); + put_pers(oldpers); rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) @@ -4057,7 +4064,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) * it must always be in_sync */ mddev->in_sync = 1; - del_timer_sync(&mddev->safemode_timer); + timer_delete_sync(&mddev->safemode_timer); } pers->run(mddev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); @@ -5584,7 +5591,7 @@ __ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show, static ssize_t serialize_policy_show(struct mddev *mddev, char *page) { - if (mddev->pers == NULL || (mddev->pers->level != 1)) + if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) return sprintf(page, "n/a\n"); else return sprintf(page, "%d\n", mddev->serialize_policy); @@ -5610,7 +5617,7 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) err = mddev_suspend_and_lock(mddev); if (err) return err; - if (mddev->pers == NULL || (mddev->pers->level != 1)) { + if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) { pr_err("md: serialize_policy is only effective for raid1\n"); err = -EINVAL; goto unlock; @@ -6096,30 +6103,21 @@ int md_run(struct mddev *mddev) goto exit_sync_set; } - spin_lock(&pers_lock); - pers = find_pers(mddev->level, mddev->clevel); - if (!pers || !try_module_get(pers->owner)) { - spin_unlock(&pers_lock); - if (mddev->level != LEVEL_NONE) - pr_warn("md: personality for level %d is not loaded!\n", - mddev->level); - else - pr_warn("md: personality for level %s is not loaded!\n", - mddev->clevel); + pers = get_pers(mddev->level, mddev->clevel); + if (!pers) { err = -EINVAL; goto abort; } - spin_unlock(&pers_lock); - if (mddev->level != pers->level) { - mddev->level = pers->level; - mddev->new_level = pers->level; + if (mddev->level != pers->head.id) { + mddev->level = pers->head.id; + mddev->new_level = pers->head.id; } - strscpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + strscpy(mddev->clevel, pers->head.name, sizeof(mddev->clevel)); if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ - module_put(pers->owner); + put_pers(pers); err = -EINVAL; goto abort; } @@ -6246,7 +6244,7 @@ bitmap_abort: if (mddev->private) pers->free(mddev, mddev->private); mddev->private = NULL; - module_put(pers->owner); + put_pers(pers); mddev->bitmap_ops->destroy(mddev); abort: bioset_exit(&mddev->io_clone_set); @@ -6407,7 +6405,7 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { - del_timer_sync(&mddev->safemode_timer); + timer_delete_sync(&mddev->safemode_timer); if (mddev->pers && mddev->pers->quiesce) { mddev->pers->quiesce(mddev, 1); @@ -6467,7 +6465,7 @@ static void __md_stop(struct mddev *mddev) mddev->private = NULL; if (pers->sync_request && mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; - module_put(pers->owner); + put_pers(pers); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); bioset_exit(&mddev->bio_set); @@ -6983,7 +6981,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) set_bit(Candidate, &rdev->flags); else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { /* --add initiated by this node */ - err = md_cluster_ops->add_new_disk(mddev, rdev); + err = mddev->cluster_ops->add_new_disk(mddev, rdev); if (err) { export_rdev(rdev, mddev); return err; @@ -7000,14 +6998,14 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) if (mddev_is_clustered(mddev)) { if (info->state & (1 << MD_DISK_CANDIDATE)) { if (!err) { - err = md_cluster_ops->new_disk_ack(mddev, - err == 0); + err = mddev->cluster_ops->new_disk_ack( + mddev, err == 0); if (err) md_kick_rdev_from_array(rdev); } } else { if (err) - md_cluster_ops->add_new_disk_cancel(mddev); + mddev->cluster_ops->add_new_disk_cancel(mddev); else err = add_bound_rdev(rdev); } @@ -7087,10 +7085,9 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) goto busy; kick_rdev: - if (mddev_is_clustered(mddev)) { - if (md_cluster_ops->remove_disk(mddev, rdev)) - goto busy; - } + if (mddev_is_clustered(mddev) && + mddev->cluster_ops->remove_disk(mddev, rdev)) + goto busy; md_kick_rdev_from_array(rdev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); @@ -7393,7 +7390,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) rv = mddev->pers->resize(mddev, num_sectors); if (!rv) { if (mddev_is_clustered(mddev)) - md_cluster_ops->update_size(mddev, old_dev_sectors); + mddev->cluster_ops->update_size(mddev, old_dev_sectors); else if (!mddev_is_dm(mddev)) set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); @@ -7441,6 +7438,28 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) return rv; } +static int get_cluster_ops(struct mddev *mddev) +{ + xa_lock(&md_submodule); + mddev->cluster_ops = xa_load(&md_submodule, ID_CLUSTER); + if (mddev->cluster_ops && + !try_module_get(mddev->cluster_ops->head.owner)) + mddev->cluster_ops = NULL; + xa_unlock(&md_submodule); + + return mddev->cluster_ops == NULL ? -ENOENT : 0; +} + +static void put_cluster_ops(struct mddev *mddev) +{ + if (!mddev->cluster_ops) + return; + + mddev->cluster_ops->leave(mddev); + module_put(mddev->cluster_ops->head.owner); + mddev->cluster_ops = NULL; +} + /* * update_array_info is used to change the configuration of an * on-line array. @@ -7549,16 +7568,15 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) if (mddev->bitmap_info.nodes) { /* hold PW on all the bitmap lock */ - if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { + if (mddev->cluster_ops->lock_all_bitmaps(mddev) <= 0) { pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); rv = -EPERM; - md_cluster_ops->unlock_all_bitmaps(mddev); + mddev->cluster_ops->unlock_all_bitmaps(mddev); goto err; } mddev->bitmap_info.nodes = 0; - md_cluster_ops->leave(mddev); - module_put(md_cluster_mod); + put_cluster_ops(mddev); mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } mddev->bitmap_ops->destroy(mddev); @@ -7842,7 +7860,7 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, case CLUSTERED_DISK_NACK: if (mddev_is_clustered(mddev)) - md_cluster_ops->new_disk_ack(mddev, false); + mddev->cluster_ops->new_disk_ack(mddev, false); else err = -EINVAL; goto unlock; @@ -8124,7 +8142,8 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev) return; mddev->pers->error_handler(mddev, rdev); - if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR) + if (mddev->pers->head.id == ID_RAID0 || + mddev->pers->head.id == ID_LINEAR) return; if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) @@ -8162,14 +8181,17 @@ static void status_unused(struct seq_file *seq) static void status_personalities(struct seq_file *seq) { - struct md_personality *pers; + struct md_submodule_head *head; + unsigned long i; seq_puts(seq, "Personalities : "); - spin_lock(&pers_lock); - list_for_each_entry(pers, &pers_list, list) - seq_printf(seq, "[%s] ", pers->name); - spin_unlock(&pers_lock); + xa_lock(&md_submodule); + xa_for_each(&md_submodule, i, head) + if (head->type == MD_PERSONALITY) + seq_printf(seq, "[%s] ", head->name); + xa_unlock(&md_submodule); + seq_puts(seq, "\n"); } @@ -8392,7 +8414,7 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, " (read-only)"); if (mddev->ro == MD_AUTO_READ) seq_printf(seq, " (auto-read-only)"); - seq_printf(seq, " %s", mddev->pers->name); + seq_printf(seq, " %s", mddev->pers->head.name); } else { seq_printf(seq, "inactive"); } @@ -8461,9 +8483,7 @@ static int md_seq_show(struct seq_file *seq, void *v) if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs)) status_unused(seq); - if (atomic_dec_and_test(&mddev->active)) - __mddev_put(mddev); - + mddev_put_locked(mddev); return 0; } @@ -8514,67 +8534,34 @@ static const struct proc_ops mdstat_proc_ops = { .proc_poll = mdstat_poll, }; -int register_md_personality(struct md_personality *p) -{ - pr_debug("md: %s personality registered for level %d\n", - p->name, p->level); - spin_lock(&pers_lock); - list_add_tail(&p->list, &pers_list); - spin_unlock(&pers_lock); - return 0; -} -EXPORT_SYMBOL(register_md_personality); - -int unregister_md_personality(struct md_personality *p) +int register_md_submodule(struct md_submodule_head *msh) { - pr_debug("md: %s personality unregistered\n", p->name); - spin_lock(&pers_lock); - list_del_init(&p->list); - spin_unlock(&pers_lock); - return 0; + return xa_insert(&md_submodule, msh->id, msh, GFP_KERNEL); } -EXPORT_SYMBOL(unregister_md_personality); +EXPORT_SYMBOL_GPL(register_md_submodule); -int register_md_cluster_operations(const struct md_cluster_operations *ops, - struct module *module) +void unregister_md_submodule(struct md_submodule_head *msh) { - int ret = 0; - spin_lock(&pers_lock); - if (md_cluster_ops != NULL) - ret = -EALREADY; - else { - md_cluster_ops = ops; - md_cluster_mod = module; - } - spin_unlock(&pers_lock); - return ret; + xa_erase(&md_submodule, msh->id); } -EXPORT_SYMBOL(register_md_cluster_operations); - -int unregister_md_cluster_operations(void) -{ - spin_lock(&pers_lock); - md_cluster_ops = NULL; - spin_unlock(&pers_lock); - return 0; -} -EXPORT_SYMBOL(unregister_md_cluster_operations); +EXPORT_SYMBOL_GPL(unregister_md_submodule); int md_setup_cluster(struct mddev *mddev, int nodes) { - int ret; - if (!md_cluster_ops) + int ret = get_cluster_ops(mddev); + + if (ret) { request_module("md-cluster"); - spin_lock(&pers_lock); + ret = get_cluster_ops(mddev); + } + /* ensure module won't be unloaded */ - if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { + if (ret) { pr_warn("can't find md-cluster module or get its reference.\n"); - spin_unlock(&pers_lock); - return -ENOENT; + return ret; } - spin_unlock(&pers_lock); - ret = md_cluster_ops->join(mddev, nodes); + ret = mddev->cluster_ops->join(mddev, nodes); if (!ret) mddev->safemode_delay = 0; return ret; @@ -8582,10 +8569,7 @@ int md_setup_cluster(struct mddev *mddev, int nodes) void md_cluster_stop(struct mddev *mddev) { - if (!md_cluster_ops) - return; - md_cluster_ops->leave(mddev); - module_put(md_cluster_mod); + put_cluster_ops(mddev); } static int is_mddev_idle(struct mddev *mddev, int init) @@ -8978,7 +8962,7 @@ void md_do_sync(struct md_thread *thread) } if (mddev_is_clustered(mddev)) { - ret = md_cluster_ops->resync_start(mddev); + ret = mddev->cluster_ops->resync_start(mddev); if (ret) goto skip; @@ -9005,7 +8989,7 @@ void md_do_sync(struct md_thread *thread) * */ if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_start_notify(mddev); + mddev->cluster_ops->resync_start_notify(mddev); do { int mddev2_minor = -1; mddev->curr_resync = MD_RESYNC_DELAYED; @@ -9460,6 +9444,13 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) return true; } + /* Check if resync is in progress. */ + if (mddev->recovery_cp < MaxSector) { + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + return true; + } + /* * Remove any failed drives, then add spares if possible. Spares are * also removed and re-added, to allow the personality to fail the @@ -9476,13 +9467,6 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) return true; } - /* Check if recovery is in progress. */ - if (mddev->recovery_cp < MaxSector) { - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - return true; - } - /* Delay to choose resync/check/repair in md_do_sync(). */ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) return true; @@ -9789,7 +9773,7 @@ void md_reap_sync_thread(struct mddev *mddev) * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by * clustered raid */ if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) - md_cluster_ops->resync_finish(mddev); + mddev->cluster_ops->resync_finish(mddev); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); @@ -9797,13 +9781,13 @@ void md_reap_sync_thread(struct mddev *mddev) clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); /* - * We call md_cluster_ops->update_size here because sync_size could + * We call mddev->cluster_ops->update_size here because sync_size could * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, * so it is time to update size across cluster. */ if (mddev_is_clustered(mddev) && is_reshaped && !test_bit(MD_CLOSING, &mddev->flags)) - md_cluster_ops->update_size(mddev, old_dev_sectors); + mddev->cluster_ops->update_size(mddev, old_dev_sectors); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); sysfs_notify_dirent_safe(mddev->sysfs_completed); @@ -9841,12 +9825,11 @@ EXPORT_SYMBOL(md_finish_reshape); /* Bad block management */ -/* Returns 1 on success, 0 on failure */ -int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int is_new) +/* Returns true on success, false on failure */ +bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) { struct mddev *mddev = rdev->mddev; - int rv; /* * Recording new badblocks for faulty rdev will force unnecessary @@ -9856,50 +9839,51 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, * avoid it. */ if (test_bit(Faulty, &rdev->flags)) - return 1; + return true; if (is_new) s += rdev->new_data_offset; else s += rdev->data_offset; - rv = badblocks_set(&rdev->badblocks, s, sectors, 0); - if (rv == 0) { - /* Make sure they get written out promptly */ - if (test_bit(ExternalBbl, &rdev->flags)) - sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); - sysfs_notify_dirent_safe(rdev->sysfs_state); - set_mask_bits(&mddev->sb_flags, 0, - BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); - md_wakeup_thread(rdev->mddev->thread); - return 1; - } else - return 0; + + if (!badblocks_set(&rdev->badblocks, s, sectors, 0)) + return false; + + /* Make sure they get written out promptly */ + if (test_bit(ExternalBbl, &rdev->flags)) + sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); + sysfs_notify_dirent_safe(rdev->sysfs_state); + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); + md_wakeup_thread(rdev->mddev->thread); + return true; } EXPORT_SYMBOL_GPL(rdev_set_badblocks); -int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int is_new) +void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new) { - int rv; if (is_new) s += rdev->new_data_offset; else s += rdev->data_offset; - rv = badblocks_clear(&rdev->badblocks, s, sectors); - if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) + + if (!badblocks_clear(&rdev->badblocks, s, sectors)) + return; + + if (test_bit(ExternalBbl, &rdev->flags)) sysfs_notify_dirent_safe(rdev->sysfs_badblocks); - return rv; } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); static int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { - struct mddev *mddev, *n; + struct mddev *mddev; int need_delay = 0; spin_lock(&all_mddevs_lock); - list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { + list_for_each_entry(mddev, &all_mddevs, all_mddevs) { if (!mddev_get(mddev)) continue; spin_unlock(&all_mddevs_lock); @@ -9911,8 +9895,8 @@ static int md_notify_reboot(struct notifier_block *this, mddev_unlock(mddev); } need_delay = 1; - mddev_put(mddev); spin_lock(&all_mddevs_lock); + mddev_put_locked(mddev); } spin_unlock(&all_mddevs_lock); @@ -10029,7 +10013,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) if (rdev2->raid_disk == -1 && role != MD_DISK_ROLE_SPARE && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && - !md_cluster_ops->resync_status_get(mddev)) { + !mddev->cluster_ops->resync_status_get(mddev)) { /* * -1 to make raid1_add_disk() set conf->fullsync * to 1. This could avoid skipping sync when the @@ -10245,7 +10229,7 @@ void md_autostart_arrays(int part) static __exit void md_exit(void) { - struct mddev *mddev, *n; + struct mddev *mddev; int delay = 1; unregister_blkdev(MD_MAJOR,"md"); @@ -10266,7 +10250,7 @@ static __exit void md_exit(void) remove_proc_entry("mdstat", NULL); spin_lock(&all_mddevs_lock); - list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { + list_for_each_entry(mddev, &all_mddevs, all_mddevs) { if (!mddev_get(mddev)) continue; spin_unlock(&all_mddevs_lock); @@ -10278,8 +10262,8 @@ static __exit void md_exit(void) * the mddev for destruction by a workqueue, and the * destroy_workqueue() below will wait for that to complete. */ - mddev_put(mddev); spin_lock(&all_mddevs_lock); + mddev_put_locked(mddev); } spin_unlock(&all_mddevs_lock); diff --git a/drivers/md/md.h b/drivers/md/md.h index def808064ad8..1cf00a04bcdd 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -18,11 +18,37 @@ #include <linux/timer.h> #include <linux/wait.h> #include <linux/workqueue.h> +#include <linux/raid/md_u.h> #include <trace/events/block.h> -#include "md-cluster.h" #define MaxSector (~(sector_t)0) +enum md_submodule_type { + MD_PERSONALITY = 0, + MD_CLUSTER, + MD_BITMAP, /* TODO */ +}; + +enum md_submodule_id { + ID_LINEAR = LEVEL_LINEAR, + ID_RAID0 = 0, + ID_RAID1 = 1, + ID_RAID4 = 4, + ID_RAID5 = 5, + ID_RAID6 = 6, + ID_RAID10 = 10, + ID_CLUSTER, + ID_BITMAP, /* TODO */ + ID_LLBITMAP, /* TODO */ +}; + +struct md_submodule_head { + enum md_submodule_type type; + enum md_submodule_id id; + const char *name; + struct module *owner; +}; + /* * These flags should really be called "NO_RETRY" rather than * "FAILFAST" because they don't make any promise about time lapse, @@ -266,8 +292,8 @@ enum flag_bits { Nonrot, /* non-rotational device (SSD) */ }; -static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) +static inline int is_badblock(struct md_rdev *rdev, sector_t s, sector_t sectors, + sector_t *first_bad, sector_t *bad_sectors) { if (unlikely(rdev->badblocks.count)) { int rv = badblocks_check(&rdev->badblocks, rdev->data_offset + s, @@ -284,16 +310,17 @@ static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s, int sectors) { sector_t first_bad; - int bad_sectors; + sector_t bad_sectors; return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors); } -extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int is_new); -extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, - int is_new); +extern bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new); +extern void rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, + int is_new); struct md_cluster_info; +struct md_cluster_operations; /** * enum mddev_flags - md device flags. @@ -576,6 +603,7 @@ struct mddev { mempool_t *serial_info_pool; void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; + struct md_cluster_operations *cluster_ops; unsigned int good_device_nr; /* good device num within cluster raid */ unsigned int noio_flag; /* for memalloc scope API */ @@ -699,10 +727,8 @@ static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors) struct md_personality { - char *name; - int level; - struct list_head list; - struct module *owner; + struct md_submodule_head head; + bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio); /* * start up works that do NOT require md_thread. tasks that @@ -843,13 +869,9 @@ static inline void safe_put_page(struct page *p) if (p) put_page(p); } -extern int register_md_personality(struct md_personality *p); -extern int unregister_md_personality(struct md_personality *p); -extern int register_md_cluster_operations(const struct md_cluster_operations *ops, - struct module *module); -extern int unregister_md_cluster_operations(void); -extern int md_setup_cluster(struct mddev *mddev, int nodes); -extern void md_cluster_stop(struct mddev *mddev); +int register_md_submodule(struct md_submodule_head *msh); +void unregister_md_submodule(struct md_submodule_head *msh); + extern struct md_thread *md_register_thread( void (*run)(struct md_thread *thread), struct mddev *mddev, @@ -906,7 +928,6 @@ extern void md_idle_sync_thread(struct mddev *mddev); extern void md_frozen_sync_thread(struct mddev *mddev); extern void md_unfrozen_sync_thread(struct mddev *mddev); -extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev); extern void mddev_destroy_serial_pool(struct mddev *mddev, @@ -928,7 +949,6 @@ static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) } } -extern const struct md_cluster_operations *md_cluster_ops; static inline int mddev_is_clustered(struct mddev *mddev) { return mddev->cluster_info && mddev->bitmap_info.nodes > 1; diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig index f4f948b0e173..dbb97a7233ab 100644 --- a/drivers/md/persistent-data/Kconfig +++ b/drivers/md/persistent-data/Kconfig @@ -2,7 +2,7 @@ config DM_PERSISTENT_DATA tristate depends on BLK_DEV_DM - select LIBCRC32C + select CRC32 select DM_BUFIO help Library providing immutable on-disk data structure support for diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index c7ba4e6cbbc7..98c745d90f48 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -13,6 +13,7 @@ #include <linux/export.h> #include <linux/mutex.h> #include <linux/hash.h> +#include <linux/rbtree.h> #include <linux/slab.h> #include <linux/device-mapper.h> @@ -77,7 +78,7 @@ static void prefetch_issue(struct prefetch_set *p, struct dm_block_manager *bm) /*----------------------------------------------------------------*/ struct shadow_info { - struct hlist_node hlist; + struct rb_node node; dm_block_t where; }; @@ -95,7 +96,7 @@ struct dm_transaction_manager { struct dm_space_map *sm; spinlock_t lock; - struct hlist_head buckets[DM_HASH_SIZE]; + struct rb_root buckets[DM_HASH_SIZE]; struct prefetch_set prefetches; }; @@ -106,14 +107,22 @@ static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b) { int r = 0; unsigned int bucket = dm_hash_block(b, DM_HASH_MASK); - struct shadow_info *si; + struct rb_node **node; spin_lock(&tm->lock); - hlist_for_each_entry(si, tm->buckets + bucket, hlist) - if (si->where == b) { + node = &tm->buckets[bucket].rb_node; + while (*node) { + struct shadow_info *si = + rb_entry(*node, struct shadow_info, node); + if (b == si->where) { r = 1; break; } + if (b < si->where) + node = &si->node.rb_left; + else + node = &si->node.rb_right; + } spin_unlock(&tm->lock); return r; @@ -130,30 +139,41 @@ static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b) si = kmalloc(sizeof(*si), GFP_NOIO); if (si) { + struct rb_node **node, *parent; si->where = b; bucket = dm_hash_block(b, DM_HASH_MASK); + spin_lock(&tm->lock); - hlist_add_head(&si->hlist, tm->buckets + bucket); + node = &tm->buckets[bucket].rb_node; + parent = NULL; + while (*node) { + struct shadow_info *si = + rb_entry(*node, struct shadow_info, node); + parent = *node; + if (b < si->where) + node = &si->node.rb_left; + else + node = &si->node.rb_right; + } + rb_link_node(&si->node, parent, node); + rb_insert_color(&si->node, &tm->buckets[bucket]); spin_unlock(&tm->lock); } } static void wipe_shadow_table(struct dm_transaction_manager *tm) { - struct shadow_info *si; - struct hlist_node *tmp; - struct hlist_head *bucket; - int i; + unsigned int i; spin_lock(&tm->lock); for (i = 0; i < DM_HASH_SIZE; i++) { - bucket = tm->buckets + i; - hlist_for_each_entry_safe(si, tmp, bucket, hlist) + while (!RB_EMPTY_ROOT(&tm->buckets[i])) { + struct shadow_info *si = + rb_entry(tm->buckets[i].rb_node, struct shadow_info, node); + rb_erase(&si->node, &tm->buckets[i]); kfree(si); - - INIT_HLIST_HEAD(bucket); + } } - spin_unlock(&tm->lock); } @@ -162,7 +182,7 @@ static void wipe_shadow_table(struct dm_transaction_manager *tm) static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm, struct dm_space_map *sm) { - int i; + unsigned int i; struct dm_transaction_manager *tm; tm = kmalloc(sizeof(*tm), GFP_KERNEL); @@ -176,7 +196,7 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm, spin_lock_init(&tm->lock); for (i = 0; i < DM_HASH_SIZE; i++) - INIT_HLIST_HEAD(tm->buckets + i); + tm->buckets[i] = RB_ROOT; prefetch_init(&tm->prefetches); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index e8802309ed60..d8f639f4ae12 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -384,7 +384,7 @@ static int raid0_set_limits(struct mddev *mddev) lim.max_write_zeroes_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * mddev->raid_disks; - lim.features |= BLK_FEAT_ATOMIC_WRITES_STACKED; + lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) return err; @@ -809,9 +809,13 @@ static void raid0_quiesce(struct mddev *mddev, int quiesce) static struct md_personality raid0_personality= { - .name = "raid0", - .level = 0, - .owner = THIS_MODULE, + .head = { + .type = MD_PERSONALITY, + .id = ID_RAID0, + .name = "raid0", + .owner = THIS_MODULE, + }, + .make_request = raid0_make_request, .run = raid0_run, .free = raid0_free, @@ -822,14 +826,14 @@ static struct md_personality raid0_personality= .error_handler = raid0_error, }; -static int __init raid0_init (void) +static int __init raid0_init(void) { - return register_md_personality (&raid0_personality); + return register_md_submodule(&raid0_personality.head); } -static void raid0_exit (void) +static void __exit raid0_exit(void) { - unregister_md_personality (&raid0_personality); + unregister_md_submodule(&raid0_personality.head); } module_init(raid0_init); diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 4378d3250bd7..b8b3a9069701 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -247,7 +247,7 @@ static inline int raid1_check_read_range(struct md_rdev *rdev, sector_t this_sector, int *len) { sector_t first_bad; - int bad_sectors; + sector_t bad_sectors; /* no bad block overlap */ if (!is_badblock(rdev, this_sector, *len, &first_bad, &bad_sectors)) @@ -287,9 +287,19 @@ static inline bool raid1_should_read_first(struct mddev *mddev, return true; if (mddev_is_clustered(mddev) && - md_cluster_ops->area_resyncing(mddev, READ, this_sector, - this_sector + len)) + mddev->cluster_ops->area_resyncing(mddev, READ, this_sector, + this_sector + len)) return true; return false; } + +/* + * bio with REQ_RAHEAD or REQ_NOWAIT can fail at anytime, before such IO is + * submitted to the underlying disks, hence don't record badblocks or retry + * in this case. + */ +static inline bool raid1_should_handle_error(struct bio *bio) +{ + return !(bio->bi_opf & (REQ_RAHEAD | REQ_NOWAIT)); +} diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3c75a69376f4..1fe645e63001 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -36,6 +36,7 @@ #include "md.h" #include "raid1.h" #include "md-bitmap.h" +#include "md-cluster.h" #define UNSUPPORTED_MDDEV_FLAGS \ ((1L << MD_HAS_JOURNAL) | \ @@ -45,6 +46,7 @@ static void allow_barrier(struct r1conf *conf, sector_t sector_nr); static void lower_barrier(struct r1conf *conf, sector_t sector_nr); +static void raid1_free(struct mddev *mddev, void *priv); #define RAID_1_10_NAME "raid1" #include "raid1-10.c" @@ -371,14 +373,16 @@ static void raid1_end_read_request(struct bio *bio) */ update_head_pos(r1_bio->read_disk, r1_bio); - if (uptodate) + if (uptodate) { set_bit(R1BIO_Uptodate, &r1_bio->state); - else if (test_bit(FailFast, &rdev->flags) && - test_bit(R1BIO_FailFast, &r1_bio->state)) + } else if (test_bit(FailFast, &rdev->flags) && + test_bit(R1BIO_FailFast, &r1_bio->state)) { /* This was a fail-fast read so we definitely * want to retry */ ; - else { + } else if (!raid1_should_handle_error(bio)) { + uptodate = 1; + } else { /* If all other devices have failed, we want to return * the error upwards rather than fail the last device. * Here we redefine "uptodate" to mean "Don't want to retry" @@ -449,16 +453,15 @@ static void raid1_end_write_request(struct bio *bio) struct bio *to_put = NULL; int mirror = find_bio_disk(r1_bio, bio); struct md_rdev *rdev = conf->mirrors[mirror].rdev; - bool discard_error; sector_t lo = r1_bio->sector; sector_t hi = r1_bio->sector + r1_bio->sectors; - - discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; + bool ignore_error = !raid1_should_handle_error(bio) || + (bio->bi_status && bio_op(bio) == REQ_OP_DISCARD); /* * 'one mirror IO has finished' event handler: */ - if (bio->bi_status && !discard_error) { + if (bio->bi_status && !ignore_error) { set_bit(WriteErrorSeen, &rdev->flags); if (!test_and_set_bit(WantReplacement, &rdev->flags)) set_bit(MD_RECOVERY_NEEDED, & @@ -509,7 +512,7 @@ static void raid1_end_write_request(struct bio *bio) /* Maybe we can clear some bad blocks. */ if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) && - !discard_error) { + !ignore_error) { r1_bio->bios[mirror] = IO_MADE_GOOD; set_bit(R1BIO_MadeGood, &r1_bio->state); } @@ -1315,8 +1318,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, struct r1conf *conf = mddev->private; struct raid1_info *mirror; struct bio *read_bio; - const enum req_op op = bio_op(bio); - const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; int max_sectors; int rdisk, error; bool r1bio_existed = !!r1_bio; @@ -1404,7 +1405,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_iter.bi_sector = r1_bio->sector + mirror->rdev->data_offset; read_bio->bi_end_io = raid1_end_read_request; - read_bio->bi_opf = op | do_sync; if (test_bit(FailFast, &mirror->rdev->flags) && test_bit(R1BIO_FailFast, &r1_bio->state)) read_bio->bi_opf |= MD_FAILFAST; @@ -1467,7 +1467,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, bool is_discard = (bio_op(bio) == REQ_OP_DISCARD); if (mddev_is_clustered(mddev) && - md_cluster_ops->area_resyncing(mddev, WRITE, + mddev->cluster_ops->area_resyncing(mddev, WRITE, bio->bi_iter.bi_sector, bio_end_sector(bio))) { DEFINE_WAIT(w); @@ -1478,7 +1478,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, for (;;) { prepare_to_wait(&conf->wait_barrier, &w, TASK_IDLE); - if (!md_cluster_ops->area_resyncing(mddev, WRITE, + if (!mddev->cluster_ops->area_resyncing(mddev, WRITE, bio->bi_iter.bi_sector, bio_end_sector(bio))) break; @@ -1537,7 +1537,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, atomic_inc(&rdev->nr_pending); if (test_bit(WriteErrorSeen, &rdev->flags)) { sector_t first_bad; - int bad_sectors; + sector_t bad_sectors; int is_bad; is_bad = is_badblock(rdev, r1_bio->sector, max_sectors, @@ -1653,8 +1653,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset); mbio->bi_end_io = raid1_end_write_request; - mbio->bi_opf = bio_op(bio) | - (bio->bi_opf & (REQ_SYNC | REQ_FUA | REQ_ATOMIC)); if (test_bit(FailFast, &rdev->flags) && !test_bit(WriteMostly, &rdev->flags) && conf->raid_disks - mddev->degraded > 1) @@ -2203,14 +2201,9 @@ static int fix_sync_read_error(struct r1bio *r1_bio) if (!rdev_set_badblocks(rdev, sect, s, 0)) abort = 1; } - if (abort) { - conf->recovery_disabled = - mddev->recovery_disabled; - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_done_sync(mddev, r1_bio->sectors, 0); - put_buf(r1_bio); + if (abort) return 0; - } + /* Try next page */ sectors -= s; sect += s; @@ -2349,10 +2342,21 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) int disks = conf->raid_disks * 2; struct bio *wbio; - if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) - /* ouch - failed to read all of that. */ - if (!fix_sync_read_error(r1_bio)) + if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { + /* + * ouch - failed to read all of that. + * No need to fix read error for check/repair + * because all member disks are read. + */ + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) || + !fix_sync_read_error(r1_bio)) { + conf->recovery_disabled = mddev->recovery_disabled; + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_done_sync(mddev, r1_bio->sectors, 0); + put_buf(r1_bio); return; + } + } if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) process_checks(r1_bio); @@ -2486,7 +2490,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio) } } -static int narrow_write_error(struct r1bio *r1_bio, int i) +static bool narrow_write_error(struct r1bio *r1_bio, int i) { struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; @@ -2507,10 +2511,10 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) sector_t sector; int sectors; int sect_to_write = r1_bio->sectors; - int ok = 1; + bool ok = true; if (rdev->badblocks.shift < 0) - return 0; + return false; block_sectors = roundup(1 << rdev->badblocks.shift, bdev_logical_block_size(rdev->bdev) >> 9); @@ -2886,7 +2890,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, } else { /* may need to read from here */ sector_t first_bad = MaxSector; - int bad_sectors; + sector_t bad_sectors; if (is_badblock(rdev, sector_nr, good_sectors, &first_bad, &bad_sectors)) { @@ -3038,9 +3042,9 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, conf->cluster_sync_low = mddev->curr_resync_completed; conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS; /* Send resync message */ - md_cluster_ops->resync_info_update(mddev, - conf->cluster_sync_low, - conf->cluster_sync_high); + mddev->cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); } /* For a user-requested sync, we read all readable devices and do a @@ -3217,7 +3221,7 @@ static int raid1_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; - lim.features |= BLK_FEAT_ATOMIC_WRITES_STACKED; + lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) return err; @@ -3256,8 +3260,11 @@ static int raid1_run(struct mddev *mddev) if (!mddev_is_dm(mddev)) { ret = raid1_set_limits(mddev); - if (ret) + if (ret) { + if (!mddev->private) + raid1_free(mddev, conf); return ret; + } } mddev->degraded = 0; @@ -3271,6 +3278,8 @@ static int raid1_run(struct mddev *mddev) */ if (conf->raid_disks - mddev->degraded < 1) { md_unregister_thread(mddev, &conf->thread); + if (!mddev->private) + raid1_free(mddev, conf); return -EINVAL; } @@ -3491,9 +3500,13 @@ static void *raid1_takeover(struct mddev *mddev) static struct md_personality raid1_personality = { - .name = "raid1", - .level = 1, - .owner = THIS_MODULE, + .head = { + .type = MD_PERSONALITY, + .id = ID_RAID1, + .name = "raid1", + .owner = THIS_MODULE, + }, + .make_request = raid1_make_request, .run = raid1_run, .free = raid1_free, @@ -3510,18 +3523,18 @@ static struct md_personality raid1_personality = .takeover = raid1_takeover, }; -static int __init raid_init(void) +static int __init raid1_init(void) { - return register_md_personality(&raid1_personality); + return register_md_submodule(&raid1_personality.head); } -static void raid_exit(void) +static void __exit raid1_exit(void) { - unregister_md_personality(&raid1_personality); + unregister_md_submodule(&raid1_personality.head); } -module_init(raid_init); -module_exit(raid_exit); +module_init(raid1_init); +module_exit(raid1_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD"); MODULE_ALIAS("md-personality-3"); /* RAID1 */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 8b736f30ef92..54320a887ecc 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -24,6 +24,7 @@ #include "raid10.h" #include "raid0.h" #include "md-bitmap.h" +#include "md-cluster.h" /* * RAID10 provides a combination of RAID0 and RAID1 functionality. @@ -398,6 +399,8 @@ static void raid10_end_read_request(struct bio *bio) * wait for the 'master' bio. */ set_bit(R10BIO_Uptodate, &r10_bio->state); + } else if (!raid1_should_handle_error(bio)) { + uptodate = 1; } else { /* If all other devices that store this block have * failed, we want to return the error upwards rather @@ -455,9 +458,8 @@ static void raid10_end_write_request(struct bio *bio) int slot, repl; struct md_rdev *rdev = NULL; struct bio *to_put = NULL; - bool discard_error; - - discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; + bool ignore_error = !raid1_should_handle_error(bio) || + (bio->bi_status && bio_op(bio) == REQ_OP_DISCARD); dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); @@ -471,7 +473,7 @@ static void raid10_end_write_request(struct bio *bio) /* * this branch is our 'one mirror IO has finished' event handler: */ - if (bio->bi_status && !discard_error) { + if (bio->bi_status && !ignore_error) { if (repl) /* Never record new bad blocks to replacement, * just fail it. @@ -526,7 +528,7 @@ static void raid10_end_write_request(struct bio *bio) /* Maybe we can clear some bad blocks. */ if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr, r10_bio->sectors) && - !discard_error) { + !ignore_error) { bio_put(bio); if (repl) r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; @@ -747,7 +749,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, for (slot = 0; slot < conf->copies ; slot++) { sector_t first_bad; - int bad_sectors; + sector_t bad_sectors; sector_t dev_sector; unsigned int pending; bool nonrot; @@ -1146,8 +1148,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, { struct r10conf *conf = mddev->private; struct bio *read_bio; - const enum req_op op = bio_op(bio); - const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; int max_sectors; struct md_rdev *rdev; char b[BDEVNAME_SIZE]; @@ -1228,7 +1228,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + choose_data_offset(r10_bio, rdev); read_bio->bi_end_io = raid10_end_read_request; - read_bio->bi_opf = op | do_sync; if (test_bit(FailFast, &rdev->flags) && test_bit(R10BIO_FailFast, &r10_bio->state)) read_bio->bi_opf |= MD_FAILFAST; @@ -1247,10 +1246,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, struct bio *bio, bool replacement, int n_copy) { - const enum req_op op = bio_op(bio); - const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; - const blk_opf_t do_fua = bio->bi_opf & REQ_FUA; - const blk_opf_t do_atomic = bio->bi_opf & REQ_ATOMIC; unsigned long flags; struct r10conf *conf = mddev->private; struct md_rdev *rdev; @@ -1269,7 +1264,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + choose_data_offset(r10_bio, rdev)); mbio->bi_end_io = raid10_end_write_request; - mbio->bi_opf = op | do_sync | do_fua | do_atomic; if (!replacement && test_bit(FailFast, &conf->mirrors[devnum].rdev->flags) && enough(conf, devnum)) @@ -1355,9 +1349,9 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, int error; if ((mddev_is_clustered(mddev) && - md_cluster_ops->area_resyncing(mddev, WRITE, - bio->bi_iter.bi_sector, - bio_end_sector(bio)))) { + mddev->cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, + bio_end_sector(bio)))) { DEFINE_WAIT(w); /* Bail out if REQ_NOWAIT is set for the bio */ if (bio->bi_opf & REQ_NOWAIT) { @@ -1367,7 +1361,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, for (;;) { prepare_to_wait(&conf->wait_barrier, &w, TASK_IDLE); - if (!md_cluster_ops->area_resyncing(mddev, WRITE, + if (!mddev->cluster_ops->area_resyncing(mddev, WRITE, bio->bi_iter.bi_sector, bio_end_sector(bio))) break; schedule(); @@ -1438,7 +1432,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { sector_t first_bad; sector_t dev_sector = r10_bio->devs[i].addr; - int bad_sectors; + sector_t bad_sectors; int is_bad; is_bad = is_badblock(rdev, dev_sector, max_sectors, @@ -1631,11 +1625,10 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) return -EAGAIN; - if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) { + if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) { bio_wouldblock_error(bio); return 0; } - wait_barrier(conf, false); /* * Check reshape again to avoid reshape happens after checking @@ -1743,6 +1736,7 @@ retry_discard: * The discard bio returns only first r10bio finishes */ if (first_copy) { + md_account_bio(mddev, &bio); r10_bio->master_bio = bio; set_bit(R10BIO_Discard, &r10_bio->state); first_copy = false; @@ -2786,7 +2780,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 } } -static int narrow_write_error(struct r10bio *r10_bio, int i) +static bool narrow_write_error(struct r10bio *r10_bio, int i) { struct bio *bio = r10_bio->master_bio; struct mddev *mddev = r10_bio->mddev; @@ -2807,10 +2801,10 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) sector_t sector; int sectors; int sect_to_write = r10_bio->sectors; - int ok = 1; + bool ok = true; if (rdev->badblocks.shift < 0) - return 0; + return false; block_sectors = roundup(1 << rdev->badblocks.shift, bdev_logical_block_size(rdev->bdev) >> 9); @@ -3413,7 +3407,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, sector_t from_addr, to_addr; struct md_rdev *rdev = conf->mirrors[d].rdev; sector_t sector, first_bad; - int bad_sectors; + sector_t bad_sectors; if (!rdev || !test_bit(In_sync, &rdev->flags)) continue; @@ -3609,7 +3603,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; sector_t first_bad, sector; - int bad_sectors; + sector_t bad_sectors; struct md_rdev *rdev; if (r10_bio->devs[i].repl_bio) @@ -3716,7 +3710,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, conf->cluster_sync_low = mddev->curr_resync_completed; raid10_set_cluster_sync_high(conf); /* Send resync message */ - md_cluster_ops->resync_info_update(mddev, + mddev->cluster_ops->resync_info_update(mddev, conf->cluster_sync_low, conf->cluster_sync_high); } @@ -3749,7 +3743,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } if (broadcast_msg) { raid10_set_cluster_sync_high(conf); - md_cluster_ops->resync_info_update(mddev, + mddev->cluster_ops->resync_info_update(mddev, conf->cluster_sync_low, conf->cluster_sync_high); } @@ -4018,7 +4012,7 @@ static int raid10_set_queue_limits(struct mddev *mddev) lim.max_write_zeroes_sectors = 0; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * raid10_nr_stripes(conf); - lim.features |= BLK_FEAT_ATOMIC_WRITES_STACKED; + lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) return err; @@ -4541,7 +4535,7 @@ static int raid10_start_reshape(struct mddev *mddev) if (ret) goto abort; - ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize); + ret = mddev->cluster_ops->resize_bitmaps(mddev, newsize, oldsize); if (ret) { mddev->bitmap_ops->resize(mddev, oldsize, 0, false); goto abort; @@ -4832,7 +4826,7 @@ read_more: conf->cluster_sync_low = sb_reshape_pos; } - md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low, + mddev->cluster_ops->resync_info_update(mddev, conf->cluster_sync_low, conf->cluster_sync_high); } @@ -4977,7 +4971,7 @@ static void raid10_update_reshape_pos(struct mddev *mddev) struct r10conf *conf = mddev->private; sector_t lo, hi; - md_cluster_ops->resync_info_get(mddev, &lo, &hi); + mddev->cluster_ops->resync_info_get(mddev, &lo, &hi); if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo)) || mddev->reshape_position == MaxSector) conf->reshape_progress = mddev->reshape_position; @@ -5123,9 +5117,13 @@ static void raid10_finish_reshape(struct mddev *mddev) static struct md_personality raid10_personality = { - .name = "raid10", - .level = 10, - .owner = THIS_MODULE, + .head = { + .type = MD_PERSONALITY, + .id = ID_RAID10, + .name = "raid10", + .owner = THIS_MODULE, + }, + .make_request = raid10_make_request, .run = raid10_run, .free = raid10_free, @@ -5145,18 +5143,18 @@ static struct md_personality raid10_personality = .update_reshape_pos = raid10_update_reshape_pos, }; -static int __init raid_init(void) +static int __init raid10_init(void) { - return register_md_personality(&raid10_personality); + return register_md_submodule(&raid10_personality.head); } -static void raid_exit(void) +static void __exit raid10_exit(void) { - unregister_md_personality(&raid10_personality); + unregister_md_submodule(&raid10_personality.head); } -module_init(raid_init); -module_exit(raid_exit); +module_init(raid10_init); +module_exit(raid10_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD"); MODULE_ALIAS("md-personality-9"); /* RAID10 */ diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 011246e16a99..ba768ca7f422 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -714,7 +714,7 @@ static void r5l_submit_current_io(struct r5l_log *log) block = page_address(io->meta_page); block->meta_size = cpu_to_le32(io->meta_offset); - crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); + crc = crc32c(log->uuid_checksum, block, PAGE_SIZE); block->checksum = cpu_to_le32(crc); log->current_io = NULL; @@ -1019,10 +1019,10 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) /* checksum is already calculated in last run */ if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) continue; - addr = kmap_atomic(sh->dev[i].page); - sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, - addr, PAGE_SIZE); - kunmap_atomic(addr); + addr = kmap_local_page(sh->dev[i].page); + sh->dev[i].log_checksum = crc32c(log->uuid_checksum, + addr, PAGE_SIZE); + kunmap_local(addr); } parity_pages = 1 + !!(sh->qd_idx >= 0); data_pages = write_disks - parity_pages; @@ -1741,7 +1741,7 @@ static int r5l_recovery_read_meta_block(struct r5l_log *log, le64_to_cpu(mb->position) != ctx->pos) return -EINVAL; - crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); + crc = crc32c(log->uuid_checksum, mb, PAGE_SIZE); if (stored_crc != crc) return -EINVAL; @@ -1780,8 +1780,7 @@ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, return -ENOMEM; r5l_recovery_create_empty_meta_block(log, page, pos, seq); mb = page_address(page); - mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, - mb, PAGE_SIZE)); + mb->checksum = cpu_to_le32(crc32c(log->uuid_checksum, mb, PAGE_SIZE)); if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE | REQ_SYNC | REQ_FUA, false)) { __free_page(page); @@ -1975,9 +1974,9 @@ r5l_recovery_verify_data_checksum(struct r5l_log *log, u32 checksum; r5l_recovery_read_page(log, ctx, page, log_offset); - addr = kmap_atomic(page); - checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); - kunmap_atomic(addr); + addr = kmap_local_page(page); + checksum = crc32c(log->uuid_checksum, addr, PAGE_SIZE); + kunmap_local(addr); return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL; } @@ -2377,11 +2376,11 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, payload->size = cpu_to_le32(BLOCK_SECTORS); payload->location = cpu_to_le64( raid5_compute_blocknr(sh, i, 0)); - addr = kmap_atomic(dev->page); + addr = kmap_local_page(dev->page); payload->checksum[0] = cpu_to_le32( - crc32c_le(log->uuid_checksum, addr, - PAGE_SIZE)); - kunmap_atomic(addr); + crc32c(log->uuid_checksum, addr, + PAGE_SIZE)); + kunmap_local(addr); sync_page_io(log->rdev, write_pos, PAGE_SIZE, dev->page, REQ_OP_WRITE, false); write_pos = r5l_ring_add(log, write_pos, @@ -2392,8 +2391,8 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, } } mb->meta_size = cpu_to_le32(offset); - mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, - mb, PAGE_SIZE)); + mb->checksum = cpu_to_le32(crc32c(log->uuid_checksum, + mb, PAGE_SIZE)); sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_WRITE | REQ_SYNC | REQ_FUA, false); sh->log_start = ctx->pos; @@ -2884,10 +2883,10 @@ int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh) if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) continue; - addr = kmap_atomic(sh->dev[i].page); - sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, - addr, PAGE_SIZE); - kunmap_atomic(addr); + addr = kmap_local_page(sh->dev[i].page); + sh->dev[i].log_checksum = crc32c(log->uuid_checksum, + addr, PAGE_SIZE); + kunmap_local(addr); pages++; } WARN_ON(pages == 0); @@ -2969,7 +2968,7 @@ static int r5l_load_log(struct r5l_log *log) } stored_crc = le32_to_cpu(mb->checksum); mb->checksum = 0; - expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); + expected_crc = crc32c(log->uuid_checksum, mb, PAGE_SIZE); if (stored_crc != expected_crc) { create_super = true; goto create; @@ -3077,8 +3076,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) return -ENOMEM; log->rdev = rdev; log->need_cache_flush = bdev_write_cache(rdev->bdev); - log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid, - sizeof(rdev->mddev->uuid)); + log->uuid_checksum = crc32c(~0, rdev->mddev->uuid, + sizeof(rdev->mddev->uuid)); mutex_init(&log->io_mutex); diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 37c4da5311ca..c0fb335311aa 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -346,9 +346,9 @@ static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh) if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) { le32_add_cpu(&e->pp_size, PAGE_SIZE); io->pp_size += PAGE_SIZE; - e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum), - page_address(sh->ppl_page), - PAGE_SIZE)); + e->checksum = cpu_to_le32(crc32c(le32_to_cpu(e->checksum), + page_address(sh->ppl_page), + PAGE_SIZE)); } list_add_tail(&sh->log_list, &io->stripe_list); @@ -454,7 +454,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) } pplhdr->entries_count = cpu_to_le32(io->entries_count); - pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE)); + pplhdr->checksum = cpu_to_le32(~crc32c(~0, pplhdr, PPL_HEADER_SIZE)); /* Rewind the buffer if current PPL is larger then remaining space */ if (log->use_multippl && @@ -998,7 +998,7 @@ static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr, goto out; } - crc = crc32c_le(crc, page_address(page), s); + crc = crc32c(crc, page_address(page), s); pp_size -= s; sector += s >> 9; @@ -1052,7 +1052,7 @@ static int ppl_write_empty_header(struct ppl_log *log) log->rdev->ppl.size, GFP_NOIO, 0); memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED); pplhdr->signature = cpu_to_le32(log->ppl_conf->signature); - pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE)); + pplhdr->checksum = cpu_to_le32(~crc32c(~0, pplhdr, PAGE_SIZE)); if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset, PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_SYNC | @@ -1106,7 +1106,7 @@ static int ppl_load_distributed(struct ppl_log *log) /* check header validity */ crc_stored = le32_to_cpu(pplhdr->checksum); pplhdr->checksum = 0; - crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE); + crc = ~crc32c(~0, pplhdr, PAGE_SIZE); if (crc_stored != crc) { pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x (offset: %llu)\n", @@ -1390,7 +1390,7 @@ int ppl_init_log(struct r5conf *conf) spin_lock_init(&ppl_conf->no_mem_stripes_lock); if (!mddev->external) { - ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid)); + ppl_conf->signature = ~crc32c(~0, mddev->uuid, sizeof(mddev->uuid)); ppl_conf->block_size = 512; } else { ppl_conf->block_size = diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5c79429acc64..6389383166c0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5858,6 +5858,9 @@ static enum reshape_loc get_reshape_loc(struct mddev *mddev, struct r5conf *conf, sector_t logical_sector) { sector_t reshape_progress, reshape_safe; + + if (likely(conf->reshape_progress == MaxSector)) + return LOC_NO_RESHAPE; /* * Spinlock is needed as reshape_progress may be * 64bit on a 32bit platform, and so it might be @@ -5935,22 +5938,19 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, const int rw = bio_data_dir(bi); enum stripe_result ret; struct stripe_head *sh; + enum reshape_loc loc; sector_t new_sector; int previous = 0, flags = 0; int seq, dd_idx; seq = read_seqcount_begin(&conf->gen_lock); - - if (unlikely(conf->reshape_progress != MaxSector)) { - enum reshape_loc loc = get_reshape_loc(mddev, conf, - logical_sector); - if (loc == LOC_INSIDE_RESHAPE) { - ret = STRIPE_SCHEDULE_AND_RETRY; - goto out; - } - if (loc == LOC_AHEAD_OF_RESHAPE) - previous = 1; + loc = get_reshape_loc(mddev, conf, logical_sector); + if (loc == LOC_INSIDE_RESHAPE) { + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out; } + if (loc == LOC_AHEAD_OF_RESHAPE) + previous = 1; new_sector = raid5_compute_sector(conf, logical_sector, previous, &dd_idx, NULL); @@ -6127,7 +6127,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) /* Bail out if conflicts with reshape and REQ_NOWAIT is set */ if ((bi->bi_opf & REQ_NOWAIT) && - (conf->reshape_progress != MaxSector) && get_reshape_loc(mddev, conf, logical_sector) == LOC_INSIDE_RESHAPE) { bio_wouldblock_error(bi); if (rw == WRITE) @@ -8954,9 +8953,13 @@ static void raid5_prepare_suspend(struct mddev *mddev) static struct md_personality raid6_personality = { - .name = "raid6", - .level = 6, - .owner = THIS_MODULE, + .head = { + .type = MD_PERSONALITY, + .id = ID_RAID6, + .name = "raid6", + .owner = THIS_MODULE, + }, + .make_request = raid5_make_request, .run = raid5_run, .start = raid5_start, @@ -8980,9 +8983,13 @@ static struct md_personality raid6_personality = }; static struct md_personality raid5_personality = { - .name = "raid5", - .level = 5, - .owner = THIS_MODULE, + .head = { + .type = MD_PERSONALITY, + .id = ID_RAID5, + .name = "raid5", + .owner = THIS_MODULE, + }, + .make_request = raid5_make_request, .run = raid5_run, .start = raid5_start, @@ -9007,9 +9014,13 @@ static struct md_personality raid5_personality = static struct md_personality raid4_personality = { - .name = "raid4", - .level = 4, - .owner = THIS_MODULE, + .head = { + .type = MD_PERSONALITY, + .id = ID_RAID4, + .name = "raid4", + .owner = THIS_MODULE, + }, + .make_request = raid5_make_request, .run = raid5_run, .start = raid5_start, @@ -9045,21 +9056,39 @@ static int __init raid5_init(void) "md/raid5:prepare", raid456_cpu_up_prepare, raid456_cpu_dead); - if (ret) { - destroy_workqueue(raid5_wq); - return ret; - } - register_md_personality(&raid6_personality); - register_md_personality(&raid5_personality); - register_md_personality(&raid4_personality); + if (ret) + goto err_destroy_wq; + + ret = register_md_submodule(&raid6_personality.head); + if (ret) + goto err_cpuhp_remove; + + ret = register_md_submodule(&raid5_personality.head); + if (ret) + goto err_unregister_raid6; + + ret = register_md_submodule(&raid4_personality.head); + if (ret) + goto err_unregister_raid5; + return 0; + +err_unregister_raid5: + unregister_md_submodule(&raid5_personality.head); +err_unregister_raid6: + unregister_md_submodule(&raid6_personality.head); +err_cpuhp_remove: + cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE); +err_destroy_wq: + destroy_workqueue(raid5_wq); + return ret; } -static void raid5_exit(void) +static void __exit raid5_exit(void) { - unregister_md_personality(&raid6_personality); - unregister_md_personality(&raid5_personality); - unregister_md_personality(&raid4_personality); + unregister_md_submodule(&raid6_personality.head); + unregister_md_submodule(&raid5_personality.head); + unregister_md_submodule(&raid4_personality.head); cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE); destroy_workqueue(raid5_wq); } |