diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Kconfig | 11 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy-smq.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-core.h | 10 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 15 | ||||
-rw-r--r-- | drivers/md/dm-integrity.c | 23 | ||||
-rw-r--r-- | drivers/md/dm-ioctl.c | 18 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 26 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-rq.c | 316 | ||||
-rw-r--r-- | drivers/md/dm-rq.h | 4 | ||||
-rw-r--r-- | drivers/md/dm-sysfs.c | 3 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 46 | ||||
-rw-r--r-- | drivers/md/dm-thin.c | 8 | ||||
-rw-r--r-- | drivers/md/dm-verity-fec.c | 5 | ||||
-rw-r--r-- | drivers/md/dm-writecache.c | 5 | ||||
-rw-r--r-- | drivers/md/dm-zoned-metadata.c | 80 | ||||
-rw-r--r-- | drivers/md/dm-zoned-target.c | 20 | ||||
-rw-r--r-- | drivers/md/dm.c | 25 | ||||
-rw-r--r-- | drivers/md/dm.h | 1 | ||||
-rw-r--r-- | drivers/md/md-bitmap.c | 9 | ||||
-rw-r--r-- | drivers/md/md-cluster.c | 234 | ||||
-rw-r--r-- | drivers/md/md-cluster.h | 2 | ||||
-rw-r--r-- | drivers/md/md.c | 113 | ||||
-rw-r--r-- | drivers/md/md.h | 1 | ||||
-rw-r--r-- | drivers/md/raid1.c | 1 | ||||
-rw-r--r-- | drivers/md/raid10.c | 109 | ||||
-rw-r--r-- | drivers/md/raid5-cache.c | 2 | ||||
-rw-r--r-- | drivers/md/raid5.c | 12 |
28 files changed, 508 insertions, 595 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 8b8c123cae66..3db222509e44 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -215,17 +215,6 @@ config BLK_DEV_DM If unsure, say N. -config DM_MQ_DEFAULT - bool "request-based DM: use blk-mq I/O path by default" - depends on BLK_DEV_DM - ---help--- - This option enables the blk-mq based I/O path for request-based - DM devices by default. With the option the dm_mod.use_blk_mq - module/boot option defaults to Y, without it to N, but it can - still be overriden either way. - - If unsure say N. - config DM_DEBUG bool "Device mapper debugging support" depends on BLK_DEV_DM diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index 1b5b9ad9e492..b61aac00ff40 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -1200,7 +1200,7 @@ static void queue_demotion(struct smq_policy *mq) struct policy_work work; struct entry *e; - if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed))) + if (WARN_ON_ONCE(!mq->migrations_allowed)) return; e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true); diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 7d480c930eaf..224d44503a06 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -112,18 +112,8 @@ struct mapped_device { struct dm_stats stats; - struct kthread_worker kworker; - struct task_struct *kworker_task; - - /* for request-based merge heuristic in dm_request_fn() */ - unsigned seq_rq_merge_deadline_usecs; - int last_rq_rw; - sector_t last_rq_pos; - ktime_t last_rq_start_time; - /* for blk-mq request-based DM support */ struct blk_mq_tag_set *tag_set; - bool use_blk_mq:1; bool init_tio_pdu:1; struct srcu_struct io_barrier; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 0481223b1deb..b8eec515a003 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2661,6 +2661,7 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct crypt_config *cc; + const char *devname = dm_table_device_name(ti->table); int key_size; unsigned int align_mask; unsigned long long tmpll; @@ -2806,18 +2807,22 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ret = -ENOMEM; - cc->io_queue = alloc_workqueue("kcryptd_io", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1); + cc->io_queue = alloc_workqueue("kcryptd_io/%s", + WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, + 1, devname); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; goto bad; } if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) - cc->crypt_queue = alloc_workqueue("kcryptd", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1); + cc->crypt_queue = alloc_workqueue("kcryptd/%s", + WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, + 1, devname); else - cc->crypt_queue = alloc_workqueue("kcryptd", + cc->crypt_queue = alloc_workqueue("kcryptd/%s", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, - num_online_cpus()); + num_online_cpus(), devname); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; goto bad; @@ -2826,7 +2831,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) spin_lock_init(&cc->write_thread_lock); cc->write_tree = RB_ROOT; - cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write"); + cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write/%s", devname); if (IS_ERR(cc->write_thread)) { ret = PTR_ERR(cc->write_thread); cc->write_thread = NULL; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index e1fa6baf4e8e..bb3096bf2cc6 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -559,7 +559,12 @@ static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result } memset(result + size, 0, JOURNAL_MAC_SIZE - size); } else { - __u8 digest[size]; + __u8 digest[HASH_MAX_DIGESTSIZE]; + + if (WARN_ON(size > sizeof(digest))) { + dm_integrity_io_error(ic, "digest_size", -EINVAL); + goto err; + } r = crypto_shash_final(desc, digest); if (unlikely(r)) { dm_integrity_io_error(ic, "crypto_shash_final", r); @@ -1324,7 +1329,7 @@ static void integrity_metadata(struct work_struct *w) struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); char *checksums; unsigned extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0; - char checksums_onstack[ic->tag_size + extra_space]; + char checksums_onstack[HASH_MAX_DIGESTSIZE]; unsigned sectors_to_process = dio->range.n_sectors; sector_t sector = dio->range.logical_sector; @@ -1333,8 +1338,14 @@ static void integrity_metadata(struct work_struct *w) checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space, GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN); - if (!checksums) + if (!checksums) { checksums = checksums_onstack; + if (WARN_ON(extra_space && + digest_size > sizeof(checksums_onstack))) { + r = -EINVAL; + goto error; + } + } __bio_for_each_segment(bv, bio, iter, dio->orig_bi_iter) { unsigned pos; @@ -1546,7 +1557,7 @@ retry_kmap: } while (++s < ic->sectors_per_block); #ifdef INTERNAL_VERIFY if (ic->internal_hash) { - char checksums_onstack[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)]; + char checksums_onstack[max(HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { @@ -1596,7 +1607,7 @@ retry_kmap: if (ic->internal_hash) { unsigned digest_size = crypto_shash_digestsize(ic->internal_hash); if (unlikely(digest_size > ic->tag_size)) { - char checksums_onstack[digest_size]; + char checksums_onstack[HASH_MAX_DIGESTSIZE]; integrity_sector_checksum(ic, logical_sector, (char *)js, checksums_onstack); memcpy(journal_entry_tag(ic, je), checksums_onstack, ic->tag_size); } else @@ -2023,7 +2034,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, unlikely(from_replay) && #endif ic->internal_hash) { - char test_tag[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)]; + char test_tag[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block), (char *)access_journal_data(ic, i, l), test_tag); diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index b810ea77e6b1..f666778ad237 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1720,8 +1720,7 @@ static void free_params(struct dm_ioctl *param, size_t param_size, int param_fla } static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kernel, - int ioctl_flags, - struct dm_ioctl **param, int *param_flags) + int ioctl_flags, struct dm_ioctl **param, int *param_flags) { struct dm_ioctl *dmi; int secure_data; @@ -1762,18 +1761,13 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern *param_flags |= DM_PARAMS_MALLOC; - if (copy_from_user(dmi, user, param_kernel->data_size)) - goto bad; + /* Copy from param_kernel (which was already copied from user) */ + memcpy(dmi, param_kernel, minimum_data_size); -data_copied: - /* - * Abort if something changed the ioctl data while it was being copied. - */ - if (dmi->data_size != param_kernel->data_size) { - DMERR("rejecting ioctl: data size modified while processing parameters"); + if (copy_from_user(&dmi->data, (char __user *)user + minimum_data_size, + param_kernel->data_size - minimum_data_size)) goto bad; - } - +data_copied: /* Wipe the user buffer so we do not return it to userspace */ if (secure_data && clear_user(user, param_kernel->data_size)) goto bad; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 419362c2d8ac..d6a66921daf4 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -203,14 +203,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti) static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) { if (m->queue_mode == DM_TYPE_NONE) { - /* - * Default to request-based. - */ - if (dm_use_blk_mq(dm_table_get_md(ti->table))) - m->queue_mode = DM_TYPE_MQ_REQUEST_BASED; - else - m->queue_mode = DM_TYPE_REQUEST_BASED; - + m->queue_mode = DM_TYPE_REQUEST_BASED; } else if (m->queue_mode == DM_TYPE_BIO_BASED) { INIT_WORK(&m->process_queued_bios, process_queued_bios); /* @@ -537,10 +530,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, * get the queue busy feedback (via BLK_STS_RESOURCE), * otherwise I/O merging can suffer. */ - if (q->mq_ops) - return DM_MAPIO_REQUEUE; - else - return DM_MAPIO_DELAY_REQUEUE; + return DM_MAPIO_REQUEUE; } clone->bio = clone->biotail = NULL; clone->rq_disk = bdev->bd_disk; @@ -668,7 +658,7 @@ static int multipath_map_bio(struct dm_target *ti, struct bio *bio) static void process_queued_io_list(struct multipath *m) { - if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED) + if (m->queue_mode == DM_TYPE_REQUEST_BASED) dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table)); else if (m->queue_mode == DM_TYPE_BIO_BASED) queue_work(kmultipathd, &m->process_queued_bios); @@ -1089,10 +1079,9 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) if (!strcasecmp(queue_mode_name, "bio")) m->queue_mode = DM_TYPE_BIO_BASED; - else if (!strcasecmp(queue_mode_name, "rq")) + else if (!strcasecmp(queue_mode_name, "rq") || + !strcasecmp(queue_mode_name, "mq")) m->queue_mode = DM_TYPE_REQUEST_BASED; - else if (!strcasecmp(queue_mode_name, "mq")) - m->queue_mode = DM_TYPE_MQ_REQUEST_BASED; else { ti->error = "Unknown 'queue_mode' requested"; r = -EINVAL; @@ -1726,9 +1715,6 @@ static void multipath_status(struct dm_target *ti, status_type_t type, case DM_TYPE_BIO_BASED: DMEMIT("queue_mode bio "); break; - case DM_TYPE_MQ_REQUEST_BASED: - DMEMIT("queue_mode mq "); - break; default: WARN_ON_ONCE(true); break; @@ -1972,7 +1958,7 @@ static int multipath_busy(struct dm_target *ti) /* no paths available, for blk-mq: rely on IO mapping to delay requeue */ if (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) - return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED); + return (m->queue_mode != DM_TYPE_REQUEST_BASED); /* Guess which priority_group will be used at next mapping time */ pg = READ_ONCE(m->current_pg); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c44925e4e481..e1dd1622a290 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2475,7 +2475,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev) } /* Enable bitmap creation for RAID levels != 0 */ - mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096); + mddev->bitmap_info.offset = (rt_is_raid0(rs->raid_type) || rs->journal_dev.dev) ? 0 : to_sector(4096); mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; if (!test_and_clear_bit(FirstUse, &rdev->flags)) { diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 6e547b8dd298..7cd36e4d1310 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -23,19 +23,6 @@ static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH; #define RESERVED_REQUEST_BASED_IOS 256 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; -static bool use_blk_mq = IS_ENABLED(CONFIG_DM_MQ_DEFAULT); - -bool dm_use_blk_mq_default(void) -{ - return use_blk_mq; -} - -bool dm_use_blk_mq(struct mapped_device *md) -{ - return md->use_blk_mq; -} -EXPORT_SYMBOL_GPL(dm_use_blk_mq); - unsigned dm_get_reserved_rq_based_ios(void) { return __dm_get_module_param(&reserved_rq_based_ios, @@ -59,41 +46,13 @@ int dm_request_based(struct mapped_device *md) return queue_is_rq_based(md->queue); } -static void dm_old_start_queue(struct request_queue *q) -{ - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - if (blk_queue_stopped(q)) - blk_start_queue(q); - spin_unlock_irqrestore(q->queue_lock, flags); -} - -static void dm_mq_start_queue(struct request_queue *q) +void dm_start_queue(struct request_queue *q) { blk_mq_unquiesce_queue(q); blk_mq_kick_requeue_list(q); } -void dm_start_queue(struct request_queue *q) -{ - if (!q->mq_ops) - dm_old_start_queue(q); - else - dm_mq_start_queue(q); -} - -static void dm_old_stop_queue(struct request_queue *q) -{ - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - if (!blk_queue_stopped(q)) - blk_stop_queue(q); - spin_unlock_irqrestore(q->queue_lock, flags); -} - -static void dm_mq_stop_queue(struct request_queue *q) +void dm_stop_queue(struct request_queue *q) { if (blk_mq_queue_stopped(q)) return; @@ -101,14 +60,6 @@ static void dm_mq_stop_queue(struct request_queue *q) blk_mq_quiesce_queue(q); } -void dm_stop_queue(struct request_queue *q) -{ - if (!q->mq_ops) - dm_old_stop_queue(q); - else - dm_mq_stop_queue(q); -} - /* * Partial completion handling for request-based dm */ @@ -179,9 +130,6 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig) */ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) { - struct request_queue *q = md->queue; - unsigned long flags; - atomic_dec(&md->pending[rw]); /* nudge anyone waiting on suspend queue */ @@ -189,18 +137,6 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) wake_up(&md->wait); /* - * Run this off this callpath, as drivers could invoke end_io while - * inside their request_fn (and holding the queue lock). Calling - * back into ->request_fn() could deadlock attempting to grab the - * queue lock again. - */ - if (!q->mq_ops && run_queue) { - spin_lock_irqsave(q->queue_lock, flags); - blk_run_queue_async(q); - spin_unlock_irqrestore(q->queue_lock, flags); - } - - /* * dm_put() must be at the end of this function. See the comment above */ dm_put(md); @@ -222,27 +158,10 @@ static void dm_end_request(struct request *clone, blk_status_t error) tio->ti->type->release_clone_rq(clone); rq_end_stats(md, rq); - if (!rq->q->mq_ops) - blk_end_request_all(rq, error); - else - blk_mq_end_request(rq, error); + blk_mq_end_request(rq, error); rq_completed(md, rw, true); } -/* - * Requeue the original request of a clone. - */ -static void dm_old_requeue_request(struct request *rq, unsigned long delay_ms) -{ - struct request_queue *q = rq->q; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - blk_requeue_request(q, rq); - blk_delay_queue(q, delay_ms); - spin_unlock_irqrestore(q->queue_lock, flags); -} - static void __dm_mq_kick_requeue_list(struct request_queue *q, unsigned long msecs) { blk_mq_delay_kick_requeue_list(q, msecs); @@ -273,11 +192,7 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_ tio->ti->type->release_clone_rq(tio->clone); } - if (!rq->q->mq_ops) - dm_old_requeue_request(rq, delay_ms); - else - dm_mq_delay_requeue_request(rq, delay_ms); - + dm_mq_delay_requeue_request(rq, delay_ms); rq_completed(md, rw, false); } @@ -340,10 +255,7 @@ static void dm_softirq_done(struct request *rq) rq_end_stats(md, rq); rw = rq_data_dir(rq); - if (!rq->q->mq_ops) - blk_end_request_all(rq, tio->error); - else - blk_mq_end_request(rq, tio->error); + blk_mq_end_request(rq, tio->error); rq_completed(md, rw, false); return; } @@ -363,17 +275,14 @@ static void dm_complete_request(struct request *rq, blk_status_t error) struct dm_rq_target_io *tio = tio_from_request(rq); tio->error = error; - if (!rq->q->mq_ops) - blk_complete_request(rq); - else - blk_mq_complete_request(rq); + blk_mq_complete_request(rq); } /* * Complete the not-mapped clone and the original request with the error status * through softirq context. * Target's rq_end_io() function isn't called. - * This may be used when the target's map_rq() or clone_and_map_rq() functions fail. + * This may be used when the target's clone_and_map_rq() function fails. */ static void dm_kill_unmapped_request(struct request *rq, blk_status_t error) { @@ -381,21 +290,10 @@ static void dm_kill_unmapped_request(struct request *rq, blk_status_t error) dm_complete_request(rq, error); } -/* - * Called with the clone's queue lock held (in the case of .request_fn) - */ static void end_clone_request(struct request *clone, blk_status_t error) { struct dm_rq_target_io *tio = clone->end_io_data; - /* - * Actual request completion is done in a softirq context which doesn't - * hold the clone's queue lock. Otherwise, deadlock could occur because: - * - another request may be submitted by the upper level driver - * of the stacking during the completion - * - the submission which requires queue lock may be done - * against this clone's queue - */ dm_complete_request(tio->orig, error); } @@ -446,8 +344,6 @@ static int setup_clone(struct request *clone, struct request *rq, return 0; } -static void map_tio_request(struct kthread_work *work); - static void init_tio(struct dm_rq_target_io *tio, struct request *rq, struct mapped_device *md) { @@ -464,8 +360,6 @@ static void init_tio(struct dm_rq_target_io *tio, struct request *rq, */ if (!md->init_tio_pdu) memset(&tio->info, 0, sizeof(tio->info)); - if (md->kworker_task) - kthread_init_work(&tio->work, map_tio_request); } /* @@ -504,10 +398,7 @@ check_again: blk_rq_unprep_clone(clone); tio->ti->type->release_clone_rq(clone); tio->clone = NULL; - if (!rq->q->mq_ops) - r = DM_MAPIO_DELAY_REQUEUE; - else - r = DM_MAPIO_REQUEUE; + r = DM_MAPIO_REQUEUE; goto check_again; } break; @@ -530,20 +421,23 @@ check_again: return r; } +/* DEPRECATED: previously used for request-based merge heuristic in dm_request_fn() */ +ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf) +{ + return sprintf(buf, "%u\n", 0); +} + +ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md, + const char *buf, size_t count) +{ + return count; +} + static void dm_start_request(struct mapped_device *md, struct request *orig) { - if (!orig->q->mq_ops) - blk_start_request(orig); - else - blk_mq_start_request(orig); + blk_mq_start_request(orig); atomic_inc(&md->pending[rq_data_dir(orig)]); - if (md->seq_rq_merge_deadline_usecs) { - md->last_rq_pos = rq_end_sector(orig); - md->last_rq_rw = rq_data_dir(orig); - md->last_rq_start_time = ktime_get(); - } - if (unlikely(dm_stats_used(&md->stats))) { struct dm_rq_target_io *tio = tio_from_request(orig); tio->duration_jiffies = jiffies; @@ -563,8 +457,10 @@ static void dm_start_request(struct mapped_device *md, struct request *orig) dm_get(md); } -static int __dm_rq_init_rq(struct mapped_device *md, struct request *rq) +static int dm_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) { + struct mapped_device *md = set->driver_data; struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); /* @@ -581,163 +477,6 @@ static int __dm_rq_init_rq(struct mapped_device *md, struct request *rq) return 0; } -static int dm_rq_init_rq(struct request_queue *q, struct request *rq, gfp_t gfp) -{ - return __dm_rq_init_rq(q->rq_alloc_data, rq); -} - -static void map_tio_request(struct kthread_work *work) -{ - struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work); - - if (map_request(tio) == DM_MAPIO_REQUEUE) - dm_requeue_original_request(tio, false); -} - -ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf) -{ - return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs); -} - -#define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000 - -ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md, - const char *buf, size_t count) -{ - unsigned deadline; - - if (dm_get_md_type(md) != DM_TYPE_REQUEST_BASED) - return count; - - if (kstrtouint(buf, 10, &deadline)) - return -EINVAL; - - if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS) - deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS; - - md->seq_rq_merge_deadline_usecs = deadline; - - return count; -} - -static bool dm_old_request_peeked_before_merge_deadline(struct mapped_device *md) -{ - ktime_t kt_deadline; - - if (!md->seq_rq_merge_deadline_usecs) - return false; - - kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC); - kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline); - - return !ktime_after(ktime_get(), kt_deadline); -} - -/* - * q->request_fn for old request-based dm. - * Called with the queue lock held. - */ -static void dm_old_request_fn(struct request_queue *q) -{ - struct mapped_device *md = q->queuedata; - struct dm_target *ti = md->immutable_target; - struct request *rq; - struct dm_rq_target_io *tio; - sector_t pos = 0; - - if (unlikely(!ti)) { - int srcu_idx; - struct dm_table *map = dm_get_live_table(md, &srcu_idx); - - if (unlikely(!map)) { - dm_put_live_table(md, srcu_idx); - return; - } - ti = dm_table_find_target(map, pos); - dm_put_live_table(md, srcu_idx); - } - - /* - * For suspend, check blk_queue_stopped() and increment - * ->pending within a single queue_lock not to increment the - * number of in-flight I/Os after the queue is stopped in - * dm_suspend(). - */ - while (!blk_queue_stopped(q)) { - rq = blk_peek_request(q); - if (!rq) - return; - - /* always use block 0 to find the target for flushes for now */ - pos = 0; - if (req_op(rq) != REQ_OP_FLUSH) - pos = blk_rq_pos(rq); - - if ((dm_old_request_peeked_before_merge_deadline(md) && - md_in_flight(md) && rq->bio && !bio_multiple_segments(rq->bio) && - md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) || - (ti->type->busy && ti->type->busy(ti))) { - blk_delay_queue(q, 10); - return; - } - - dm_start_request(md, rq); - - tio = tio_from_request(rq); - init_tio(tio, rq, md); - /* Establish tio->ti before queuing work (map_tio_request) */ - tio->ti = ti; - kthread_queue_work(&md->kworker, &tio->work); - BUG_ON(!irqs_disabled()); - } -} - -/* - * Fully initialize a .request_fn request-based queue. - */ -int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t) -{ - struct dm_target *immutable_tgt; - - /* Fully initialize the queue */ - md->queue->cmd_size = sizeof(struct dm_rq_target_io); - md->queue->rq_alloc_data = md; - md->queue->request_fn = dm_old_request_fn; - md->queue->init_rq_fn = dm_rq_init_rq; - - immutable_tgt = dm_table_get_immutable_target(t); - if (immutable_tgt && immutable_tgt->per_io_data_size) { - /* any target-specific per-io data is immediately after the tio */ - md->queue->cmd_size += immutable_tgt->per_io_data_size; - md->init_tio_pdu = true; - } - if (blk_init_allocated_queue(md->queue) < 0) - return -EINVAL; - - /* disable dm_old_request_fn's merge heuristic by default */ - md->seq_rq_merge_deadline_usecs = 0; - - blk_queue_softirq_done(md->queue, dm_softirq_done); - - /* Initialize the request-based DM worker thread */ - kthread_init_worker(&md->kworker); - md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker, - "kdmwork-%s", dm_device_name(md)); - if (IS_ERR(md->kworker_task)) { - int error = PTR_ERR(md->kworker_task); - md->kworker_task = NULL; - return error; - } - - return 0; -} - -static int dm_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, - unsigned int hctx_idx, unsigned int numa_node) -{ - return __dm_rq_init_rq(set->driver_data, rq); -} - static blk_status_t dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -790,11 +529,6 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) struct dm_target *immutable_tgt; int err; - if (!dm_table_all_blk_mq_devices(t)) { - DMERR("request-based dm-mq may only be stacked on blk-mq device(s)"); - return -EINVAL; - } - md->tag_set = kzalloc_node(sizeof(struct blk_mq_tag_set), GFP_KERNEL, md->numa_node_id); if (!md->tag_set) return -ENOMEM; @@ -845,6 +579,8 @@ void dm_mq_cleanup_mapped_device(struct mapped_device *md) module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools"); +/* Unused, but preserved for userspace compatibility */ +static bool use_blk_mq = true; module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices"); diff --git a/drivers/md/dm-rq.h b/drivers/md/dm-rq.h index f43c45460aac..b39245545229 100644 --- a/drivers/md/dm-rq.h +++ b/drivers/md/dm-rq.h @@ -46,10 +46,6 @@ struct dm_rq_clone_bio_info { struct bio clone; }; -bool dm_use_blk_mq_default(void); -bool dm_use_blk_mq(struct mapped_device *md); - -int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t); int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t); void dm_mq_cleanup_mapped_device(struct mapped_device *md); diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index c209b8a19b84..a05fcd50e1b9 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c @@ -92,7 +92,8 @@ static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) static ssize_t dm_attr_use_blk_mq_show(struct mapped_device *md, char *buf) { - sprintf(buf, "%d\n", dm_use_blk_mq(md)); + /* Purely for userspace compatibility */ + sprintf(buf, "%d\n", true); return strlen(buf); } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index fb4bea20657b..9038c302d5c2 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -47,7 +47,6 @@ struct dm_table { bool integrity_supported:1; bool singleton:1; - bool all_blk_mq:1; unsigned integrity_added:1; /* @@ -872,8 +871,7 @@ static bool __table_type_bio_based(enum dm_queue_mode table_type) static bool __table_type_request_based(enum dm_queue_mode table_type) { - return (table_type == DM_TYPE_REQUEST_BASED || - table_type == DM_TYPE_MQ_REQUEST_BASED); + return table_type == DM_TYPE_REQUEST_BASED; } void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type) @@ -999,10 +997,6 @@ verify_bio_based: BUG_ON(!request_based); /* No targets in this table */ - /* - * The only way to establish DM_TYPE_MQ_REQUEST_BASED is by - * having a compatible target use dm_table_set_type. - */ t->type = DM_TYPE_REQUEST_BASED; verify_rq_based: @@ -1022,11 +1016,9 @@ verify_rq_based: int srcu_idx; struct dm_table *live_table = dm_get_live_table(t->md, &srcu_idx); - /* inherit live table's type and all_blk_mq */ - if (live_table) { + /* inherit live table's type */ + if (live_table) t->type = live_table->type; - t->all_blk_mq = live_table->all_blk_mq; - } dm_put_live_table(t->md, srcu_idx); return 0; } @@ -1046,17 +1038,10 @@ verify_rq_based: DMERR("table load rejected: including non-request-stackable devices"); return -EINVAL; } - if (v.sq_count && v.mq_count) { + if (v.sq_count > 0) { DMERR("table load rejected: not all devices are blk-mq request-stackable"); return -EINVAL; } - t->all_blk_mq = v.mq_count > 0; - - if (!t->all_blk_mq && - (t->type == DM_TYPE_MQ_REQUEST_BASED || t->type == DM_TYPE_NVME_BIO_BASED)) { - DMERR("table load rejected: all devices are not blk-mq request-stackable"); - return -EINVAL; - } return 0; } @@ -1105,11 +1090,6 @@ bool dm_table_request_based(struct dm_table *t) return __table_type_request_based(dm_table_get_type(t)); } -bool dm_table_all_blk_mq_devices(struct dm_table *t) -{ - return t->all_blk_mq; -} - static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) { enum dm_queue_mode type = dm_table_get_type(t); @@ -2089,26 +2069,24 @@ struct mapped_device *dm_table_get_md(struct dm_table *t) } EXPORT_SYMBOL(dm_table_get_md); +const char *dm_table_device_name(struct dm_table *t) +{ + return dm_device_name(t->md); +} +EXPORT_SYMBOL_GPL(dm_table_device_name); + void dm_table_run_md_queue_async(struct dm_table *t) { struct mapped_device *md; struct request_queue *queue; - unsigned long flags; if (!dm_table_request_based(t)) return; md = dm_table_get_md(t); queue = dm_get_md_queue(md); - if (queue) { - if (queue->mq_ops) - blk_mq_run_hw_queues(queue, true); - else { - spin_lock_irqsave(queue->queue_lock, flags); - blk_run_queue_async(queue); - spin_unlock_irqrestore(queue->queue_lock, flags); - } - } + if (queue) + blk_mq_run_hw_queues(queue, true); } EXPORT_SYMBOL(dm_table_run_md_queue_async); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index aaf1ad481ee8..0bd8d498b3b9 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -325,7 +325,7 @@ struct thin_c { * Ensures the thin is not destroyed until the worker has finished * iterating the active_thins list. */ - atomic_t refcount; + refcount_t refcount; struct completion can_destroy; }; @@ -4044,12 +4044,12 @@ static struct target_type pool_target = { *--------------------------------------------------------------*/ static void thin_get(struct thin_c *tc) { - atomic_inc(&tc->refcount); + refcount_inc(&tc->refcount); } static void thin_put(struct thin_c *tc) { - if (atomic_dec_and_test(&tc->refcount)) + if (refcount_dec_and_test(&tc->refcount)) complete(&tc->can_destroy); } @@ -4193,7 +4193,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) r = -EINVAL; goto bad; } - atomic_set(&tc->refcount, 1); + refcount_set(&tc->refcount, 1); init_completion(&tc->can_destroy); list_add_tail_rcu(&tc->list, &tc->pool->active_thins); spin_unlock_irqrestore(&tc->pool->lock, flags); diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 684af08d0747..0ce04e5b4afb 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -212,12 +212,15 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, struct dm_verity_fec_io *fio = fec_io(io); u64 block, ileaved; u8 *bbuf, *rs_block; - u8 want_digest[v->digest_size]; + u8 want_digest[HASH_MAX_DIGESTSIZE]; unsigned n, k; if (neras) *neras = 0; + if (WARN_ON(v->digest_size > sizeof(want_digest))) + return -EINVAL; + /* * read each of the rsn data blocks that are part of the RS block, and * interleave contents to available bufs diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 5f1f80d424dd..2d50eec94cd7 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -350,10 +350,7 @@ static struct wc_memory_superblock *sb(struct dm_writecache *wc) static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e) { - if (is_power_of_2(sizeof(struct wc_entry)) && 0) - return &sb(wc)->entries[e - wc->entries]; - else - return &sb(wc)->entries[e->index]; + return &sb(wc)->entries[e->index]; } static void *memory_data(struct dm_writecache *wc, struct wc_entry *e) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 969954915566..fa68336560c3 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -99,7 +99,7 @@ struct dmz_mblock { struct rb_node node; struct list_head link; sector_t no; - atomic_t ref; + unsigned int ref; unsigned long state; struct page *page; void *data; @@ -296,7 +296,7 @@ static struct dmz_mblock *dmz_alloc_mblock(struct dmz_metadata *zmd, RB_CLEAR_NODE(&mblk->node); INIT_LIST_HEAD(&mblk->link); - atomic_set(&mblk->ref, 0); + mblk->ref = 0; mblk->state = 0; mblk->no = mblk_no; mblk->data = page_address(mblk->page); @@ -339,10 +339,11 @@ static void dmz_insert_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk) } /* - * Lookup a metadata block in the rbtree. + * Lookup a metadata block in the rbtree. If the block is found, increment + * its reference count. */ -static struct dmz_mblock *dmz_lookup_mblock(struct dmz_metadata *zmd, - sector_t mblk_no) +static struct dmz_mblock *dmz_get_mblock_fast(struct dmz_metadata *zmd, + sector_t mblk_no) { struct rb_root *root = &zmd->mblk_rbtree; struct rb_node *node = root->rb_node; @@ -350,8 +351,17 @@ static struct dmz_mblock *dmz_lookup_mblock(struct dmz_metadata *zmd, while (node) { mblk = container_of(node, struct dmz_mblock, node); - if (mblk->no == mblk_no) + if (mblk->no == mblk_no) { + /* + * If this is the first reference to the block, + * remove it from the LRU list. + */ + mblk->ref++; + if (mblk->ref == 1 && + !test_bit(DMZ_META_DIRTY, &mblk->state)) + list_del_init(&mblk->link); return mblk; + } node = (mblk->no < mblk_no) ? node->rb_left : node->rb_right; } @@ -382,32 +392,47 @@ static void dmz_mblock_bio_end_io(struct bio *bio) } /* - * Read a metadata block from disk. + * Read an uncached metadata block from disk and add it to the cache. */ -static struct dmz_mblock *dmz_fetch_mblock(struct dmz_metadata *zmd, - sector_t mblk_no) +static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd, + sector_t mblk_no) { - struct dmz_mblock *mblk; + struct dmz_mblock *mblk, *m; sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no; struct bio *bio; - /* Get block and insert it */ + /* Get a new block and a BIO to read it */ mblk = dmz_alloc_mblock(zmd, mblk_no); if (!mblk) return NULL; - spin_lock(&zmd->mblk_lock); - atomic_inc(&mblk->ref); - set_bit(DMZ_META_READING, &mblk->state); - dmz_insert_mblock(zmd, mblk); - spin_unlock(&zmd->mblk_lock); - bio = bio_alloc(GFP_NOIO, 1); if (!bio) { dmz_free_mblock(zmd, mblk); return NULL; } + spin_lock(&zmd->mblk_lock); + + /* + * Make sure that another context did not start reading + * the block already. + */ + m = dmz_get_mblock_fast(zmd, mblk_no); + if (m) { + spin_unlock(&zmd->mblk_lock); + dmz_free_mblock(zmd, mblk); + bio_put(bio); + return m; + } + + mblk->ref++; + set_bit(DMZ_META_READING, &mblk->state); + dmz_insert_mblock(zmd, mblk); + + spin_unlock(&zmd->mblk_lock); + + /* Submit read BIO */ bio->bi_iter.bi_sector = dmz_blk2sect(block); bio_set_dev(bio, zmd->dev->bdev); bio->bi_private = mblk; @@ -484,7 +509,8 @@ static void dmz_release_mblock(struct dmz_metadata *zmd, spin_lock(&zmd->mblk_lock); - if (atomic_dec_and_test(&mblk->ref)) { + mblk->ref--; + if (mblk->ref == 0) { if (test_bit(DMZ_META_ERROR, &mblk->state)) { rb_erase(&mblk->node, &zmd->mblk_rbtree); dmz_free_mblock(zmd, mblk); @@ -508,18 +534,12 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd, /* Check rbtree */ spin_lock(&zmd->mblk_lock); - mblk = dmz_lookup_mblock(zmd, mblk_no); - if (mblk) { - /* Cache hit: remove block from LRU list */ - if (atomic_inc_return(&mblk->ref) == 1 && - !test_bit(DMZ_META_DIRTY, &mblk->state)) - list_del_init(&mblk->link); - } + mblk = dmz_get_mblock_fast(zmd, mblk_no); spin_unlock(&zmd->mblk_lock); if (!mblk) { /* Cache miss: read the block from disk */ - mblk = dmz_fetch_mblock(zmd, mblk_no); + mblk = dmz_get_mblock_slow(zmd, mblk_no); if (!mblk) return ERR_PTR(-ENOMEM); } @@ -753,7 +773,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) spin_lock(&zmd->mblk_lock); clear_bit(DMZ_META_DIRTY, &mblk->state); - if (atomic_read(&mblk->ref) == 0) + if (mblk->ref == 0) list_add_tail(&mblk->link, &zmd->mblk_lru_list); spin_unlock(&zmd->mblk_lock); } @@ -2308,7 +2328,7 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd) mblk = list_first_entry(&zmd->mblk_dirty_list, struct dmz_mblock, link); dmz_dev_warn(zmd->dev, "mblock %llu still in dirty list (ref %u)", - (u64)mblk->no, atomic_read(&mblk->ref)); + (u64)mblk->no, mblk->ref); list_del_init(&mblk->link); rb_erase(&mblk->node, &zmd->mblk_rbtree); dmz_free_mblock(zmd, mblk); @@ -2326,8 +2346,8 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd) root = &zmd->mblk_rbtree; rbtree_postorder_for_each_entry_safe(mblk, next, root, node) { dmz_dev_warn(zmd->dev, "mblock %llu ref %u still in rbtree", - (u64)mblk->no, atomic_read(&mblk->ref)); - atomic_set(&mblk->ref, 0); + (u64)mblk->no, mblk->ref); + mblk->ref = 0; dmz_free_mblock(zmd, mblk); } diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 12d96a263623..981154e59461 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -19,7 +19,7 @@ struct dmz_bioctx { struct dmz_target *target; struct dm_zone *zone; struct bio *bio; - atomic_t ref; + refcount_t ref; blk_status_t status; }; @@ -28,7 +28,7 @@ struct dmz_bioctx { */ struct dm_chunk_work { struct work_struct work; - atomic_t refcount; + refcount_t refcount; struct dmz_target *target; unsigned int chunk; struct bio_list bio_list; @@ -115,7 +115,7 @@ static int dmz_submit_read_bio(struct dmz_target *dmz, struct dm_zone *zone, if (nr_blocks == dmz_bio_blocks(bio)) { /* Setup and submit the BIO */ bio->bi_iter.bi_sector = sector; - atomic_inc(&bioctx->ref); + refcount_inc(&bioctx->ref); generic_make_request(bio); return 0; } @@ -134,7 +134,7 @@ static int dmz_submit_read_bio(struct dmz_target *dmz, struct dm_zone *zone, bio_advance(bio, clone->bi_iter.bi_size); /* Submit the clone */ - atomic_inc(&bioctx->ref); + refcount_inc(&bioctx->ref); generic_make_request(clone); return 0; @@ -240,7 +240,7 @@ static void dmz_submit_write_bio(struct dmz_target *dmz, struct dm_zone *zone, /* Setup and submit the BIO */ bio_set_dev(bio, dmz->dev->bdev); bio->bi_iter.bi_sector = dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block); - atomic_inc(&bioctx->ref); + refcount_inc(&bioctx->ref); generic_make_request(bio); if (dmz_is_seq(zone)) @@ -456,7 +456,7 @@ out: */ static inline void dmz_get_chunk_work(struct dm_chunk_work *cw) { - atomic_inc(&cw->refcount); + refcount_inc(&cw->refcount); } /* @@ -465,7 +465,7 @@ static inline void dmz_get_chunk_work(struct dm_chunk_work *cw) */ static void dmz_put_chunk_work(struct dm_chunk_work *cw) { - if (atomic_dec_and_test(&cw->refcount)) { + if (refcount_dec_and_test(&cw->refcount)) { WARN_ON(!bio_list_empty(&cw->bio_list)); radix_tree_delete(&cw->target->chunk_rxtree, cw->chunk); kfree(cw); @@ -546,7 +546,7 @@ static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio) goto out; INIT_WORK(&cw->work, dmz_chunk_work); - atomic_set(&cw->refcount, 0); + refcount_set(&cw->refcount, 0); cw->target = dmz; cw->chunk = chunk; bio_list_init(&cw->bio_list); @@ -599,7 +599,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) bioctx->target = dmz; bioctx->zone = NULL; bioctx->bio = bio; - atomic_set(&bioctx->ref, 1); + refcount_set(&bioctx->ref, 1); bioctx->status = BLK_STS_OK; /* Set the BIO pending in the flush list */ @@ -633,7 +633,7 @@ static int dmz_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error if (bioctx->status == BLK_STS_OK && *error) bioctx->status = *error; - if (!atomic_dec_and_test(&bioctx->ref)) + if (!refcount_dec_and_test(&bioctx->ref)) return DM_ENDIO_INCOMPLETE; /* Done */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 6be21dc210a1..c510179a7f84 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1664,7 +1664,7 @@ static blk_qc_t __process_bio(struct mapped_device *md, * Defend against IO still getting in during teardown * - as was seen for a time with nvme-fcloop */ - if (unlikely(WARN_ON_ONCE(!ti || !dm_target_is_valid(ti)))) { + if (WARN_ON_ONCE(!ti || !dm_target_is_valid(ti))) { error = -EIO; goto out; } @@ -1806,8 +1806,6 @@ static void dm_wq_work(struct work_struct *work); static void dm_init_normal_md_queue(struct mapped_device *md) { - md->use_blk_mq = false; - /* * Initialize aspects of queue that aren't relevant for blk-mq */ @@ -1818,8 +1816,6 @@ static void cleanup_mapped_device(struct mapped_device *md) { if (md->wq) destroy_workqueue(md->wq); - if (md->kworker_task) - kthread_stop(md->kworker_task); bioset_exit(&md->bs); bioset_exit(&md->io_bs); @@ -1886,7 +1882,6 @@ static struct mapped_device *alloc_dev(int minor) goto bad_io_barrier; md->numa_node_id = numa_node_id; - md->use_blk_mq = dm_use_blk_mq_default(); md->init_tio_pdu = false; md->type = DM_TYPE_NONE; mutex_init(&md->suspend_lock); @@ -1917,7 +1912,6 @@ static struct mapped_device *alloc_dev(int minor) INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); init_completion(&md->kobj_holder.completion); - md->kworker_task = NULL; md->disk->major = _major; md->disk->first_minor = minor; @@ -2217,14 +2211,6 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) switch (type) { case DM_TYPE_REQUEST_BASED: - dm_init_normal_md_queue(md); - r = dm_old_init_request_queue(md, t); - if (r) { - DMERR("Cannot initialize queue for request-based mapped device"); - return r; - } - break; - case DM_TYPE_MQ_REQUEST_BASED: r = dm_mq_init_request_queue(md, t); if (r) { DMERR("Cannot initialize queue for request-based dm-mq mapped device"); @@ -2329,9 +2315,6 @@ static void __dm_destroy(struct mapped_device *md, bool wait) blk_set_queue_dying(md->queue); - if (dm_request_based(md) && md->kworker_task) - kthread_flush_worker(&md->kworker); - /* * Take suspend_lock so that presuspend and postsuspend methods * do not race with internal suspend. @@ -2584,11 +2567,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, * Stop md->queue before flushing md->wq in case request-based * dm defers requests to md->wq from md->queue. */ - if (dm_request_based(md)) { + if (dm_request_based(md)) dm_stop_queue(md->queue); - if (md->kworker_task) - kthread_flush_worker(&md->kworker); - } flush_workqueue(md->wq); @@ -2963,7 +2943,6 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu goto out; break; case DM_TYPE_REQUEST_BASED: - case DM_TYPE_MQ_REQUEST_BASED: pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size); front_pad = offsetof(struct dm_rq_clone_bio_info, clone); /* per_io_data_size is used for blk-mq pdu at queue allocation */ diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 114a81b27c37..2d539b82ec08 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -70,7 +70,6 @@ struct dm_target *dm_table_get_immutable_target(struct dm_table *t); struct dm_target *dm_table_get_wildcard_target(struct dm_table *t); bool dm_table_bio_based(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); -bool dm_table_all_blk_mq_devices(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 2fc8c113977f..1cd4f991792c 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2288,9 +2288,9 @@ location_store(struct mddev *mddev, const char *buf, size_t len) goto out; } if (mddev->pers) { - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); md_bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); } mddev->bitmap_info.offset = 0; if (mddev->bitmap_info.file) { @@ -2327,8 +2327,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len) mddev->bitmap_info.offset = offset; if (mddev->pers) { struct bitmap *bitmap; - mddev->pers->quiesce(mddev, 1); bitmap = md_bitmap_create(mddev, -1); + mddev_suspend(mddev); if (IS_ERR(bitmap)) rv = PTR_ERR(bitmap); else { @@ -2337,11 +2337,12 @@ location_store(struct mddev *mddev, const char *buf, size_t len) if (rv) mddev->bitmap_info.offset = 0; } - mddev->pers->quiesce(mddev, 0); if (rv) { md_bitmap_destroy(mddev); + mddev_resume(mddev); goto out; } + mddev_resume(mddev); } } } diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 0b2af6e74fc3..8dff19d5502e 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -33,13 +33,6 @@ struct dlm_lock_resource { int mode; }; -struct suspend_info { - int slot; - sector_t lo; - sector_t hi; - struct list_head list; -}; - struct resync_info { __le64 lo; __le64 hi; @@ -80,7 +73,13 @@ struct md_cluster_info { struct dlm_lock_resource **other_bitmap_lockres; struct dlm_lock_resource *resync_lockres; struct list_head suspend_list; + spinlock_t suspend_lock; + /* record the region which write should be suspended */ + sector_t suspend_lo; + sector_t suspend_hi; + int suspend_from; /* the slot which broadcast suspend_lo/hi */ + struct md_thread *recovery_thread; unsigned long recovery_map; /* communication loc resources */ @@ -105,6 +104,7 @@ enum msg_type { RE_ADD, BITMAP_NEEDS_SYNC, CHANGE_CAPACITY, + BITMAP_RESIZE, }; struct cluster_msg { @@ -270,25 +270,22 @@ static void add_resync_info(struct dlm_lock_resource *lockres, ri->hi = cpu_to_le64(hi); } -static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres) +static int read_resync_info(struct mddev *mddev, + struct dlm_lock_resource *lockres) { struct resync_info ri; - struct suspend_info *s = NULL; - sector_t hi = 0; + struct md_cluster_info *cinfo = mddev->cluster_info; + int ret = 0; dlm_lock_sync(lockres, DLM_LOCK_CR); memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info)); - hi = le64_to_cpu(ri.hi); - if (hi > 0) { - s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); - if (!s) - goto out; - s->hi = hi; - s->lo = le64_to_cpu(ri.lo); + if (le64_to_cpu(ri.hi) > 0) { + cinfo->suspend_hi = le64_to_cpu(ri.hi); + cinfo->suspend_lo = le64_to_cpu(ri.lo); + ret = 1; } dlm_unlock_sync(lockres); -out: - return s; + return ret; } static void recover_bitmaps(struct md_thread *thread) @@ -298,7 +295,6 @@ static void recover_bitmaps(struct md_thread *thread) struct dlm_lock_resource *bm_lockres; char str[64]; int slot, ret; - struct suspend_info *s, *tmp; sector_t lo, hi; while (cinfo->recovery_map) { @@ -325,13 +321,17 @@ static void recover_bitmaps(struct md_thread *thread) /* Clear suspend_area associated with the bitmap */ spin_lock_irq(&cinfo->suspend_lock); - list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) - if (slot == s->slot) { - list_del(&s->list); - kfree(s); - } + cinfo->suspend_hi = 0; + cinfo->suspend_lo = 0; + cinfo->suspend_from = -1; spin_unlock_irq(&cinfo->suspend_lock); + /* Kick off a reshape if needed */ + if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + mddev->reshape_position != MaxSector) + md_wakeup_thread(mddev->sync_thread); + if (hi > 0) { if (lo < mddev->recovery_cp) mddev->recovery_cp = lo; @@ -434,34 +434,23 @@ static void ack_bast(void *arg, int mode) } } -static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) -{ - struct suspend_info *s, *tmp; - - list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) - if (slot == s->slot) { - list_del(&s->list); - kfree(s); - break; - } -} - static void remove_suspend_info(struct mddev *mddev, int slot) { struct md_cluster_info *cinfo = mddev->cluster_info; mddev->pers->quiesce(mddev, 1); spin_lock_irq(&cinfo->suspend_lock); - __remove_suspend_info(cinfo, slot); + cinfo->suspend_hi = 0; + cinfo->suspend_lo = 0; spin_unlock_irq(&cinfo->suspend_lock); mddev->pers->quiesce(mddev, 0); } - static void process_suspend_info(struct mddev *mddev, int slot, sector_t lo, sector_t hi) { struct md_cluster_info *cinfo = mddev->cluster_info; - struct suspend_info *s; + struct mdp_superblock_1 *sb = NULL; + struct md_rdev *rdev; if (!hi) { /* @@ -475,6 +464,12 @@ static void process_suspend_info(struct mddev *mddev, return; } + rdev_for_each(rdev, mddev) + if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) { + sb = page_address(rdev->sb_page); + break; + } + /* * The bitmaps are not same for different nodes * if RESYNCING is happening in one node, then @@ -487,26 +482,26 @@ static void process_suspend_info(struct mddev *mddev, * sync_low/hi is used to record the region which * arrived in the previous RESYNCING message, * - * Call bitmap_sync_with_cluster to clear - * NEEDED_MASK and set RESYNC_MASK since - * resync thread is running in another node, - * so we don't need to do the resync again - * with the same section */ - md_bitmap_sync_with_cluster(mddev, cinfo->sync_low, cinfo->sync_hi, lo, hi); + * Call md_bitmap_sync_with_cluster to clear NEEDED_MASK + * and set RESYNC_MASK since resync thread is running + * in another node, so we don't need to do the resync + * again with the same section. + * + * Skip md_bitmap_sync_with_cluster in case reshape + * happening, because reshaping region is small and + * we don't want to trigger lots of WARN. + */ + if (sb && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) + md_bitmap_sync_with_cluster(mddev, cinfo->sync_low, + cinfo->sync_hi, lo, hi); cinfo->sync_low = lo; cinfo->sync_hi = hi; - s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); - if (!s) - return; - s->slot = slot; - s->lo = lo; - s->hi = hi; mddev->pers->quiesce(mddev, 1); spin_lock_irq(&cinfo->suspend_lock); - /* Remove existing entry (if exists) before adding */ - __remove_suspend_info(cinfo, slot); - list_add(&s->list, &cinfo->suspend_list); + cinfo->suspend_from = slot; + cinfo->suspend_lo = lo; + cinfo->suspend_hi = hi; spin_unlock_irq(&cinfo->suspend_lock); mddev->pers->quiesce(mddev, 0); } @@ -612,6 +607,11 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) case BITMAP_NEEDS_SYNC: __recover_slot(mddev, le32_to_cpu(msg->slot)); break; + case BITMAP_RESIZE: + if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0)) + ret = md_bitmap_resize(mddev->bitmap, + le64_to_cpu(msg->high), 0, 0); + break; default: ret = -1; pr_warn("%s:%d Received unknown message from %d\n", @@ -800,7 +800,6 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) struct md_cluster_info *cinfo = mddev->cluster_info; int i, ret = 0; struct dlm_lock_resource *bm_lockres; - struct suspend_info *s; char str[64]; sector_t lo, hi; @@ -819,16 +818,13 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) bm_lockres->flags |= DLM_LKF_NOQUEUE; ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); if (ret == -EAGAIN) { - s = read_resync_info(mddev, bm_lockres); - if (s) { + if (read_resync_info(mddev, bm_lockres)) { pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n", __func__, __LINE__, - (unsigned long long) s->lo, - (unsigned long long) s->hi, i); - spin_lock_irq(&cinfo->suspend_lock); - s->slot = i; - list_add(&s->list, &cinfo->suspend_list); - spin_unlock_irq(&cinfo->suspend_lock); + (unsigned long long) cinfo->suspend_lo, + (unsigned long long) cinfo->suspend_hi, + i); + cinfo->suspend_from = i; } ret = 0; lockres_free(bm_lockres); @@ -1001,10 +997,17 @@ static int leave(struct mddev *mddev) if (!cinfo) return 0; - /* BITMAP_NEEDS_SYNC message should be sent when node + /* + * BITMAP_NEEDS_SYNC message should be sent when node * is leaving the cluster with dirty bitmap, also we - * can only deliver it when dlm connection is available */ - if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) + * can only deliver it when dlm connection is available. + * + * Also, we should send BITMAP_NEEDS_SYNC message in + * case reshaping is interrupted. + */ + if ((cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) || + (mddev->reshape_position != MaxSector && + test_bit(MD_CLOSING, &mddev->flags))) resync_bitmap(mddev); set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); @@ -1102,6 +1105,80 @@ static void metadata_update_cancel(struct mddev *mddev) unlock_comm(cinfo); } +static int update_bitmap_size(struct mddev *mddev, sector_t size) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + struct cluster_msg cmsg = {0}; + int ret; + + cmsg.type = cpu_to_le32(BITMAP_RESIZE); + cmsg.high = cpu_to_le64(size); + ret = sendmsg(cinfo, &cmsg, 0); + if (ret) + pr_err("%s:%d: failed to send BITMAP_RESIZE message (%d)\n", + __func__, __LINE__, ret); + return ret; +} + +static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsize) +{ + struct bitmap_counts *counts; + char str[64]; + struct dlm_lock_resource *bm_lockres; + struct bitmap *bitmap = mddev->bitmap; + unsigned long my_pages = bitmap->counts.pages; + int i, rv; + + /* + * We need to ensure all the nodes can grow to a larger + * bitmap size before make the reshaping. + */ + rv = update_bitmap_size(mddev, newsize); + if (rv) + return rv; + + for (i = 0; i < mddev->bitmap_info.nodes; i++) { + if (i == md_cluster_ops->slot_number(mddev)) + continue; + + bitmap = get_bitmap_from_slot(mddev, i); + if (IS_ERR(bitmap)) { + pr_err("can't get bitmap from slot %d\n", i); + goto out; + } + counts = &bitmap->counts; + + /* + * If we can hold the bitmap lock of one node then + * the slot is not occupied, update the pages. + */ + snprintf(str, 64, "bitmap%04d", i); + bm_lockres = lockres_init(mddev, str, NULL, 1); + if (!bm_lockres) { + pr_err("Cannot initialize %s lock\n", str); + goto out; + } + bm_lockres->flags |= DLM_LKF_NOQUEUE; + rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); + if (!rv) + counts->pages = my_pages; + lockres_free(bm_lockres); + + if (my_pages != counts->pages) + /* + * Let's revert the bitmap size if one node + * can't resize bitmap + */ + goto out; + } + + return 0; +out: + md_bitmap_free(bitmap); + update_bitmap_size(mddev, oldsize); + return -1; +} + /* * return 0 if all the bitmaps have the same sync_size */ @@ -1243,6 +1320,16 @@ static int resync_start(struct mddev *mddev) return dlm_lock_sync_interruptible(cinfo->resync_lockres, DLM_LOCK_EX, mddev); } +static void resync_info_get(struct mddev *mddev, sector_t *lo, sector_t *hi) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + + spin_lock_irq(&cinfo->suspend_lock); + *lo = cinfo->suspend_lo; + *hi = cinfo->suspend_hi; + spin_unlock_irq(&cinfo->suspend_lock); +} + static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) { struct md_cluster_info *cinfo = mddev->cluster_info; @@ -1295,21 +1382,14 @@ static int area_resyncing(struct mddev *mddev, int direction, { struct md_cluster_info *cinfo = mddev->cluster_info; int ret = 0; - struct suspend_info *s; if ((direction == READ) && test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state)) return 1; spin_lock_irq(&cinfo->suspend_lock); - if (list_empty(&cinfo->suspend_list)) - goto out; - list_for_each_entry(s, &cinfo->suspend_list, list) - if (hi > s->lo && lo < s->hi) { - ret = 1; - break; - } -out: + if (hi > cinfo->suspend_lo && lo < cinfo->suspend_hi) + ret = 1; spin_unlock_irq(&cinfo->suspend_lock); return ret; } @@ -1482,6 +1562,7 @@ static struct md_cluster_operations cluster_ops = { .resync_start = resync_start, .resync_finish = resync_finish, .resync_info_update = resync_info_update, + .resync_info_get = resync_info_get, .metadata_update_start = metadata_update_start, .metadata_update_finish = metadata_update_finish, .metadata_update_cancel = metadata_update_cancel, @@ -1492,6 +1573,7 @@ static struct md_cluster_operations cluster_ops = { .remove_disk = remove_disk, .load_bitmaps = load_bitmaps, .gather_bitmaps = gather_bitmaps, + .resize_bitmaps = resize_bitmaps, .lock_all_bitmaps = lock_all_bitmaps, .unlock_all_bitmaps = unlock_all_bitmaps, .update_size = update_size, diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index c0240708f443..a78e3021775d 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -14,6 +14,7 @@ struct md_cluster_operations { int (*leave)(struct mddev *mddev); int (*slot_number)(struct mddev *mddev); int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); + void (*resync_info_get)(struct mddev *mddev, sector_t *lo, sector_t *hi); int (*metadata_update_start)(struct mddev *mddev); int (*metadata_update_finish)(struct mddev *mddev); void (*metadata_update_cancel)(struct mddev *mddev); @@ -26,6 +27,7 @@ struct md_cluster_operations { int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); void (*load_bitmaps)(struct mddev *mddev, int total_slots); int (*gather_bitmaps)(struct md_rdev *rdev); + int (*resize_bitmaps)(struct mddev *mddev, sector_t newsize, sector_t oldsize); int (*lock_all_bitmaps)(struct mddev *mddev); void (*unlock_all_bitmaps)(struct mddev *mddev); void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors); diff --git a/drivers/md/md.c b/drivers/md/md.c index 63ceabb4e020..fc488cb30a94 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -452,10 +452,11 @@ static void md_end_flush(struct bio *fbio) rdev_dec_pending(rdev, mddev); if (atomic_dec_and_test(&fi->flush_pending)) { - if (bio->bi_iter.bi_size == 0) + if (bio->bi_iter.bi_size == 0) { /* an empty barrier - all done */ bio_endio(bio); - else { + mempool_free(fi, mddev->flush_pool); + } else { INIT_WORK(&fi->flush_work, submit_flushes); queue_work(md_wq, &fi->flush_work); } @@ -509,10 +510,11 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) rcu_read_unlock(); if (atomic_dec_and_test(&fi->flush_pending)) { - if (bio->bi_iter.bi_size == 0) + if (bio->bi_iter.bi_size == 0) { /* an empty barrier - all done */ bio_endio(bio); - else { + mempool_free(fi, mddev->flush_pool); + } else { INIT_WORK(&fi->flush_work, submit_flushes); queue_work(md_wq, &fi->flush_work); } @@ -5904,14 +5906,6 @@ static void __md_stop(struct mddev *mddev) mddev->to_remove = &md_redundancy_group; module_put(pers->owner); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); -} - -void md_stop(struct mddev *mddev) -{ - /* stop the array and free an attached data structures. - * This is called from dm-raid - */ - __md_stop(mddev); if (mddev->flush_bio_pool) { mempool_destroy(mddev->flush_bio_pool); mddev->flush_bio_pool = NULL; @@ -5920,6 +5914,14 @@ void md_stop(struct mddev *mddev) mempool_destroy(mddev->flush_pool); mddev->flush_pool = NULL; } +} + +void md_stop(struct mddev *mddev) +{ + /* stop the array and free an attached data structures. + * This is called from dm-raid + */ + __md_stop(mddev); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); } @@ -8370,9 +8372,17 @@ void md_do_sync(struct md_thread *thread) else if (!mddev->bitmap) j = mddev->recovery_cp; - } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { max_sectors = mddev->resync_max_sectors; - else { + /* + * If the original node aborts reshaping then we continue the + * reshaping, so set j again to avoid restart reshape from the + * first beginning + */ + if (mddev_is_clustered(mddev) && + mddev->reshape_position != MaxSector) + j = mddev->reshape_position; + } else { /* recovery follows the physical size of devices */ max_sectors = mddev->dev_sectors; j = MaxSector; @@ -8623,8 +8633,10 @@ void md_do_sync(struct md_thread *thread) mddev_lock_nointr(mddev); md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); mddev_unlock(mddev); - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); + if (!mddev_is_clustered(mddev)) { + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } } spin_lock(&mddev->lock); @@ -8790,6 +8802,18 @@ static void md_start_sync(struct work_struct *ws) */ void md_check_recovery(struct mddev *mddev) { + if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) { + /* Write superblock - thread that called mddev_suspend() + * holds reconfig_mutex for us. + */ + set_bit(MD_UPDATING_SB, &mddev->flags); + smp_mb__after_atomic(); + if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags)) + md_update_sb(mddev, 0); + clear_bit_unlock(MD_UPDATING_SB, &mddev->flags); + wake_up(&mddev->sb_wait); + } + if (mddev->suspended) return; @@ -8949,16 +8973,6 @@ void md_check_recovery(struct mddev *mddev) unlock: wake_up(&mddev->sb_wait); mddev_unlock(mddev); - } else if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) { - /* Write superblock - thread that called mddev_suspend() - * holds reconfig_mutex for us. - */ - set_bit(MD_UPDATING_SB, &mddev->flags); - smp_mb__after_atomic(); - if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags)) - md_update_sb(mddev, 0); - clear_bit_unlock(MD_UPDATING_SB, &mddev->flags); - wake_up(&mddev->sb_wait); } } EXPORT_SYMBOL(md_check_recovery); @@ -8966,6 +8980,8 @@ EXPORT_SYMBOL(md_check_recovery); void md_reap_sync_thread(struct mddev *mddev) { struct md_rdev *rdev; + sector_t old_dev_sectors = mddev->dev_sectors; + bool is_reshaped = false; /* resync has finished, collect result */ md_unregister_thread(&mddev->sync_thread); @@ -8980,8 +8996,11 @@ void md_reap_sync_thread(struct mddev *mddev) } } if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - mddev->pers->finish_reshape) + mddev->pers->finish_reshape) { mddev->pers->finish_reshape(mddev); + if (mddev_is_clustered(mddev)) + is_reshaped = true; + } /* If array is no-longer degraded, then any saved_raid_disk * information must be scrapped. @@ -9002,6 +9021,14 @@ void md_reap_sync_thread(struct mddev *mddev) clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + /* + * We call md_cluster_ops->update_size here because sync_size could + * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, + * so it is time to update size across cluster. + */ + if (mddev_is_clustered(mddev) && is_reshaped + && !test_bit(MD_CLOSING, &mddev->flags)) + md_cluster_ops->update_size(mddev, old_dev_sectors); wake_up(&resync_wait); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -9201,8 +9228,12 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) } if (role != rdev2->raid_disk) { - /* got activated */ - if (rdev2->raid_disk == -1 && role != 0xffff) { + /* + * got activated except reshape is happening. + */ + if (rdev2->raid_disk == -1 && role != 0xffff && + !(le32_to_cpu(sb->feature_map) & + MD_FEATURE_RESHAPE_ACTIVE)) { rdev2->saved_raid_disk = role; ret = remove_and_add_spares(mddev, rdev2); pr_info("Activated spare: %s\n", @@ -9228,6 +9259,30 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); + /* + * Since mddev->delta_disks has already updated in update_raid_disks, + * so it is time to check reshape. + */ + if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && + (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { + /* + * reshape is happening in the remote node, we need to + * update reshape_position and call start_reshape. + */ + mddev->reshape_position = sb->reshape_position; + if (mddev->pers->update_reshape_pos) + mddev->pers->update_reshape_pos(mddev); + if (mddev->pers->start_reshape) + mddev->pers->start_reshape(mddev); + } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && + mddev->reshape_position != MaxSector && + !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { + /* reshape is just done in another node. */ + mddev->reshape_position = MaxSector; + if (mddev->pers->update_reshape_pos) + mddev->pers->update_reshape_pos(mddev); + } + /* Finally set the event to be up to date */ mddev->events = le64_to_cpu(sb->events); } diff --git a/drivers/md/md.h b/drivers/md/md.h index 8afd6bfdbfb9..c52afb52c776 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -557,6 +557,7 @@ struct md_personality int (*check_reshape) (struct mddev *mddev); int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); + void (*update_reshape_pos) (struct mddev *mddev); /* quiesce suspends or resumes internal processing. * 1 - stop new actions and wait for action io to complete * 0 - return to normal behaviour diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 4e990246225e..1d54109071cc 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1734,6 +1734,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) */ if (rdev->saved_raid_disk >= 0 && rdev->saved_raid_disk >= first && + rdev->saved_raid_disk < conf->raid_disks && conf->mirrors[rdev->saved_raid_disk].rdev == NULL) first = last = rdev->saved_raid_disk; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d6f7978b4449..b98e746e7fc4 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -25,6 +25,7 @@ #include <linux/seq_file.h> #include <linux/ratelimit.h> #include <linux/kthread.h> +#include <linux/raid/md_p.h> #include <trace/events/block.h> #include "md.h" #include "raid10.h" @@ -1808,6 +1809,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) first = last = rdev->raid_disk; if (rdev->saved_raid_disk >= first && + rdev->saved_raid_disk < conf->geo.raid_disks && conf->mirrors[rdev->saved_raid_disk].rdev == NULL) mirror = rdev->saved_raid_disk; else @@ -3079,6 +3081,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, sector_t sect; int must_sync; int any_working; + int need_recover = 0; + int need_replace = 0; struct raid10_info *mirror = &conf->mirrors[i]; struct md_rdev *mrdev, *mreplace; @@ -3086,11 +3090,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, mrdev = rcu_dereference(mirror->rdev); mreplace = rcu_dereference(mirror->replacement); - if ((mrdev == NULL || - test_bit(Faulty, &mrdev->flags) || - test_bit(In_sync, &mrdev->flags)) && - (mreplace == NULL || - test_bit(Faulty, &mreplace->flags))) { + if (mrdev != NULL && + !test_bit(Faulty, &mrdev->flags) && + !test_bit(In_sync, &mrdev->flags)) + need_recover = 1; + if (mreplace != NULL && + !test_bit(Faulty, &mreplace->flags)) + need_replace = 1; + + if (!need_recover && !need_replace) { rcu_read_unlock(); continue; } @@ -3213,7 +3221,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->devs[1].devnum = i; r10_bio->devs[1].addr = to_addr; - if (!test_bit(In_sync, &mrdev->flags)) { + if (need_recover) { bio = r10_bio->devs[1].bio; bio->bi_next = biolist; biolist = bio; @@ -3230,16 +3238,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r10_bio->devs[1].repl_bio; if (bio) bio->bi_end_io = NULL; - /* Note: if mreplace != NULL, then bio + /* Note: if need_replace, then bio * cannot be NULL as r10buf_pool_alloc will * have allocated it. - * So the second test here is pointless. - * But it keeps semantic-checkers happy, and - * this comment keeps human reviewers - * happy. */ - if (mreplace == NULL || bio == NULL || - test_bit(Faulty, &mreplace->flags)) + if (!need_replace) break; bio->bi_next = biolist; biolist = bio; @@ -4286,12 +4289,46 @@ static int raid10_start_reshape(struct mddev *mddev) spin_unlock_irq(&conf->device_lock); if (mddev->delta_disks && mddev->bitmap) { - ret = md_bitmap_resize(mddev->bitmap, - raid10_size(mddev, 0, conf->geo.raid_disks), - 0, 0); + struct mdp_superblock_1 *sb = NULL; + sector_t oldsize, newsize; + + oldsize = raid10_size(mddev, 0, 0); + newsize = raid10_size(mddev, 0, conf->geo.raid_disks); + + if (!mddev_is_clustered(mddev)) { + ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); + if (ret) + goto abort; + else + goto out; + } + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk > -1 && + !test_bit(Faulty, &rdev->flags)) + sb = page_address(rdev->sb_page); + } + + /* + * some node is already performing reshape, and no need to + * call md_bitmap_resize again since it should be called when + * receiving BITMAP_RESIZE msg + */ + if ((sb && (le32_to_cpu(sb->feature_map) & + MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize)) + goto out; + + ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); if (ret) goto abort; + + ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize); + if (ret) { + md_bitmap_resize(mddev->bitmap, oldsize, 0, 0); + goto abort; + } } +out: if (mddev->delta_disks > 0) { rdev_for_each(rdev, mddev) if (rdev->raid_disk < 0 && @@ -4568,6 +4605,32 @@ read_more: r10_bio->master_bio = read_bio; r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; + /* + * Broadcast RESYNC message to other nodes, so all nodes would not + * write to the region to avoid conflict. + */ + if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) { + struct mdp_superblock_1 *sb = NULL; + int sb_reshape_pos = 0; + + conf->cluster_sync_low = sector_nr; + conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS; + sb = page_address(rdev->sb_page); + if (sb) { + sb_reshape_pos = le64_to_cpu(sb->reshape_position); + /* + * Set cluster_sync_low again if next address for array + * reshape is less than cluster_sync_low. Since we can't + * update cluster_sync_low until it has finished reshape. + */ + if (sb_reshape_pos < conf->cluster_sync_low) + conf->cluster_sync_low = sb_reshape_pos; + } + + md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low, + conf->cluster_sync_high); + } + /* Now find the locations in the new layout */ __raid10_find_phys(&conf->geo, r10_bio); @@ -4719,6 +4782,19 @@ static void end_reshape(struct r10conf *conf) conf->fullsync = 0; } +static void raid10_update_reshape_pos(struct mddev *mddev) +{ + struct r10conf *conf = mddev->private; + sector_t lo, hi; + + md_cluster_ops->resync_info_get(mddev, &lo, &hi); + if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo)) + || mddev->reshape_position == MaxSector) + conf->reshape_progress = mddev->reshape_position; + else + WARN_ON_ONCE(1); +} + static int handle_reshape_read_error(struct mddev *mddev, struct r10bio *r10_bio) { @@ -4887,6 +4963,7 @@ static struct md_personality raid10_personality = .check_reshape = raid10_check_reshape, .start_reshape = raid10_start_reshape, .finish_reshape = raid10_finish_reshape, + .update_reshape_pos = raid10_update_reshape_pos, .congested = raid10_congested, }; diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index e6e925add700..ec3a5ef7fee0 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -3151,8 +3151,6 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); return 0; - rcu_assign_pointer(conf->log, NULL); - md_unregister_thread(&log->reclaim_thread); reclaim_thread: mempool_exit(&log->meta_pool); out_mempool: diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e4e98f47865d..4990f0319f6c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2681,6 +2681,18 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) pr_debug("raid456: error called\n"); spin_lock_irqsave(&conf->device_lock, flags); + + if (test_bit(In_sync, &rdev->flags) && + mddev->degraded == conf->max_degraded) { + /* + * Don't allow to achieve failed state + * Don't try to recover this device + */ + conf->recovery_disabled = mddev->recovery_disabled; + spin_unlock_irqrestore(&conf->device_lock, flags); + return; + } + set_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); mddev->degraded = raid5_calc_degraded(conf); |