Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig                  11
-rw-r--r--  drivers/md/dm-cache-policy-smq.c     2
-rw-r--r--  drivers/md/dm-core.h                10
-rw-r--r--  drivers/md/dm-crypt.c               15
-rw-r--r--  drivers/md/dm-integrity.c           23
-rw-r--r--  drivers/md/dm-ioctl.c               18
-rw-r--r--  drivers/md/dm-mpath.c               26
-rw-r--r--  drivers/md/dm-raid.c                 2
-rw-r--r--  drivers/md/dm-rq.c                 316
-rw-r--r--  drivers/md/dm-rq.h                   4
-rw-r--r--  drivers/md/dm-sysfs.c                3
-rw-r--r--  drivers/md/dm-table.c               46
-rw-r--r--  drivers/md/dm-thin.c                 8
-rw-r--r--  drivers/md/dm-verity-fec.c           5
-rw-r--r--  drivers/md/dm-writecache.c           5
-rw-r--r--  drivers/md/dm-zoned-metadata.c      80
-rw-r--r--  drivers/md/dm-zoned-target.c        20
-rw-r--r--  drivers/md/dm.c                     25
-rw-r--r--  drivers/md/dm.h                      1
-rw-r--r--  drivers/md/md-bitmap.c               9
-rw-r--r--  drivers/md/md-cluster.c            234
-rw-r--r--  drivers/md/md-cluster.h              2
-rw-r--r--  drivers/md/md.c                    113
-rw-r--r--  drivers/md/md.h                      1
-rw-r--r--  drivers/md/raid1.c                   1
-rw-r--r--  drivers/md/raid10.c                109
-rw-r--r--  drivers/md/raid5-cache.c             2
-rw-r--r--  drivers/md/raid5.c                  12
28 files changed, 508 insertions, 595 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 8b8c123cae66..3db222509e44 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -215,17 +215,6 @@ config BLK_DEV_DM
If unsure, say N.
-config DM_MQ_DEFAULT
- bool "request-based DM: use blk-mq I/O path by default"
- depends on BLK_DEV_DM
- ---help---
- This option enables the blk-mq based I/O path for request-based
- DM devices by default. With the option the dm_mod.use_blk_mq
- module/boot option defaults to Y, without it to N, but it can
- still be overriden either way.
-
- If unsure say N.
-
config DM_DEBUG
bool "Device mapper debugging support"
depends on BLK_DEV_DM
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index 1b5b9ad9e492..b61aac00ff40 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1200,7 +1200,7 @@ static void queue_demotion(struct smq_policy *mq)
struct policy_work work;
struct entry *e;
- if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed)))
+ if (WARN_ON_ONCE(!mq->migrations_allowed))
return;
e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
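
The hunk above drops the unlikely() wrapper because WARN_ON_ONCE() already evaluates its condition under unlikely() and returns the result, so the extra annotation is redundant (the same cleanup appears in dm.c further down). A minimal sketch of the resulting pattern, using a hypothetical policy struct rather than the real smq types:

#include <linux/bug.h>

struct my_policy {                      /* hypothetical stand-in for struct smq_policy */
        bool migrations_allowed;
};

static void queue_demotion_sketch(struct my_policy *p)
{
        /* WARN_ON_ONCE() already contains the unlikely() hint */
        if (WARN_ON_ONCE(!p->migrations_allowed))
                return;

        /* ... queue the demotion work ... */
}
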
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 7d480c930eaf..224d44503a06 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -112,18 +112,8 @@ struct mapped_device {
struct dm_stats stats;
- struct kthread_worker kworker;
- struct task_struct *kworker_task;
-
- /* for request-based merge heuristic in dm_request_fn() */
- unsigned seq_rq_merge_deadline_usecs;
- int last_rq_rw;
- sector_t last_rq_pos;
- ktime_t last_rq_start_time;
-
/* for blk-mq request-based DM support */
struct blk_mq_tag_set *tag_set;
- bool use_blk_mq:1;
bool init_tio_pdu:1;
struct srcu_struct io_barrier;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 0481223b1deb..b8eec515a003 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -2661,6 +2661,7 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar
static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct crypt_config *cc;
+ const char *devname = dm_table_device_name(ti->table);
int key_size;
unsigned int align_mask;
unsigned long long tmpll;
@@ -2806,18 +2807,22 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
ret = -ENOMEM;
- cc->io_queue = alloc_workqueue("kcryptd_io", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
+ cc->io_queue = alloc_workqueue("kcryptd_io/%s",
+ WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM,
+ 1, devname);
if (!cc->io_queue) {
ti->error = "Couldn't create kcryptd io queue";
goto bad;
}
if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
- cc->crypt_queue = alloc_workqueue("kcryptd", WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1);
+ cc->crypt_queue = alloc_workqueue("kcryptd/%s",
+ WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM,
+ 1, devname);
else
- cc->crypt_queue = alloc_workqueue("kcryptd",
+ cc->crypt_queue = alloc_workqueue("kcryptd/%s",
WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
- num_online_cpus());
+ num_online_cpus(), devname);
if (!cc->crypt_queue) {
ti->error = "Couldn't create kcryptd queue";
goto bad;
@@ -2826,7 +2831,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
spin_lock_init(&cc->write_thread_lock);
cc->write_tree = RB_ROOT;
- cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write");
+ cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write/%s", devname);
if (IS_ERR(cc->write_thread)) {
ret = PTR_ERR(cc->write_thread);
cc->write_thread = NULL;
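
The dm-crypt hunks rely on alloc_workqueue() and kthread_create() both accepting a printf-style name format, which lets the per-device name be baked into the workqueue and thread names. A minimal, hedged sketch of that pattern outside the patch; the helper and its caller are hypothetical, and devname would come from dm_table_device_name() in a real target constructor:

#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>
#include <linux/errno.h>

static int name_per_device_helpers(const char *devname,
                                   int (*write_fn)(void *), void *ctx)
{
        struct workqueue_struct *wq;
        struct task_struct *task;

        /* alloc_workqueue() formats its name from the arguments after max_active */
        wq = alloc_workqueue("kcryptd_io/%s",
                             WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM,
                             1, devname);
        if (!wq)
                return -ENOMEM;

        /* kthread_create() formats the thread name the same way */
        task = kthread_create(write_fn, ctx, "dmcrypt_write/%s", devname);
        if (IS_ERR(task)) {
                destroy_workqueue(wq);
                return PTR_ERR(task);
        }

        wake_up_process(task);
        /* a real caller would stash wq and task in its per-device context */
        return 0;
}
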
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index e1fa6baf4e8e..bb3096bf2cc6 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -559,7 +559,12 @@ static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result
}
memset(result + size, 0, JOURNAL_MAC_SIZE - size);
} else {
- __u8 digest[size];
+ __u8 digest[HASH_MAX_DIGESTSIZE];
+
+ if (WARN_ON(size > sizeof(digest))) {
+ dm_integrity_io_error(ic, "digest_size", -EINVAL);
+ goto err;
+ }
r = crypto_shash_final(desc, digest);
if (unlikely(r)) {
dm_integrity_io_error(ic, "crypto_shash_final", r);
@@ -1324,7 +1329,7 @@ static void integrity_metadata(struct work_struct *w)
struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
char *checksums;
unsigned extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
- char checksums_onstack[ic->tag_size + extra_space];
+ char checksums_onstack[HASH_MAX_DIGESTSIZE];
unsigned sectors_to_process = dio->range.n_sectors;
sector_t sector = dio->range.logical_sector;
@@ -1333,8 +1338,14 @@ static void integrity_metadata(struct work_struct *w)
checksums = kmalloc((PAGE_SIZE >> SECTOR_SHIFT >> ic->sb->log2_sectors_per_block) * ic->tag_size + extra_space,
GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN);
- if (!checksums)
+ if (!checksums) {
checksums = checksums_onstack;
+ if (WARN_ON(extra_space &&
+ digest_size > sizeof(checksums_onstack))) {
+ r = -EINVAL;
+ goto error;
+ }
+ }
__bio_for_each_segment(bv, bio, iter, dio->orig_bi_iter) {
unsigned pos;
@@ -1546,7 +1557,7 @@ retry_kmap:
} while (++s < ic->sectors_per_block);
#ifdef INTERNAL_VERIFY
if (ic->internal_hash) {
- char checksums_onstack[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)];
+ char checksums_onstack[max(HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
@@ -1596,7 +1607,7 @@ retry_kmap:
if (ic->internal_hash) {
unsigned digest_size = crypto_shash_digestsize(ic->internal_hash);
if (unlikely(digest_size > ic->tag_size)) {
- char checksums_onstack[digest_size];
+ char checksums_onstack[HASH_MAX_DIGESTSIZE];
integrity_sector_checksum(ic, logical_sector, (char *)js, checksums_onstack);
memcpy(journal_entry_tag(ic, je), checksums_onstack, ic->tag_size);
} else
@@ -2023,7 +2034,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
unlikely(from_replay) &&
#endif
ic->internal_hash) {
- char test_tag[max(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)];
+ char test_tag[max_t(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block),
(char *)access_journal_data(ic, i, l), test_tag);
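
These dm-integrity hunks replace variable-length arrays with fixed buffers sized by HASH_MAX_DIGESTSIZE (from <crypto/hash.h>) plus a run-time guard. A minimal sketch of that buffer-sizing pattern, with a hypothetical helper and the error handling reduced to the WARN_ON check itself:

#include <crypto/hash.h>
#include <linux/bug.h>
#include <linux/errno.h>

static int digest_fits_on_stack(struct crypto_shash *tfm)
{
        u8 digest[HASH_MAX_DIGESTSIZE];         /* worst-case size, no VLA */
        unsigned int size = crypto_shash_digestsize(tfm);

        /* refuse loudly rather than overflow if a larger hash ever appears */
        if (WARN_ON(size > sizeof(digest)))
                return -EINVAL;

        /* ... crypto_shash_final(desc, digest) is now safe for any supported hash ... */
        return 0;
}
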
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index b810ea77e6b1..f666778ad237 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1720,8 +1720,7 @@ static void free_params(struct dm_ioctl *param, size_t param_size, int param_fla
}
static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kernel,
- int ioctl_flags,
- struct dm_ioctl **param, int *param_flags)
+ int ioctl_flags, struct dm_ioctl **param, int *param_flags)
{
struct dm_ioctl *dmi;
int secure_data;
@@ -1762,18 +1761,13 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern
*param_flags |= DM_PARAMS_MALLOC;
- if (copy_from_user(dmi, user, param_kernel->data_size))
- goto bad;
+ /* Copy from param_kernel (which was already copied from user) */
+ memcpy(dmi, param_kernel, minimum_data_size);
-data_copied:
- /*
- * Abort if something changed the ioctl data while it was being copied.
- */
- if (dmi->data_size != param_kernel->data_size) {
- DMERR("rejecting ioctl: data size modified while processing parameters");
+ if (copy_from_user(&dmi->data, (char __user *)user + minimum_data_size,
+ param_kernel->data_size - minimum_data_size))
goto bad;
- }
-
+data_copied:
/* Wipe the user buffer so we do not return it to userspace */
if (secure_data && clear_user(user, param_kernel->data_size))
goto bad;
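
The rewritten copy_params() above avoids re-reading the ioctl header from user space: the already-validated kernel copy of the header is reused and only the payload behind it is fetched, which also removes the old "data size modified while processing" re-check. A rough sketch of that two-stage copy, using a hypothetical header struct rather than struct dm_ioctl and assuming data_size was validated beforehand:

#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/err.h>

struct hdr {                            /* hypothetical stand-in for struct dm_ioctl */
        u32 data_size;                  /* total size, header included */
        char data[];
};

static struct hdr *copy_params_sketch(struct hdr __user *user,
                                      const struct hdr *kernel_hdr)
{
        size_t hdr_size = sizeof(*kernel_hdr);
        struct hdr *p = kvmalloc(kernel_hdr->data_size, GFP_KERNEL);

        if (!p)
                return ERR_PTR(-ENOMEM);

        /* reuse the header that was already copied in and validated ... */
        memcpy(p, kernel_hdr, hdr_size);

        /* ... and copy only the payload that follows it from user space */
        if (copy_from_user(p->data, (char __user *)user + hdr_size,
                           kernel_hdr->data_size - hdr_size)) {
                kvfree(p);
                return ERR_PTR(-EFAULT);
        }

        return p;
}
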
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 419362c2d8ac..d6a66921daf4 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -203,14 +203,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m)
{
if (m->queue_mode == DM_TYPE_NONE) {
- /*
- * Default to request-based.
- */
- if (dm_use_blk_mq(dm_table_get_md(ti->table)))
- m->queue_mode = DM_TYPE_MQ_REQUEST_BASED;
- else
- m->queue_mode = DM_TYPE_REQUEST_BASED;
-
+ m->queue_mode = DM_TYPE_REQUEST_BASED;
} else if (m->queue_mode == DM_TYPE_BIO_BASED) {
INIT_WORK(&m->process_queued_bios, process_queued_bios);
/*
@@ -537,10 +530,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
* get the queue busy feedback (via BLK_STS_RESOURCE),
* otherwise I/O merging can suffer.
*/
- if (q->mq_ops)
- return DM_MAPIO_REQUEUE;
- else
- return DM_MAPIO_DELAY_REQUEUE;
+ return DM_MAPIO_REQUEUE;
}
clone->bio = clone->biotail = NULL;
clone->rq_disk = bdev->bd_disk;
@@ -668,7 +658,7 @@ static int multipath_map_bio(struct dm_target *ti, struct bio *bio)
static void process_queued_io_list(struct multipath *m)
{
- if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED)
+ if (m->queue_mode == DM_TYPE_REQUEST_BASED)
dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table));
else if (m->queue_mode == DM_TYPE_BIO_BASED)
queue_work(kmultipathd, &m->process_queued_bios);
@@ -1089,10 +1079,9 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m)
if (!strcasecmp(queue_mode_name, "bio"))
m->queue_mode = DM_TYPE_BIO_BASED;
- else if (!strcasecmp(queue_mode_name, "rq"))
+ else if (!strcasecmp(queue_mode_name, "rq") ||
+ !strcasecmp(queue_mode_name, "mq"))
m->queue_mode = DM_TYPE_REQUEST_BASED;
- else if (!strcasecmp(queue_mode_name, "mq"))
- m->queue_mode = DM_TYPE_MQ_REQUEST_BASED;
else {
ti->error = "Unknown 'queue_mode' requested";
r = -EINVAL;
@@ -1726,9 +1715,6 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
case DM_TYPE_BIO_BASED:
DMEMIT("queue_mode bio ");
break;
- case DM_TYPE_MQ_REQUEST_BASED:
- DMEMIT("queue_mode mq ");
- break;
default:
WARN_ON_ONCE(true);
break;
@@ -1972,7 +1958,7 @@ static int multipath_busy(struct dm_target *ti)
/* no paths available, for blk-mq: rely on IO mapping to delay requeue */
if (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
- return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED);
+ return (m->queue_mode != DM_TYPE_REQUEST_BASED);
/* Guess which priority_group will be used at next mapping time */
pg = READ_ONCE(m->current_pg);
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index c44925e4e481..e1dd1622a290 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2475,7 +2475,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
}
/* Enable bitmap creation for RAID levels != 0 */
- mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096);
+ mddev->bitmap_info.offset = (rt_is_raid0(rs->raid_type) || rs->journal_dev.dev) ? 0 : to_sector(4096);
mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
if (!test_and_clear_bit(FirstUse, &rdev->flags)) {
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 6e547b8dd298..7cd36e4d1310 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -23,19 +23,6 @@ static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH;
#define RESERVED_REQUEST_BASED_IOS 256
static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
-static bool use_blk_mq = IS_ENABLED(CONFIG_DM_MQ_DEFAULT);
-
-bool dm_use_blk_mq_default(void)
-{
- return use_blk_mq;
-}
-
-bool dm_use_blk_mq(struct mapped_device *md)
-{
- return md->use_blk_mq;
-}
-EXPORT_SYMBOL_GPL(dm_use_blk_mq);
-
unsigned dm_get_reserved_rq_based_ios(void)
{
return __dm_get_module_param(&reserved_rq_based_ios,
@@ -59,41 +46,13 @@ int dm_request_based(struct mapped_device *md)
return queue_is_rq_based(md->queue);
}
-static void dm_old_start_queue(struct request_queue *q)
-{
- unsigned long flags;
-
- spin_lock_irqsave(q->queue_lock, flags);
- if (blk_queue_stopped(q))
- blk_start_queue(q);
- spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static void dm_mq_start_queue(struct request_queue *q)
+void dm_start_queue(struct request_queue *q)
{
blk_mq_unquiesce_queue(q);
blk_mq_kick_requeue_list(q);
}
-void dm_start_queue(struct request_queue *q)
-{
- if (!q->mq_ops)
- dm_old_start_queue(q);
- else
- dm_mq_start_queue(q);
-}
-
-static void dm_old_stop_queue(struct request_queue *q)
-{
- unsigned long flags;
-
- spin_lock_irqsave(q->queue_lock, flags);
- if (!blk_queue_stopped(q))
- blk_stop_queue(q);
- spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static void dm_mq_stop_queue(struct request_queue *q)
+void dm_stop_queue(struct request_queue *q)
{
if (blk_mq_queue_stopped(q))
return;
@@ -101,14 +60,6 @@ static void dm_mq_stop_queue(struct request_queue *q)
blk_mq_quiesce_queue(q);
}
-void dm_stop_queue(struct request_queue *q)
-{
- if (!q->mq_ops)
- dm_old_stop_queue(q);
- else
- dm_mq_stop_queue(q);
-}
-
/*
* Partial completion handling for request-based dm
*/
@@ -179,9 +130,6 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig)
*/
static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
{
- struct request_queue *q = md->queue;
- unsigned long flags;
-
atomic_dec(&md->pending[rw]);
/* nudge anyone waiting on suspend queue */
@@ -189,18 +137,6 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
wake_up(&md->wait);
/*
- * Run this off this callpath, as drivers could invoke end_io while
- * inside their request_fn (and holding the queue lock). Calling
- * back into ->request_fn() could deadlock attempting to grab the
- * queue lock again.
- */
- if (!q->mq_ops && run_queue) {
- spin_lock_irqsave(q->queue_lock, flags);
- blk_run_queue_async(q);
- spin_unlock_irqrestore(q->queue_lock, flags);
- }
-
- /*
* dm_put() must be at the end of this function. See the comment above
*/
dm_put(md);
@@ -222,27 +158,10 @@ static void dm_end_request(struct request *clone, blk_status_t error)
tio->ti->type->release_clone_rq(clone);
rq_end_stats(md, rq);
- if (!rq->q->mq_ops)
- blk_end_request_all(rq, error);
- else
- blk_mq_end_request(rq, error);
+ blk_mq_end_request(rq, error);
rq_completed(md, rw, true);
}
-/*
- * Requeue the original request of a clone.
- */
-static void dm_old_requeue_request(struct request *rq, unsigned long delay_ms)
-{
- struct request_queue *q = rq->q;
- unsigned long flags;
-
- spin_lock_irqsave(q->queue_lock, flags);
- blk_requeue_request(q, rq);
- blk_delay_queue(q, delay_ms);
- spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
static void __dm_mq_kick_requeue_list(struct request_queue *q, unsigned long msecs)
{
blk_mq_delay_kick_requeue_list(q, msecs);
@@ -273,11 +192,7 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_
tio->ti->type->release_clone_rq(tio->clone);
}
- if (!rq->q->mq_ops)
- dm_old_requeue_request(rq, delay_ms);
- else
- dm_mq_delay_requeue_request(rq, delay_ms);
-
+ dm_mq_delay_requeue_request(rq, delay_ms);
rq_completed(md, rw, false);
}
@@ -340,10 +255,7 @@ static void dm_softirq_done(struct request *rq)
rq_end_stats(md, rq);
rw = rq_data_dir(rq);
- if (!rq->q->mq_ops)
- blk_end_request_all(rq, tio->error);
- else
- blk_mq_end_request(rq, tio->error);
+ blk_mq_end_request(rq, tio->error);
rq_completed(md, rw, false);
return;
}
@@ -363,17 +275,14 @@ static void dm_complete_request(struct request *rq, blk_status_t error)
struct dm_rq_target_io *tio = tio_from_request(rq);
tio->error = error;
- if (!rq->q->mq_ops)
- blk_complete_request(rq);
- else
- blk_mq_complete_request(rq);
+ blk_mq_complete_request(rq);
}
/*
* Complete the not-mapped clone and the original request with the error status
* through softirq context.
* Target's rq_end_io() function isn't called.
- * This may be used when the target's map_rq() or clone_and_map_rq() functions fail.
+ * This may be used when the target's clone_and_map_rq() function fails.
*/
static void dm_kill_unmapped_request(struct request *rq, blk_status_t error)
{
@@ -381,21 +290,10 @@ static void dm_kill_unmapped_request(struct request *rq, blk_status_t error)
dm_complete_request(rq, error);
}
-/*
- * Called with the clone's queue lock held (in the case of .request_fn)
- */
static void end_clone_request(struct request *clone, blk_status_t error)
{
struct dm_rq_target_io *tio = clone->end_io_data;
- /*
- * Actual request completion is done in a softirq context which doesn't
- * hold the clone's queue lock. Otherwise, deadlock could occur because:
- * - another request may be submitted by the upper level driver
- * of the stacking during the completion
- * - the submission which requires queue lock may be done
- * against this clone's queue
- */
dm_complete_request(tio->orig, error);
}
@@ -446,8 +344,6 @@ static int setup_clone(struct request *clone, struct request *rq,
return 0;
}
-static void map_tio_request(struct kthread_work *work);
-
static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
struct mapped_device *md)
{
@@ -464,8 +360,6 @@ static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
*/
if (!md->init_tio_pdu)
memset(&tio->info, 0, sizeof(tio->info));
- if (md->kworker_task)
- kthread_init_work(&tio->work, map_tio_request);
}
/*
@@ -504,10 +398,7 @@ check_again:
blk_rq_unprep_clone(clone);
tio->ti->type->release_clone_rq(clone);
tio->clone = NULL;
- if (!rq->q->mq_ops)
- r = DM_MAPIO_DELAY_REQUEUE;
- else
- r = DM_MAPIO_REQUEUE;
+ r = DM_MAPIO_REQUEUE;
goto check_again;
}
break;
@@ -530,20 +421,23 @@ check_again:
return r;
}
+/* DEPRECATED: previously used for request-based merge heuristic in dm_request_fn() */
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
+{
+ return sprintf(buf, "%u\n", 0);
+}
+
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
+ const char *buf, size_t count)
+{
+ return count;
+}
+
static void dm_start_request(struct mapped_device *md, struct request *orig)
{
- if (!orig->q->mq_ops)
- blk_start_request(orig);
- else
- blk_mq_start_request(orig);
+ blk_mq_start_request(orig);
atomic_inc(&md->pending[rq_data_dir(orig)]);
- if (md->seq_rq_merge_deadline_usecs) {
- md->last_rq_pos = rq_end_sector(orig);
- md->last_rq_rw = rq_data_dir(orig);
- md->last_rq_start_time = ktime_get();
- }
-
if (unlikely(dm_stats_used(&md->stats))) {
struct dm_rq_target_io *tio = tio_from_request(orig);
tio->duration_jiffies = jiffies;
@@ -563,8 +457,10 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
dm_get(md);
}
-static int __dm_rq_init_rq(struct mapped_device *md, struct request *rq)
+static int dm_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
+ unsigned int hctx_idx, unsigned int numa_node)
{
+ struct mapped_device *md = set->driver_data;
struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
/*
@@ -581,163 +477,6 @@ static int __dm_rq_init_rq(struct mapped_device *md, struct request *rq)
return 0;
}
-static int dm_rq_init_rq(struct request_queue *q, struct request *rq, gfp_t gfp)
-{
- return __dm_rq_init_rq(q->rq_alloc_data, rq);
-}
-
-static void map_tio_request(struct kthread_work *work)
-{
- struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
-
- if (map_request(tio) == DM_MAPIO_REQUEUE)
- dm_requeue_original_request(tio, false);
-}
-
-ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
-{
- return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs);
-}
-
-#define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000
-
-ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
- const char *buf, size_t count)
-{
- unsigned deadline;
-
- if (dm_get_md_type(md) != DM_TYPE_REQUEST_BASED)
- return count;
-
- if (kstrtouint(buf, 10, &deadline))
- return -EINVAL;
-
- if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS)
- deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS;
-
- md->seq_rq_merge_deadline_usecs = deadline;
-
- return count;
-}
-
-static bool dm_old_request_peeked_before_merge_deadline(struct mapped_device *md)
-{
- ktime_t kt_deadline;
-
- if (!md->seq_rq_merge_deadline_usecs)
- return false;
-
- kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC);
- kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline);
-
- return !ktime_after(ktime_get(), kt_deadline);
-}
-
-/*
- * q->request_fn for old request-based dm.
- * Called with the queue lock held.
- */
-static void dm_old_request_fn(struct request_queue *q)
-{
- struct mapped_device *md = q->queuedata;
- struct dm_target *ti = md->immutable_target;
- struct request *rq;
- struct dm_rq_target_io *tio;
- sector_t pos = 0;
-
- if (unlikely(!ti)) {
- int srcu_idx;
- struct dm_table *map = dm_get_live_table(md, &srcu_idx);
-
- if (unlikely(!map)) {
- dm_put_live_table(md, srcu_idx);
- return;
- }
- ti = dm_table_find_target(map, pos);
- dm_put_live_table(md, srcu_idx);
- }
-
- /*
- * For suspend, check blk_queue_stopped() and increment
- * ->pending within a single queue_lock not to increment the
- * number of in-flight I/Os after the queue is stopped in
- * dm_suspend().
- */
- while (!blk_queue_stopped(q)) {
- rq = blk_peek_request(q);
- if (!rq)
- return;
-
- /* always use block 0 to find the target for flushes for now */
- pos = 0;
- if (req_op(rq) != REQ_OP_FLUSH)
- pos = blk_rq_pos(rq);
-
- if ((dm_old_request_peeked_before_merge_deadline(md) &&
- md_in_flight(md) && rq->bio && !bio_multiple_segments(rq->bio) &&
- md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) ||
- (ti->type->busy && ti->type->busy(ti))) {
- blk_delay_queue(q, 10);
- return;
- }
-
- dm_start_request(md, rq);
-
- tio = tio_from_request(rq);
- init_tio(tio, rq, md);
- /* Establish tio->ti before queuing work (map_tio_request) */
- tio->ti = ti;
- kthread_queue_work(&md->kworker, &tio->work);
- BUG_ON(!irqs_disabled());
- }
-}
-
-/*
- * Fully initialize a .request_fn request-based queue.
- */
-int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t)
-{
- struct dm_target *immutable_tgt;
-
- /* Fully initialize the queue */
- md->queue->cmd_size = sizeof(struct dm_rq_target_io);
- md->queue->rq_alloc_data = md;
- md->queue->request_fn = dm_old_request_fn;
- md->queue->init_rq_fn = dm_rq_init_rq;
-
- immutable_tgt = dm_table_get_immutable_target(t);
- if (immutable_tgt && immutable_tgt->per_io_data_size) {
- /* any target-specific per-io data is immediately after the tio */
- md->queue->cmd_size += immutable_tgt->per_io_data_size;
- md->init_tio_pdu = true;
- }
- if (blk_init_allocated_queue(md->queue) < 0)
- return -EINVAL;
-
- /* disable dm_old_request_fn's merge heuristic by default */
- md->seq_rq_merge_deadline_usecs = 0;
-
- blk_queue_softirq_done(md->queue, dm_softirq_done);
-
- /* Initialize the request-based DM worker thread */
- kthread_init_worker(&md->kworker);
- md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
- "kdmwork-%s", dm_device_name(md));
- if (IS_ERR(md->kworker_task)) {
- int error = PTR_ERR(md->kworker_task);
- md->kworker_task = NULL;
- return error;
- }
-
- return 0;
-}
-
-static int dm_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
- unsigned int hctx_idx, unsigned int numa_node)
-{
- return __dm_rq_init_rq(set->driver_data, rq);
-}
-
static blk_status_t dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
@@ -790,11 +529,6 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
struct dm_target *immutable_tgt;
int err;
- if (!dm_table_all_blk_mq_devices(t)) {
- DMERR("request-based dm-mq may only be stacked on blk-mq device(s)");
- return -EINVAL;
- }
-
md->tag_set = kzalloc_node(sizeof(struct blk_mq_tag_set), GFP_KERNEL, md->numa_node_id);
if (!md->tag_set)
return -ENOMEM;
@@ -845,6 +579,8 @@ void dm_mq_cleanup_mapped_device(struct mapped_device *md)
module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
+/* Unused, but preserved for userspace compatibility */
+static bool use_blk_mq = true;
module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
diff --git a/drivers/md/dm-rq.h b/drivers/md/dm-rq.h
index f43c45460aac..b39245545229 100644
--- a/drivers/md/dm-rq.h
+++ b/drivers/md/dm-rq.h
@@ -46,10 +46,6 @@ struct dm_rq_clone_bio_info {
struct bio clone;
};
-bool dm_use_blk_mq_default(void);
-bool dm_use_blk_mq(struct mapped_device *md);
-
-int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t);
int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t);
void dm_mq_cleanup_mapped_device(struct mapped_device *md);
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index c209b8a19b84..a05fcd50e1b9 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -92,7 +92,8 @@ static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
static ssize_t dm_attr_use_blk_mq_show(struct mapped_device *md, char *buf)
{
- sprintf(buf, "%d\n", dm_use_blk_mq(md));
+ /* Purely for userspace compatibility */
+ sprintf(buf, "%d\n", true);
return strlen(buf);
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index fb4bea20657b..9038c302d5c2 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -47,7 +47,6 @@ struct dm_table {
bool integrity_supported:1;
bool singleton:1;
- bool all_blk_mq:1;
unsigned integrity_added:1;
/*
@@ -872,8 +871,7 @@ static bool __table_type_bio_based(enum dm_queue_mode table_type)
static bool __table_type_request_based(enum dm_queue_mode table_type)
{
- return (table_type == DM_TYPE_REQUEST_BASED ||
- table_type == DM_TYPE_MQ_REQUEST_BASED);
+ return table_type == DM_TYPE_REQUEST_BASED;
}
void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type)
@@ -999,10 +997,6 @@ verify_bio_based:
BUG_ON(!request_based); /* No targets in this table */
- /*
- * The only way to establish DM_TYPE_MQ_REQUEST_BASED is by
- * having a compatible target use dm_table_set_type.
- */
t->type = DM_TYPE_REQUEST_BASED;
verify_rq_based:
@@ -1022,11 +1016,9 @@ verify_rq_based:
int srcu_idx;
struct dm_table *live_table = dm_get_live_table(t->md, &srcu_idx);
- /* inherit live table's type and all_blk_mq */
- if (live_table) {
+ /* inherit live table's type */
+ if (live_table)
t->type = live_table->type;
- t->all_blk_mq = live_table->all_blk_mq;
- }
dm_put_live_table(t->md, srcu_idx);
return 0;
}
@@ -1046,17 +1038,10 @@ verify_rq_based:
DMERR("table load rejected: including non-request-stackable devices");
return -EINVAL;
}
- if (v.sq_count && v.mq_count) {
+ if (v.sq_count > 0) {
DMERR("table load rejected: not all devices are blk-mq request-stackable");
return -EINVAL;
}
- t->all_blk_mq = v.mq_count > 0;
-
- if (!t->all_blk_mq &&
- (t->type == DM_TYPE_MQ_REQUEST_BASED || t->type == DM_TYPE_NVME_BIO_BASED)) {
- DMERR("table load rejected: all devices are not blk-mq request-stackable");
- return -EINVAL;
- }
return 0;
}
@@ -1105,11 +1090,6 @@ bool dm_table_request_based(struct dm_table *t)
return __table_type_request_based(dm_table_get_type(t));
}
-bool dm_table_all_blk_mq_devices(struct dm_table *t)
-{
- return t->all_blk_mq;
-}
-
static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
{
enum dm_queue_mode type = dm_table_get_type(t);
@@ -2089,26 +2069,24 @@ struct mapped_device *dm_table_get_md(struct dm_table *t)
}
EXPORT_SYMBOL(dm_table_get_md);
+const char *dm_table_device_name(struct dm_table *t)
+{
+ return dm_device_name(t->md);
+}
+EXPORT_SYMBOL_GPL(dm_table_device_name);
+
void dm_table_run_md_queue_async(struct dm_table *t)
{
struct mapped_device *md;
struct request_queue *queue;
- unsigned long flags;
if (!dm_table_request_based(t))
return;
md = dm_table_get_md(t);
queue = dm_get_md_queue(md);
- if (queue) {
- if (queue->mq_ops)
- blk_mq_run_hw_queues(queue, true);
- else {
- spin_lock_irqsave(queue->queue_lock, flags);
- blk_run_queue_async(queue);
- spin_unlock_irqrestore(queue->queue_lock, flags);
- }
- }
+ if (queue)
+ blk_mq_run_hw_queues(queue, true);
}
EXPORT_SYMBOL(dm_table_run_md_queue_async);
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index aaf1ad481ee8..0bd8d498b3b9 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -325,7 +325,7 @@ struct thin_c {
* Ensures the thin is not destroyed until the worker has finished
* iterating the active_thins list.
*/
- atomic_t refcount;
+ refcount_t refcount;
struct completion can_destroy;
};
@@ -4044,12 +4044,12 @@ static struct target_type pool_target = {
*--------------------------------------------------------------*/
static void thin_get(struct thin_c *tc)
{
- atomic_inc(&tc->refcount);
+ refcount_inc(&tc->refcount);
}
static void thin_put(struct thin_c *tc)
{
- if (atomic_dec_and_test(&tc->refcount))
+ if (refcount_dec_and_test(&tc->refcount))
complete(&tc->can_destroy);
}
@@ -4193,7 +4193,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
r = -EINVAL;
goto bad;
}
- atomic_set(&tc->refcount, 1);
+ refcount_set(&tc->refcount, 1);
init_completion(&tc->can_destroy);
list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
spin_unlock_irqrestore(&tc->pool->lock, flags);
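
The dm-thin hunks are a mechanical atomic_t-to-refcount_t conversion for a counter that is purely an object reference count; refcount_t saturates and warns on overflow or underflow instead of silently wrapping. A minimal sketch of the same get/put pattern on a hypothetical object:

#include <linux/refcount.h>
#include <linux/completion.h>
#include <linux/slab.h>

struct thing {                          /* hypothetical object with a lifetime refcount */
        refcount_t refcount;
        struct completion can_destroy;
};

static struct thing *thing_alloc(void)
{
        struct thing *t = kzalloc(sizeof(*t), GFP_KERNEL);

        if (!t)
                return NULL;
        refcount_set(&t->refcount, 1);          /* the creator holds the first reference */
        init_completion(&t->can_destroy);
        return t;
}

static void thing_get(struct thing *t)
{
        refcount_inc(&t->refcount);             /* warns rather than overflowing */
}

static void thing_put(struct thing *t)
{
        if (refcount_dec_and_test(&t->refcount))
                complete(&t->can_destroy);      /* last reference gone, destroyer may proceed */
}
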
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 684af08d0747..0ce04e5b4afb 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -212,12 +212,15 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io,
struct dm_verity_fec_io *fio = fec_io(io);
u64 block, ileaved;
u8 *bbuf, *rs_block;
- u8 want_digest[v->digest_size];
+ u8 want_digest[HASH_MAX_DIGESTSIZE];
unsigned n, k;
if (neras)
*neras = 0;
+ if (WARN_ON(v->digest_size > sizeof(want_digest)))
+ return -EINVAL;
+
/*
* read each of the rsn data blocks that are part of the RS block, and
* interleave contents to available bufs
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index 5f1f80d424dd..2d50eec94cd7 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -350,10 +350,7 @@ static struct wc_memory_superblock *sb(struct dm_writecache *wc)
static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
{
- if (is_power_of_2(sizeof(struct wc_entry)) && 0)
- return &sb(wc)->entries[e - wc->entries];
- else
- return &sb(wc)->entries[e->index];
+ return &sb(wc)->entries[e->index];
}
static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index 969954915566..fa68336560c3 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -99,7 +99,7 @@ struct dmz_mblock {
struct rb_node node;
struct list_head link;
sector_t no;
- atomic_t ref;
+ unsigned int ref;
unsigned long state;
struct page *page;
void *data;
@@ -296,7 +296,7 @@ static struct dmz_mblock *dmz_alloc_mblock(struct dmz_metadata *zmd,
RB_CLEAR_NODE(&mblk->node);
INIT_LIST_HEAD(&mblk->link);
- atomic_set(&mblk->ref, 0);
+ mblk->ref = 0;
mblk->state = 0;
mblk->no = mblk_no;
mblk->data = page_address(mblk->page);
@@ -339,10 +339,11 @@ static void dmz_insert_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
}
/*
- * Lookup a metadata block in the rbtree.
+ * Lookup a metadata block in the rbtree. If the block is found, increment
+ * its reference count.
*/
-static struct dmz_mblock *dmz_lookup_mblock(struct dmz_metadata *zmd,
- sector_t mblk_no)
+static struct dmz_mblock *dmz_get_mblock_fast(struct dmz_metadata *zmd,
+ sector_t mblk_no)
{
struct rb_root *root = &zmd->mblk_rbtree;
struct rb_node *node = root->rb_node;
@@ -350,8 +351,17 @@ static struct dmz_mblock *dmz_lookup_mblock(struct dmz_metadata *zmd,
while (node) {
mblk = container_of(node, struct dmz_mblock, node);
- if (mblk->no == mblk_no)
+ if (mblk->no == mblk_no) {
+ /*
+ * If this is the first reference to the block,
+ * remove it from the LRU list.
+ */
+ mblk->ref++;
+ if (mblk->ref == 1 &&
+ !test_bit(DMZ_META_DIRTY, &mblk->state))
+ list_del_init(&mblk->link);
return mblk;
+ }
node = (mblk->no < mblk_no) ? node->rb_left : node->rb_right;
}
@@ -382,32 +392,47 @@ static void dmz_mblock_bio_end_io(struct bio *bio)
}
/*
- * Read a metadata block from disk.
+ * Read an uncached metadata block from disk and add it to the cache.
*/
-static struct dmz_mblock *dmz_fetch_mblock(struct dmz_metadata *zmd,
- sector_t mblk_no)
+static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
+ sector_t mblk_no)
{
- struct dmz_mblock *mblk;
+ struct dmz_mblock *mblk, *m;
sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
struct bio *bio;
- /* Get block and insert it */
+ /* Get a new block and a BIO to read it */
mblk = dmz_alloc_mblock(zmd, mblk_no);
if (!mblk)
return NULL;
- spin_lock(&zmd->mblk_lock);
- atomic_inc(&mblk->ref);
- set_bit(DMZ_META_READING, &mblk->state);
- dmz_insert_mblock(zmd, mblk);
- spin_unlock(&zmd->mblk_lock);
-
bio = bio_alloc(GFP_NOIO, 1);
if (!bio) {
dmz_free_mblock(zmd, mblk);
return NULL;
}
+ spin_lock(&zmd->mblk_lock);
+
+ /*
+ * Make sure that another context did not start reading
+ * the block already.
+ */
+ m = dmz_get_mblock_fast(zmd, mblk_no);
+ if (m) {
+ spin_unlock(&zmd->mblk_lock);
+ dmz_free_mblock(zmd, mblk);
+ bio_put(bio);
+ return m;
+ }
+
+ mblk->ref++;
+ set_bit(DMZ_META_READING, &mblk->state);
+ dmz_insert_mblock(zmd, mblk);
+
+ spin_unlock(&zmd->mblk_lock);
+
+ /* Submit read BIO */
bio->bi_iter.bi_sector = dmz_blk2sect(block);
bio_set_dev(bio, zmd->dev->bdev);
bio->bi_private = mblk;
@@ -484,7 +509,8 @@ static void dmz_release_mblock(struct dmz_metadata *zmd,
spin_lock(&zmd->mblk_lock);
- if (atomic_dec_and_test(&mblk->ref)) {
+ mblk->ref--;
+ if (mblk->ref == 0) {
if (test_bit(DMZ_META_ERROR, &mblk->state)) {
rb_erase(&mblk->node, &zmd->mblk_rbtree);
dmz_free_mblock(zmd, mblk);
@@ -508,18 +534,12 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
/* Check rbtree */
spin_lock(&zmd->mblk_lock);
- mblk = dmz_lookup_mblock(zmd, mblk_no);
- if (mblk) {
- /* Cache hit: remove block from LRU list */
- if (atomic_inc_return(&mblk->ref) == 1 &&
- !test_bit(DMZ_META_DIRTY, &mblk->state))
- list_del_init(&mblk->link);
- }
+ mblk = dmz_get_mblock_fast(zmd, mblk_no);
spin_unlock(&zmd->mblk_lock);
if (!mblk) {
/* Cache miss: read the block from disk */
- mblk = dmz_fetch_mblock(zmd, mblk_no);
+ mblk = dmz_get_mblock_slow(zmd, mblk_no);
if (!mblk)
return ERR_PTR(-ENOMEM);
}
@@ -753,7 +773,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd)
spin_lock(&zmd->mblk_lock);
clear_bit(DMZ_META_DIRTY, &mblk->state);
- if (atomic_read(&mblk->ref) == 0)
+ if (mblk->ref == 0)
list_add_tail(&mblk->link, &zmd->mblk_lru_list);
spin_unlock(&zmd->mblk_lock);
}
@@ -2308,7 +2328,7 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd)
mblk = list_first_entry(&zmd->mblk_dirty_list,
struct dmz_mblock, link);
dmz_dev_warn(zmd->dev, "mblock %llu still in dirty list (ref %u)",
- (u64)mblk->no, atomic_read(&mblk->ref));
+ (u64)mblk->no, mblk->ref);
list_del_init(&mblk->link);
rb_erase(&mblk->node, &zmd->mblk_rbtree);
dmz_free_mblock(zmd, mblk);
@@ -2326,8 +2346,8 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd)
root = &zmd->mblk_rbtree;
rbtree_postorder_for_each_entry_safe(mblk, next, root, node) {
dmz_dev_warn(zmd->dev, "mblock %llu ref %u still in rbtree",
- (u64)mblk->no, atomic_read(&mblk->ref));
- atomic_set(&mblk->ref, 0);
+ (u64)mblk->no, mblk->ref);
+ mblk->ref = 0;
dmz_free_mblock(zmd, mblk);
}
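
In contrast to the refcount_t conversions elsewhere in this series, dm-zoned-metadata goes the other way: mblk->ref becomes a plain unsigned int because every access already happens under zmd->mblk_lock, so the atomic only obscured the real locking rule. A small sketch of that pattern with hypothetical names:

#include <linux/spinlock.h>
#include <linux/list.h>

struct mblk_sketch {
        unsigned int ref;               /* protected by cache_sketch.lock */
        struct list_head link;
};

struct cache_sketch {
        spinlock_t lock;
        struct list_head lru_list;
};

static void mblk_release_sketch(struct cache_sketch *cache, struct mblk_sketch *m)
{
        spin_lock(&cache->lock);
        if (--m->ref == 0)                      /* plain arithmetic is safe under the lock */
                list_add_tail(&m->link, &cache->lru_list);
        spin_unlock(&cache->lock);
}
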
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 12d96a263623..981154e59461 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -19,7 +19,7 @@ struct dmz_bioctx {
struct dmz_target *target;
struct dm_zone *zone;
struct bio *bio;
- atomic_t ref;
+ refcount_t ref;
blk_status_t status;
};
@@ -28,7 +28,7 @@ struct dmz_bioctx {
*/
struct dm_chunk_work {
struct work_struct work;
- atomic_t refcount;
+ refcount_t refcount;
struct dmz_target *target;
unsigned int chunk;
struct bio_list bio_list;
@@ -115,7 +115,7 @@ static int dmz_submit_read_bio(struct dmz_target *dmz, struct dm_zone *zone,
if (nr_blocks == dmz_bio_blocks(bio)) {
/* Setup and submit the BIO */
bio->bi_iter.bi_sector = sector;
- atomic_inc(&bioctx->ref);
+ refcount_inc(&bioctx->ref);
generic_make_request(bio);
return 0;
}
@@ -134,7 +134,7 @@ static int dmz_submit_read_bio(struct dmz_target *dmz, struct dm_zone *zone,
bio_advance(bio, clone->bi_iter.bi_size);
/* Submit the clone */
- atomic_inc(&bioctx->ref);
+ refcount_inc(&bioctx->ref);
generic_make_request(clone);
return 0;
@@ -240,7 +240,7 @@ static void dmz_submit_write_bio(struct dmz_target *dmz, struct dm_zone *zone,
/* Setup and submit the BIO */
bio_set_dev(bio, dmz->dev->bdev);
bio->bi_iter.bi_sector = dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
- atomic_inc(&bioctx->ref);
+ refcount_inc(&bioctx->ref);
generic_make_request(bio);
if (dmz_is_seq(zone))
@@ -456,7 +456,7 @@ out:
*/
static inline void dmz_get_chunk_work(struct dm_chunk_work *cw)
{
- atomic_inc(&cw->refcount);
+ refcount_inc(&cw->refcount);
}
/*
@@ -465,7 +465,7 @@ static inline void dmz_get_chunk_work(struct dm_chunk_work *cw)
*/
static void dmz_put_chunk_work(struct dm_chunk_work *cw)
{
- if (atomic_dec_and_test(&cw->refcount)) {
+ if (refcount_dec_and_test(&cw->refcount)) {
WARN_ON(!bio_list_empty(&cw->bio_list));
radix_tree_delete(&cw->target->chunk_rxtree, cw->chunk);
kfree(cw);
@@ -546,7 +546,7 @@ static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
goto out;
INIT_WORK(&cw->work, dmz_chunk_work);
- atomic_set(&cw->refcount, 0);
+ refcount_set(&cw->refcount, 0);
cw->target = dmz;
cw->chunk = chunk;
bio_list_init(&cw->bio_list);
@@ -599,7 +599,7 @@ static int dmz_map(struct dm_target *ti, struct bio *bio)
bioctx->target = dmz;
bioctx->zone = NULL;
bioctx->bio = bio;
- atomic_set(&bioctx->ref, 1);
+ refcount_set(&bioctx->ref, 1);
bioctx->status = BLK_STS_OK;
/* Set the BIO pending in the flush list */
@@ -633,7 +633,7 @@ static int dmz_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error
if (bioctx->status == BLK_STS_OK && *error)
bioctx->status = *error;
- if (!atomic_dec_and_test(&bioctx->ref))
+ if (!refcount_dec_and_test(&bioctx->ref))
return DM_ENDIO_INCOMPLETE;
/* Done */
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 6be21dc210a1..c510179a7f84 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1664,7 +1664,7 @@ static blk_qc_t __process_bio(struct mapped_device *md,
* Defend against IO still getting in during teardown
* - as was seen for a time with nvme-fcloop
*/
- if (unlikely(WARN_ON_ONCE(!ti || !dm_target_is_valid(ti)))) {
+ if (WARN_ON_ONCE(!ti || !dm_target_is_valid(ti))) {
error = -EIO;
goto out;
}
@@ -1806,8 +1806,6 @@ static void dm_wq_work(struct work_struct *work);
static void dm_init_normal_md_queue(struct mapped_device *md)
{
- md->use_blk_mq = false;
-
/*
* Initialize aspects of queue that aren't relevant for blk-mq
*/
@@ -1818,8 +1816,6 @@ static void cleanup_mapped_device(struct mapped_device *md)
{
if (md->wq)
destroy_workqueue(md->wq);
- if (md->kworker_task)
- kthread_stop(md->kworker_task);
bioset_exit(&md->bs);
bioset_exit(&md->io_bs);
@@ -1886,7 +1882,6 @@ static struct mapped_device *alloc_dev(int minor)
goto bad_io_barrier;
md->numa_node_id = numa_node_id;
- md->use_blk_mq = dm_use_blk_mq_default();
md->init_tio_pdu = false;
md->type = DM_TYPE_NONE;
mutex_init(&md->suspend_lock);
@@ -1917,7 +1912,6 @@ static struct mapped_device *alloc_dev(int minor)
INIT_WORK(&md->work, dm_wq_work);
init_waitqueue_head(&md->eventq);
init_completion(&md->kobj_holder.completion);
- md->kworker_task = NULL;
md->disk->major = _major;
md->disk->first_minor = minor;
@@ -2217,14 +2211,6 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
switch (type) {
case DM_TYPE_REQUEST_BASED:
- dm_init_normal_md_queue(md);
- r = dm_old_init_request_queue(md, t);
- if (r) {
- DMERR("Cannot initialize queue for request-based mapped device");
- return r;
- }
- break;
- case DM_TYPE_MQ_REQUEST_BASED:
r = dm_mq_init_request_queue(md, t);
if (r) {
DMERR("Cannot initialize queue for request-based dm-mq mapped device");
@@ -2329,9 +2315,6 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
blk_set_queue_dying(md->queue);
- if (dm_request_based(md) && md->kworker_task)
- kthread_flush_worker(&md->kworker);
-
/*
* Take suspend_lock so that presuspend and postsuspend methods
* do not race with internal suspend.
@@ -2584,11 +2567,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
* Stop md->queue before flushing md->wq in case request-based
* dm defers requests to md->wq from md->queue.
*/
- if (dm_request_based(md)) {
+ if (dm_request_based(md))
dm_stop_queue(md->queue);
- if (md->kworker_task)
- kthread_flush_worker(&md->kworker);
- }
flush_workqueue(md->wq);
@@ -2963,7 +2943,6 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu
goto out;
break;
case DM_TYPE_REQUEST_BASED:
- case DM_TYPE_MQ_REQUEST_BASED:
pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
/* per_io_data_size is used for blk-mq pdu at queue allocation */
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 114a81b27c37..2d539b82ec08 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -70,7 +70,6 @@ struct dm_target *dm_table_get_immutable_target(struct dm_table *t);
struct dm_target *dm_table_get_wildcard_target(struct dm_table *t);
bool dm_table_bio_based(struct dm_table *t);
bool dm_table_request_based(struct dm_table *t);
-bool dm_table_all_blk_mq_devices(struct dm_table *t);
void dm_table_free_md_mempools(struct dm_table *t);
struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 2fc8c113977f..1cd4f991792c 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -2288,9 +2288,9 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
goto out;
}
if (mddev->pers) {
- mddev->pers->quiesce(mddev, 1);
+ mddev_suspend(mddev);
md_bitmap_destroy(mddev);
- mddev->pers->quiesce(mddev, 0);
+ mddev_resume(mddev);
}
mddev->bitmap_info.offset = 0;
if (mddev->bitmap_info.file) {
@@ -2327,8 +2327,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
mddev->bitmap_info.offset = offset;
if (mddev->pers) {
struct bitmap *bitmap;
- mddev->pers->quiesce(mddev, 1);
bitmap = md_bitmap_create(mddev, -1);
+ mddev_suspend(mddev);
if (IS_ERR(bitmap))
rv = PTR_ERR(bitmap);
else {
@@ -2337,11 +2337,12 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
if (rv)
mddev->bitmap_info.offset = 0;
}
- mddev->pers->quiesce(mddev, 0);
if (rv) {
md_bitmap_destroy(mddev);
+ mddev_resume(mddev);
goto out;
}
+ mddev_resume(mddev);
}
}
}
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 0b2af6e74fc3..8dff19d5502e 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -33,13 +33,6 @@ struct dlm_lock_resource {
int mode;
};
-struct suspend_info {
- int slot;
- sector_t lo;
- sector_t hi;
- struct list_head list;
-};
-
struct resync_info {
__le64 lo;
__le64 hi;
@@ -80,7 +73,13 @@ struct md_cluster_info {
struct dlm_lock_resource **other_bitmap_lockres;
struct dlm_lock_resource *resync_lockres;
struct list_head suspend_list;
+
spinlock_t suspend_lock;
+ /* record the region which write should be suspended */
+ sector_t suspend_lo;
+ sector_t suspend_hi;
+ int suspend_from; /* the slot which broadcast suspend_lo/hi */
+
struct md_thread *recovery_thread;
unsigned long recovery_map;
/* communication loc resources */
@@ -105,6 +104,7 @@ enum msg_type {
RE_ADD,
BITMAP_NEEDS_SYNC,
CHANGE_CAPACITY,
+ BITMAP_RESIZE,
};
struct cluster_msg {
@@ -270,25 +270,22 @@ static void add_resync_info(struct dlm_lock_resource *lockres,
ri->hi = cpu_to_le64(hi);
}
-static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres)
+static int read_resync_info(struct mddev *mddev,
+ struct dlm_lock_resource *lockres)
{
struct resync_info ri;
- struct suspend_info *s = NULL;
- sector_t hi = 0;
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ int ret = 0;
dlm_lock_sync(lockres, DLM_LOCK_CR);
memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
- hi = le64_to_cpu(ri.hi);
- if (hi > 0) {
- s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
- if (!s)
- goto out;
- s->hi = hi;
- s->lo = le64_to_cpu(ri.lo);
+ if (le64_to_cpu(ri.hi) > 0) {
+ cinfo->suspend_hi = le64_to_cpu(ri.hi);
+ cinfo->suspend_lo = le64_to_cpu(ri.lo);
+ ret = 1;
}
dlm_unlock_sync(lockres);
-out:
- return s;
+ return ret;
}
static void recover_bitmaps(struct md_thread *thread)
@@ -298,7 +295,6 @@ static void recover_bitmaps(struct md_thread *thread)
struct dlm_lock_resource *bm_lockres;
char str[64];
int slot, ret;
- struct suspend_info *s, *tmp;
sector_t lo, hi;
while (cinfo->recovery_map) {
@@ -325,13 +321,17 @@ static void recover_bitmaps(struct md_thread *thread)
/* Clear suspend_area associated with the bitmap */
spin_lock_irq(&cinfo->suspend_lock);
- list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
- if (slot == s->slot) {
- list_del(&s->list);
- kfree(s);
- }
+ cinfo->suspend_hi = 0;
+ cinfo->suspend_lo = 0;
+ cinfo->suspend_from = -1;
spin_unlock_irq(&cinfo->suspend_lock);
+ /* Kick off a reshape if needed */
+ if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
+ test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ mddev->reshape_position != MaxSector)
+ md_wakeup_thread(mddev->sync_thread);
+
if (hi > 0) {
if (lo < mddev->recovery_cp)
mddev->recovery_cp = lo;
@@ -434,34 +434,23 @@ static void ack_bast(void *arg, int mode)
}
}
-static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
-{
- struct suspend_info *s, *tmp;
-
- list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
- if (slot == s->slot) {
- list_del(&s->list);
- kfree(s);
- break;
- }
-}
-
static void remove_suspend_info(struct mddev *mddev, int slot)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
mddev->pers->quiesce(mddev, 1);
spin_lock_irq(&cinfo->suspend_lock);
- __remove_suspend_info(cinfo, slot);
+ cinfo->suspend_hi = 0;
+ cinfo->suspend_lo = 0;
spin_unlock_irq(&cinfo->suspend_lock);
mddev->pers->quiesce(mddev, 0);
}
-
static void process_suspend_info(struct mddev *mddev,
int slot, sector_t lo, sector_t hi)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
- struct suspend_info *s;
+ struct mdp_superblock_1 *sb = NULL;
+ struct md_rdev *rdev;
if (!hi) {
/*
@@ -475,6 +464,12 @@ static void process_suspend_info(struct mddev *mddev,
return;
}
+ rdev_for_each(rdev, mddev)
+ if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) {
+ sb = page_address(rdev->sb_page);
+ break;
+ }
+
/*
* The bitmaps are not same for different nodes
* if RESYNCING is happening in one node, then
@@ -487,26 +482,26 @@ static void process_suspend_info(struct mddev *mddev,
* sync_low/hi is used to record the region which
* arrived in the previous RESYNCING message,
*
- * Call bitmap_sync_with_cluster to clear
- * NEEDED_MASK and set RESYNC_MASK since
- * resync thread is running in another node,
- * so we don't need to do the resync again
- * with the same section */
- md_bitmap_sync_with_cluster(mddev, cinfo->sync_low, cinfo->sync_hi, lo, hi);
+ * Call md_bitmap_sync_with_cluster to clear NEEDED_MASK
+ * and set RESYNC_MASK since resync thread is running
+ * in another node, so we don't need to do the resync
+ * again with the same section.
+ *
+ * Skip md_bitmap_sync_with_cluster in case reshape
+ * happening, because reshaping region is small and
+ * we don't want to trigger lots of WARN.
+ */
+ if (sb && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE))
+ md_bitmap_sync_with_cluster(mddev, cinfo->sync_low,
+ cinfo->sync_hi, lo, hi);
cinfo->sync_low = lo;
cinfo->sync_hi = hi;
- s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
- if (!s)
- return;
- s->slot = slot;
- s->lo = lo;
- s->hi = hi;
mddev->pers->quiesce(mddev, 1);
spin_lock_irq(&cinfo->suspend_lock);
- /* Remove existing entry (if exists) before adding */
- __remove_suspend_info(cinfo, slot);
- list_add(&s->list, &cinfo->suspend_list);
+ cinfo->suspend_from = slot;
+ cinfo->suspend_lo = lo;
+ cinfo->suspend_hi = hi;
spin_unlock_irq(&cinfo->suspend_lock);
mddev->pers->quiesce(mddev, 0);
}
@@ -612,6 +607,11 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
case BITMAP_NEEDS_SYNC:
__recover_slot(mddev, le32_to_cpu(msg->slot));
break;
+ case BITMAP_RESIZE:
+ if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0))
+ ret = md_bitmap_resize(mddev->bitmap,
+ le64_to_cpu(msg->high), 0, 0);
+ break;
default:
ret = -1;
pr_warn("%s:%d Received unknown message from %d\n",
@@ -800,7 +800,6 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
struct md_cluster_info *cinfo = mddev->cluster_info;
int i, ret = 0;
struct dlm_lock_resource *bm_lockres;
- struct suspend_info *s;
char str[64];
sector_t lo, hi;
@@ -819,16 +818,13 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
bm_lockres->flags |= DLM_LKF_NOQUEUE;
ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
if (ret == -EAGAIN) {
- s = read_resync_info(mddev, bm_lockres);
- if (s) {
+ if (read_resync_info(mddev, bm_lockres)) {
pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n",
__func__, __LINE__,
- (unsigned long long) s->lo,
- (unsigned long long) s->hi, i);
- spin_lock_irq(&cinfo->suspend_lock);
- s->slot = i;
- list_add(&s->list, &cinfo->suspend_list);
- spin_unlock_irq(&cinfo->suspend_lock);
+ (unsigned long long) cinfo->suspend_lo,
+ (unsigned long long) cinfo->suspend_hi,
+ i);
+ cinfo->suspend_from = i;
}
ret = 0;
lockres_free(bm_lockres);
@@ -1001,10 +997,17 @@ static int leave(struct mddev *mddev)
if (!cinfo)
return 0;
- /* BITMAP_NEEDS_SYNC message should be sent when node
+ /*
+ * BITMAP_NEEDS_SYNC message should be sent when node
* is leaving the cluster with dirty bitmap, also we
- * can only deliver it when dlm connection is available */
- if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector)
+ * can only deliver it when dlm connection is available.
+ *
+ * Also, we should send BITMAP_NEEDS_SYNC message in
+ * case reshaping is interrupted.
+ */
+ if ((cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) ||
+ (mddev->reshape_position != MaxSector &&
+ test_bit(MD_CLOSING, &mddev->flags)))
resync_bitmap(mddev);
set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
@@ -1102,6 +1105,80 @@ static void metadata_update_cancel(struct mddev *mddev)
unlock_comm(cinfo);
}
+static int update_bitmap_size(struct mddev *mddev, sector_t size)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ struct cluster_msg cmsg = {0};
+ int ret;
+
+ cmsg.type = cpu_to_le32(BITMAP_RESIZE);
+ cmsg.high = cpu_to_le64(size);
+ ret = sendmsg(cinfo, &cmsg, 0);
+ if (ret)
+ pr_err("%s:%d: failed to send BITMAP_RESIZE message (%d)\n",
+ __func__, __LINE__, ret);
+ return ret;
+}
+
+static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsize)
+{
+ struct bitmap_counts *counts;
+ char str[64];
+ struct dlm_lock_resource *bm_lockres;
+ struct bitmap *bitmap = mddev->bitmap;
+ unsigned long my_pages = bitmap->counts.pages;
+ int i, rv;
+
+ /*
+ * We need to ensure all the nodes can grow to a larger
+ * bitmap size before make the reshaping.
+ */
+ rv = update_bitmap_size(mddev, newsize);
+ if (rv)
+ return rv;
+
+ for (i = 0; i < mddev->bitmap_info.nodes; i++) {
+ if (i == md_cluster_ops->slot_number(mddev))
+ continue;
+
+ bitmap = get_bitmap_from_slot(mddev, i);
+ if (IS_ERR(bitmap)) {
+ pr_err("can't get bitmap from slot %d\n", i);
+ goto out;
+ }
+ counts = &bitmap->counts;
+
+ /*
+ * If we can hold the bitmap lock of one node then
+ * the slot is not occupied, update the pages.
+ */
+ snprintf(str, 64, "bitmap%04d", i);
+ bm_lockres = lockres_init(mddev, str, NULL, 1);
+ if (!bm_lockres) {
+ pr_err("Cannot initialize %s lock\n", str);
+ goto out;
+ }
+ bm_lockres->flags |= DLM_LKF_NOQUEUE;
+ rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
+ if (!rv)
+ counts->pages = my_pages;
+ lockres_free(bm_lockres);
+
+ if (my_pages != counts->pages)
+ /*
+ * Let's revert the bitmap size if one node
+ * can't resize bitmap
+ */
+ goto out;
+ }
+
+ return 0;
+out:
+ md_bitmap_free(bitmap);
+ update_bitmap_size(mddev, oldsize);
+ return -1;
+}
+
/*
* return 0 if all the bitmaps have the same sync_size
*/
@@ -1243,6 +1320,16 @@ static int resync_start(struct mddev *mddev)
return dlm_lock_sync_interruptible(cinfo->resync_lockres, DLM_LOCK_EX, mddev);
}
+static void resync_info_get(struct mddev *mddev, sector_t *lo, sector_t *hi)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+
+ spin_lock_irq(&cinfo->suspend_lock);
+ *lo = cinfo->suspend_lo;
+ *hi = cinfo->suspend_hi;
+ spin_unlock_irq(&cinfo->suspend_lock);
+}
+
static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
@@ -1295,21 +1382,14 @@ static int area_resyncing(struct mddev *mddev, int direction,
{
struct md_cluster_info *cinfo = mddev->cluster_info;
int ret = 0;
- struct suspend_info *s;
if ((direction == READ) &&
test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state))
return 1;
spin_lock_irq(&cinfo->suspend_lock);
- if (list_empty(&cinfo->suspend_list))
- goto out;
- list_for_each_entry(s, &cinfo->suspend_list, list)
- if (hi > s->lo && lo < s->hi) {
- ret = 1;
- break;
- }
-out:
+ if (hi > cinfo->suspend_lo && lo < cinfo->suspend_hi)
+ ret = 1;
spin_unlock_irq(&cinfo->suspend_lock);
return ret;
}
@@ -1482,6 +1562,7 @@ static struct md_cluster_operations cluster_ops = {
.resync_start = resync_start,
.resync_finish = resync_finish,
.resync_info_update = resync_info_update,
+ .resync_info_get = resync_info_get,
.metadata_update_start = metadata_update_start,
.metadata_update_finish = metadata_update_finish,
.metadata_update_cancel = metadata_update_cancel,
@@ -1492,6 +1573,7 @@ static struct md_cluster_operations cluster_ops = {
.remove_disk = remove_disk,
.load_bitmaps = load_bitmaps,
.gather_bitmaps = gather_bitmaps,
+ .resize_bitmaps = resize_bitmaps,
.lock_all_bitmaps = lock_all_bitmaps,
.unlock_all_bitmaps = unlock_all_bitmaps,
.update_size = update_size,
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index c0240708f443..a78e3021775d 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -14,6 +14,7 @@ struct md_cluster_operations {
int (*leave)(struct mddev *mddev);
int (*slot_number)(struct mddev *mddev);
int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
+ void (*resync_info_get)(struct mddev *mddev, sector_t *lo, sector_t *hi);
int (*metadata_update_start)(struct mddev *mddev);
int (*metadata_update_finish)(struct mddev *mddev);
void (*metadata_update_cancel)(struct mddev *mddev);
@@ -26,6 +27,7 @@ struct md_cluster_operations {
int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
void (*load_bitmaps)(struct mddev *mddev, int total_slots);
int (*gather_bitmaps)(struct md_rdev *rdev);
+ int (*resize_bitmaps)(struct mddev *mddev, sector_t newsize, sector_t oldsize);
int (*lock_all_bitmaps)(struct mddev *mddev);
void (*unlock_all_bitmaps)(struct mddev *mddev);
void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 63ceabb4e020..fc488cb30a94 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -452,10 +452,11 @@ static void md_end_flush(struct bio *fbio)
rdev_dec_pending(rdev, mddev);
if (atomic_dec_and_test(&fi->flush_pending)) {
- if (bio->bi_iter.bi_size == 0)
+ if (bio->bi_iter.bi_size == 0) {
/* an empty barrier - all done */
bio_endio(bio);
- else {
+ mempool_free(fi, mddev->flush_pool);
+ } else {
INIT_WORK(&fi->flush_work, submit_flushes);
queue_work(md_wq, &fi->flush_work);
}
@@ -509,10 +510,11 @@ void md_flush_request(struct mddev *mddev, struct bio *bio)
rcu_read_unlock();
if (atomic_dec_and_test(&fi->flush_pending)) {
- if (bio->bi_iter.bi_size == 0)
+ if (bio->bi_iter.bi_size == 0) {
/* an empty barrier - all done */
bio_endio(bio);
- else {
+ mempool_free(fi, mddev->flush_pool);
+ } else {
INIT_WORK(&fi->flush_work, submit_flushes);
queue_work(md_wq, &fi->flush_work);
}
@@ -5904,14 +5906,6 @@ static void __md_stop(struct mddev *mddev)
mddev->to_remove = &md_redundancy_group;
module_put(pers->owner);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-}
-
-void md_stop(struct mddev *mddev)
-{
- /* stop the array and free an attached data structures.
- * This is called from dm-raid
- */
- __md_stop(mddev);
if (mddev->flush_bio_pool) {
mempool_destroy(mddev->flush_bio_pool);
mddev->flush_bio_pool = NULL;
@@ -5920,6 +5914,14 @@ void md_stop(struct mddev *mddev)
mempool_destroy(mddev->flush_pool);
mddev->flush_pool = NULL;
}
+}
+
+void md_stop(struct mddev *mddev)
+{
+ /* stop the array and free any attached data structures.
+ * This is called from dm-raid.
+ */
+ __md_stop(mddev);
bioset_exit(&mddev->bio_set);
bioset_exit(&mddev->sync_set);
}
@@ -8370,9 +8372,17 @@ void md_do_sync(struct md_thread *thread)
else if (!mddev->bitmap)
j = mddev->recovery_cp;
- } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
max_sectors = mddev->resync_max_sectors;
- else {
+ /*
+ * If the original node aborts the reshape then we continue it,
+ * so set j again to avoid restarting the reshape from the
+ * very beginning
+ */
+ if (mddev_is_clustered(mddev) &&
+ mddev->reshape_position != MaxSector)
+ j = mddev->reshape_position;
+ } else {
/* recovery follows the physical size of devices */
max_sectors = mddev->dev_sectors;
j = MaxSector;
@@ -8623,8 +8633,10 @@ void md_do_sync(struct md_thread *thread)
mddev_lock_nointr(mddev);
md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
mddev_unlock(mddev);
- set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
+ if (!mddev_is_clustered(mddev)) {
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ }
}
spin_lock(&mddev->lock);
@@ -8790,6 +8802,18 @@ static void md_start_sync(struct work_struct *ws)
*/
void md_check_recovery(struct mddev *mddev)
{
+ if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
+ /* Write superblock - thread that called mddev_suspend()
+ * holds reconfig_mutex for us.
+ */
+ set_bit(MD_UPDATING_SB, &mddev->flags);
+ smp_mb__after_atomic();
+ if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
+ md_update_sb(mddev, 0);
+ clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
+ wake_up(&mddev->sb_wait);
+ }
+
if (mddev->suspended)
return;
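
The block moved to the top of md_check_recovery() relies on a set-flag / full-barrier / re-test sequence: set MD_UPDATING_SB, order it with smp_mb__after_atomic(), then re-check MD_ALLOW_SB_UPDATE before writing the superblock. A simplified userspace analogue using C11 atomics is sketched below; the flag names and structure here are illustrative, not the kernel implementation:

/* Illustrative userspace analogue (not kernel code) of the
 * MD_ALLOW_SB_UPDATE / MD_UPDATING_SB handshake: set the "updating"
 * flag, issue a full barrier, then re-check that the update is still
 * allowed before doing the work. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool allow_update = true;   /* ~ MD_ALLOW_SB_UPDATE */
static atomic_bool updating     = false;  /* ~ MD_UPDATING_SB */

static void write_superblock(void)
{
	puts("superblock written");
}

static void check_recovery(void)
{
	if (atomic_load(&allow_update)) {
		atomic_store(&updating, true);
		/* Full barrier, standing in for smp_mb__after_atomic():
		 * the store to "updating" is visible before we re-test. */
		atomic_thread_fence(memory_order_seq_cst);
		if (atomic_load(&allow_update))
			write_superblock();
		atomic_store(&updating, false);
		/* A real implementation would wake up waiters here. */
	}
}

int main(void)
{
	check_recovery();
	return 0;
}
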
@@ -8949,16 +8973,6 @@ void md_check_recovery(struct mddev *mddev)
unlock:
wake_up(&mddev->sb_wait);
mddev_unlock(mddev);
- } else if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
- /* Write superblock - thread that called mddev_suspend()
- * holds reconfig_mutex for us.
- */
- set_bit(MD_UPDATING_SB, &mddev->flags);
- smp_mb__after_atomic();
- if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
- md_update_sb(mddev, 0);
- clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
- wake_up(&mddev->sb_wait);
}
}
EXPORT_SYMBOL(md_check_recovery);
@@ -8966,6 +8980,8 @@ EXPORT_SYMBOL(md_check_recovery);
void md_reap_sync_thread(struct mddev *mddev)
{
struct md_rdev *rdev;
+ sector_t old_dev_sectors = mddev->dev_sectors;
+ bool is_reshaped = false;
/* resync has finished, collect result */
md_unregister_thread(&mddev->sync_thread);
@@ -8980,8 +8996,11 @@ void md_reap_sync_thread(struct mddev *mddev)
}
}
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
- mddev->pers->finish_reshape)
+ mddev->pers->finish_reshape) {
mddev->pers->finish_reshape(mddev);
+ if (mddev_is_clustered(mddev))
+ is_reshaped = true;
+ }
/* If array is no-longer degraded, then any saved_raid_disk
* information must be scrapped.
@@ -9002,6 +9021,14 @@ void md_reap_sync_thread(struct mddev *mddev)
clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ /*
+ * We call md_cluster_ops->update_size here because sync_size could
+ * have been changed by md_update_sb, and MD_RECOVERY_RESHAPE is
+ * cleared, so it is time to update the size across the cluster.
+ */
+ if (mddev_is_clustered(mddev) && is_reshaped
+ && !test_bit(MD_CLOSING, &mddev->flags))
+ md_cluster_ops->update_size(mddev, old_dev_sectors);
wake_up(&resync_wait);
/* flag recovery needed just to double check */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -9201,8 +9228,12 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
}
if (role != rdev2->raid_disk) {
- /* got activated */
- if (rdev2->raid_disk == -1 && role != 0xffff) {
+ /*
+ * got activated, unless a reshape is happening.
+ */
+ if (rdev2->raid_disk == -1 && role != 0xffff &&
+ !(le32_to_cpu(sb->feature_map) &
+ MD_FEATURE_RESHAPE_ACTIVE)) {
rdev2->saved_raid_disk = role;
ret = remove_and_add_spares(mddev, rdev2);
pr_info("Activated spare: %s\n",
@@ -9228,6 +9259,30 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
+ /*
+ * mddev->delta_disks has already been updated in update_raid_disks,
+ * so it is time to check for a reshape.
+ */
+ if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
+ (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
+ /*
+ * A reshape is happening on the remote node, so we need to
+ * update reshape_position and call start_reshape.
+ */
+ mddev->reshape_position = sb->reshape_position;
+ if (mddev->pers->update_reshape_pos)
+ mddev->pers->update_reshape_pos(mddev);
+ if (mddev->pers->start_reshape)
+ mddev->pers->start_reshape(mddev);
+ } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
+ mddev->reshape_position != MaxSector &&
+ !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
+ /* The reshape has just finished on another node. */
+ mddev->reshape_position = MaxSector;
+ if (mddev->pers->update_reshape_pos)
+ mddev->pers->update_reshape_pos(mddev);
+ }
+
/* Finally set the event to be up to date */
mddev->events = le64_to_cpu(sb->events);
}
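
check_sb_changes() now handles two reshape cases while MD_RESYNCING_REMOTE is set: if the remote superblock advertises MD_FEATURE_RESHAPE_ACTIVE, record its reshape_position and start the reshape locally; if the bit has been cleared while our reshape_position is still set, the remote reshape has finished and the position is reset to MaxSector. A compact sketch of that decision, with hypothetical types and names:

/* Illustrative sketch (not kernel code) of the reshape bookkeeping
 * added to check_sb_changes(): follow a remote reshape while the
 * feature bit is set, and clear our state once it disappears. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_SECTOR UINT64_MAX   /* stand-in for MaxSector */

struct state {
	bool     remote_resync;       /* ~ MD_RESYNCING_REMOTE */
	uint64_t reshape_position;
};

static void handle_remote_sb(struct state *s, bool reshape_active,
			     uint64_t sb_reshape_pos)
{
	if (!s->remote_resync)
		return;

	if (reshape_active) {
		/* Remote node is reshaping: track its position and join in. */
		s->reshape_position = sb_reshape_pos;
		printf("join reshape at %llu\n",
		       (unsigned long long)s->reshape_position);
	} else if (s->reshape_position != MAX_SECTOR) {
		/* Remote reshape finished: forget our recorded position. */
		s->reshape_position = MAX_SECTOR;
		puts("reshape done remotely");
	}
}

int main(void)
{
	struct state s = { .remote_resync = true,
			   .reshape_position = MAX_SECTOR };

	handle_remote_sb(&s, true, 1024);   /* remote reshape in progress */
	handle_remote_sb(&s, false, 0);     /* remote reshape finished    */
	return 0;
}
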
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 8afd6bfdbfb9..c52afb52c776 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -557,6 +557,7 @@ struct md_personality
int (*check_reshape) (struct mddev *mddev);
int (*start_reshape) (struct mddev *mddev);
void (*finish_reshape) (struct mddev *mddev);
+ void (*update_reshape_pos) (struct mddev *mddev);
/* quiesce suspends or resumes internal processing.
* 1 - stop new actions and wait for action io to complete
* 0 - return to normal behaviour
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4e990246225e..1d54109071cc 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1734,6 +1734,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
*/
if (rdev->saved_raid_disk >= 0 &&
rdev->saved_raid_disk >= first &&
+ rdev->saved_raid_disk < conf->raid_disks &&
conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
first = last = rdev->saved_raid_disk;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d6f7978b4449..b98e746e7fc4 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -25,6 +25,7 @@
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
+#include <linux/raid/md_p.h>
#include <trace/events/block.h>
#include "md.h"
#include "raid10.h"
@@ -1808,6 +1809,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
first = last = rdev->raid_disk;
if (rdev->saved_raid_disk >= first &&
+ rdev->saved_raid_disk < conf->geo.raid_disks &&
conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
mirror = rdev->saved_raid_disk;
else
@@ -3079,6 +3081,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
sector_t sect;
int must_sync;
int any_working;
+ int need_recover = 0;
+ int need_replace = 0;
struct raid10_info *mirror = &conf->mirrors[i];
struct md_rdev *mrdev, *mreplace;
@@ -3086,11 +3090,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
mrdev = rcu_dereference(mirror->rdev);
mreplace = rcu_dereference(mirror->replacement);
- if ((mrdev == NULL ||
- test_bit(Faulty, &mrdev->flags) ||
- test_bit(In_sync, &mrdev->flags)) &&
- (mreplace == NULL ||
- test_bit(Faulty, &mreplace->flags))) {
+ if (mrdev != NULL &&
+ !test_bit(Faulty, &mrdev->flags) &&
+ !test_bit(In_sync, &mrdev->flags))
+ need_recover = 1;
+ if (mreplace != NULL &&
+ !test_bit(Faulty, &mreplace->flags))
+ need_replace = 1;
+
+ if (!need_recover && !need_replace) {
rcu_read_unlock();
continue;
}
@@ -3213,7 +3221,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
r10_bio->devs[1].devnum = i;
r10_bio->devs[1].addr = to_addr;
- if (!test_bit(In_sync, &mrdev->flags)) {
+ if (need_recover) {
bio = r10_bio->devs[1].bio;
bio->bi_next = biolist;
biolist = bio;
@@ -3230,16 +3238,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
bio = r10_bio->devs[1].repl_bio;
if (bio)
bio->bi_end_io = NULL;
- /* Note: if mreplace != NULL, then bio
+ /* Note: if need_replace, then bio
* cannot be NULL as r10buf_pool_alloc will
* have allocated it.
- * So the second test here is pointless.
- * But it keeps semantic-checkers happy, and
- * this comment keeps human reviewers
- * happy.
*/
- if (mreplace == NULL || bio == NULL ||
- test_bit(Faulty, &mreplace->flags))
+ if (!need_replace)
break;
bio->bi_next = biolist;
biolist = bio;
@@ -4286,12 +4289,46 @@ static int raid10_start_reshape(struct mddev *mddev)
spin_unlock_irq(&conf->device_lock);
if (mddev->delta_disks && mddev->bitmap) {
- ret = md_bitmap_resize(mddev->bitmap,
- raid10_size(mddev, 0, conf->geo.raid_disks),
- 0, 0);
+ struct mdp_superblock_1 *sb = NULL;
+ sector_t oldsize, newsize;
+
+ oldsize = raid10_size(mddev, 0, 0);
+ newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
+
+ if (!mddev_is_clustered(mddev)) {
+ ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
+ if (ret)
+ goto abort;
+ else
+ goto out;
+ }
+
+ rdev_for_each(rdev, mddev) {
+ if (rdev->raid_disk > -1 &&
+ !test_bit(Faulty, &rdev->flags))
+ sb = page_address(rdev->sb_page);
+ }
+
+ /*
+ * Some node is already performing a reshape; there is no need
+ * to call md_bitmap_resize again since it will be called when
+ * the BITMAP_RESIZE msg is received
+ */
+ if ((sb && (le32_to_cpu(sb->feature_map) &
+ MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
+ goto out;
+
+ ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
if (ret)
goto abort;
+
+ ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
+ if (ret) {
+ md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
+ goto abort;
+ }
}
+out:
if (mddev->delta_disks > 0) {
rdev_for_each(rdev, mddev)
if (rdev->raid_disk < 0 &&
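
For clustered arrays, the hunk above resizes the bitmap in two phases: skip the resize when another node already has MD_FEATURE_RESHAPE_ACTIVE set (the BITMAP_RESIZE message will handle it) or when the size is unchanged; otherwise resize the local bitmap, ask the other nodes to resize theirs via resize_bitmaps(), and undo the local resize if that cluster-wide step fails. A rough sketch of that flow with stubbed helpers (names invented for illustration):

/* Illustrative sketch (not kernel code) of the clustered bitmap-resize
 * flow in raid10_start_reshape(): skip, or resize locally then
 * cluster-wide, rolling the local resize back on failure. */
#include <stdbool.h>
#include <stdio.h>

/* Stubs standing in for md_bitmap_resize() and resize_bitmaps(). */
static int resize_local(long size)        { printf("local  -> %ld\n", size); return 0; }
static int resize_cluster(long n, long o) { printf("cluster %ld -> %ld\n", o, n); return -1; }

static int start_reshape_bitmap(bool clustered, bool remote_reshape_active,
				long oldsize, long newsize)
{
	int ret;

	if (!clustered)
		return resize_local(newsize);

	/* Another node already started the reshape, or nothing to do. */
	if (remote_reshape_active || oldsize == newsize)
		return 0;

	ret = resize_local(newsize);
	if (ret)
		return ret;

	ret = resize_cluster(newsize, oldsize);
	if (ret)
		resize_local(oldsize);   /* roll back the local resize */
	return ret;
}

int main(void)
{
	/* Cluster-wide resize fails in this demo, so the rollback runs. */
	return start_reshape_bitmap(true, false, 1024, 2048) ? 1 : 0;
}
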
@@ -4568,6 +4605,32 @@ read_more:
r10_bio->master_bio = read_bio;
r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
+ /*
+ * Broadcast a RESYNC message to the other nodes so that no node
+ * writes to this region, avoiding conflicts.
+ */
+ if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) {
+ struct mdp_superblock_1 *sb = NULL;
+ int sb_reshape_pos = 0;
+
+ conf->cluster_sync_low = sector_nr;
+ conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
+ sb = page_address(rdev->sb_page);
+ if (sb) {
+ sb_reshape_pos = le64_to_cpu(sb->reshape_position);
+ /*
+ * Lower cluster_sync_low if the next address the array will
+ * reshape is below it, since we can't raise cluster_sync_low
+ * until the reshape of that region has finished.
+ */
+ if (sb_reshape_pos < conf->cluster_sync_low)
+ conf->cluster_sync_low = sb_reshape_pos;
+ }
+
+ md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
+ conf->cluster_sync_high);
+ }
+
/* Now find the locations in the new layout */
__raid10_find_phys(&conf->geo, r10_bio);
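
The hunk above advances a sliding resync window during reshape: once sector_nr passes cluster_sync_high, the window becomes [sector_nr, sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS), except that the low end is pulled down to the superblock's recorded reshape_position when that is lower, and the result is broadcast via resync_info_update(). A small sketch of just the window computation (the window size below is an arbitrary placeholder):

/* Illustrative sketch (not kernel code): compute the resync window
 * that the reshape loop broadcasts, pulling the low end down to the
 * superblock's recorded reshape position when that lies below sector_nr. */
#include <assert.h>
#include <stdint.h>

#define WINDOW_SECTORS 8192ULL   /* arbitrary stand-in for
                                    CLUSTER_RESYNC_WINDOW_SECTORS */

static void resync_window(uint64_t sector_nr, uint64_t sb_reshape_pos,
			  uint64_t *lo, uint64_t *hi)
{
	*lo = sector_nr;
	*hi = sector_nr + WINDOW_SECTORS;

	/* The on-disk superblock may still record an older reshape
	 * position; the window's low end must not move above it. */
	if (sb_reshape_pos < *lo)
		*lo = sb_reshape_pos;
}

int main(void)
{
	uint64_t lo, hi;

	resync_window(100000, 120000, &lo, &hi);  /* recorded pos ahead: keep sector_nr */
	assert(lo == 100000 && hi == 100000 + WINDOW_SECTORS);

	resync_window(100000, 90000, &lo, &hi);   /* recorded pos behind: lower the floor */
	assert(lo == 90000);
	return 0;
}
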
@@ -4719,6 +4782,19 @@ static void end_reshape(struct r10conf *conf)
conf->fullsync = 0;
}
+static void raid10_update_reshape_pos(struct mddev *mddev)
+{
+ struct r10conf *conf = mddev->private;
+ sector_t lo, hi;
+
+ md_cluster_ops->resync_info_get(mddev, &lo, &hi);
+ if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
+ || mddev->reshape_position == MaxSector)
+ conf->reshape_progress = mddev->reshape_position;
+ else
+ WARN_ON_ONCE(1);
+}
+
static int handle_reshape_read_error(struct mddev *mddev,
struct r10bio *r10_bio)
{
@@ -4887,6 +4963,7 @@ static struct md_personality raid10_personality =
.check_reshape = raid10_check_reshape,
.start_reshape = raid10_start_reshape,
.finish_reshape = raid10_finish_reshape,
+ .update_reshape_pos = raid10_update_reshape_pos,
.congested = raid10_congested,
};
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index e6e925add700..ec3a5ef7fee0 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -3151,8 +3151,6 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
return 0;
- rcu_assign_pointer(conf->log, NULL);
- md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
mempool_exit(&log->meta_pool);
out_mempool:
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e4e98f47865d..4990f0319f6c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2681,6 +2681,18 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
pr_debug("raid456: error called\n");
spin_lock_irqsave(&conf->device_lock, flags);
+
+ if (test_bit(In_sync, &rdev->flags) &&
+ mddev->degraded == conf->max_degraded) {
+ /*
+ * Don't allow the array to reach the failed state,
+ * and don't try to recover this device.
+ */
+ conf->recovery_disabled = mddev->recovery_disabled;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ return;
+ }
+
set_bit(Faulty, &rdev->flags);
clear_bit(In_sync, &rdev->flags);
mddev->degraded = raid5_calc_degraded(conf);