summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bitmap.c8
-rw-r--r--drivers/md/dm-bufio.c18
-rw-r--r--drivers/md/dm-cache-background-tracker.c5
-rw-r--r--drivers/md/dm-cache-policy-smq.c31
-rw-r--r--drivers/md/dm-cache-target.c27
-rw-r--r--drivers/md/dm-integrity.c42
-rw-r--r--drivers/md/dm-io.c4
-rw-r--r--drivers/md/dm-ioctl.c5
-rw-r--r--drivers/md/dm-mpath.c19
-rw-r--r--drivers/md/dm-raid.c17
-rw-r--r--drivers/md/dm-raid1.c23
-rw-r--r--drivers/md/dm-rq.c1
-rw-r--r--drivers/md/dm-snap-persistent.c3
-rw-r--r--drivers/md/dm-thin-metadata.c4
-rw-r--r--drivers/md/dm-thin.c26
-rw-r--r--drivers/md/dm-verity-target.c4
-rw-r--r--drivers/md/dm.c2
-rw-r--r--drivers/md/md-cluster.c4
-rw-r--r--drivers/md/md.c38
-rw-r--r--drivers/md/md.h3
-rw-r--r--drivers/md/persistent-data/dm-space-map-disk.c15
-rw-r--r--drivers/md/raid0.c116
-rw-r--r--drivers/md/raid1.c23
-rw-r--r--drivers/md/raid10.c10
-rw-r--r--drivers/md/raid5-cache.c51
-rw-r--r--drivers/md/raid5-log.h3
-rw-r--r--drivers/md/raid5-ppl.c4
-rw-r--r--drivers/md/raid5.c100
28 files changed, 398 insertions, 208 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index bf7419a56454..f4eace5ea184 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -485,10 +485,10 @@ void bitmap_print_sb(struct bitmap *bitmap)
pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic));
pr_debug(" version: %d\n", le32_to_cpu(sb->version));
pr_debug(" uuid: %08x.%08x.%08x.%08x\n",
- *(__u32 *)(sb->uuid+0),
- *(__u32 *)(sb->uuid+4),
- *(__u32 *)(sb->uuid+8),
- *(__u32 *)(sb->uuid+12));
+ le32_to_cpu(*(__u32 *)(sb->uuid+0)),
+ le32_to_cpu(*(__u32 *)(sb->uuid+4)),
+ le32_to_cpu(*(__u32 *)(sb->uuid+8)),
+ le32_to_cpu(*(__u32 *)(sb->uuid+12)));
pr_debug(" events: %llu\n",
(unsigned long long) le64_to_cpu(sb->events));
pr_debug("events cleared: %llu\n",
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 5db11a405129..840c1496b2b1 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -218,7 +218,7 @@ static DEFINE_SPINLOCK(param_spinlock);
* Buffers are freed after this timeout
*/
static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
-static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
+static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
@@ -1334,7 +1334,7 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c)
{
struct dm_io_request io_req = {
.bi_op = REQ_OP_WRITE,
- .bi_op_flags = REQ_PREFLUSH,
+ .bi_op_flags = REQ_PREFLUSH | REQ_SYNC,
.mem.type = DM_IO_KMEM,
.mem.ptr.addr = NULL,
.client = c->dm_io,
@@ -1558,10 +1558,10 @@ static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
return true;
}
-static unsigned get_retain_buffers(struct dm_bufio_client *c)
+static unsigned long get_retain_buffers(struct dm_bufio_client *c)
{
- unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
- return retain_bytes / c->block_size;
+ unsigned long retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
+ return retain_bytes >> (c->sectors_per_block_bits + SECTOR_SHIFT);
}
static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
@@ -1571,7 +1571,7 @@ static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
struct dm_buffer *b, *tmp;
unsigned long freed = 0;
unsigned long count = nr_to_scan;
- unsigned retain_target = get_retain_buffers(c);
+ unsigned long retain_target = get_retain_buffers(c);
for (l = 0; l < LIST_SIZE; l++) {
list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
@@ -1794,8 +1794,8 @@ static bool older_than(struct dm_buffer *b, unsigned long age_hz)
static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
{
struct dm_buffer *b, *tmp;
- unsigned retain_target = get_retain_buffers(c);
- unsigned count;
+ unsigned long retain_target = get_retain_buffers(c);
+ unsigned long count;
LIST_HEAD(write_list);
dm_bufio_lock(c);
@@ -1955,7 +1955,7 @@ MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
-module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR);
+module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c
index 9b1afdfb13f0..707233891291 100644
--- a/drivers/md/dm-cache-background-tracker.c
+++ b/drivers/md/dm-cache-background-tracker.c
@@ -33,6 +33,11 @@ struct background_tracker *btracker_create(unsigned max_work)
{
struct background_tracker *b = kmalloc(sizeof(*b), GFP_KERNEL);
+ if (!b) {
+ DMERR("couldn't create background_tracker");
+ return NULL;
+ }
+
b->max_work = max_work;
atomic_set(&b->pending_promotes, 0);
atomic_set(&b->pending_writebacks, 0);
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index 72479bd61e11..e5eb9c9b4bc8 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1120,8 +1120,6 @@ static bool clean_target_met(struct smq_policy *mq, bool idle)
* Cache entries may not be populated. So we cannot rely on the
* size of the clean queue.
*/
- unsigned nr_clean;
-
if (idle) {
/*
* We'd like to clean everything.
@@ -1129,18 +1127,16 @@ static bool clean_target_met(struct smq_policy *mq, bool idle)
return q_size(&mq->dirty) == 0u;
}
- nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
- return (nr_clean + btracker_nr_writebacks_queued(mq->bg_work)) >=
- percent_to_target(mq, CLEAN_TARGET);
+ /*
+ * If we're busy we don't worry about cleaning at all.
+ */
+ return true;
}
-static bool free_target_met(struct smq_policy *mq, bool idle)
+static bool free_target_met(struct smq_policy *mq)
{
unsigned nr_free;
- if (!idle)
- return true;
-
nr_free = from_cblock(mq->cache_size) - mq->cache_alloc.nr_allocated;
return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >=
percent_to_target(mq, FREE_TARGET);
@@ -1190,9 +1186,9 @@ static void queue_demotion(struct smq_policy *mq)
if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed)))
return;
- e = q_peek(&mq->clean, mq->clean.nr_levels, true);
+ e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
if (!e) {
- if (!clean_target_met(mq, false))
+ if (!clean_target_met(mq, true))
queue_writeback(mq);
return;
}
@@ -1220,7 +1216,7 @@ static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
* We always claim to be 'idle' to ensure some demotions happen
* with continuous loads.
*/
- if (!free_target_met(mq, true))
+ if (!free_target_met(mq))
queue_demotion(mq);
return;
}
@@ -1421,14 +1417,10 @@ static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
spin_lock_irqsave(&mq->lock, flags);
r = btracker_issue(mq->bg_work, result);
if (r == -ENODATA) {
- /* find some writeback work to do */
- if (mq->migrations_allowed && !free_target_met(mq, idle))
- queue_demotion(mq);
-
- else if (!clean_target_met(mq, idle))
+ if (!clean_target_met(mq, idle)) {
queue_writeback(mq);
-
- r = btracker_issue(mq->bg_work, result);
+ r = btracker_issue(mq->bg_work, result);
+ }
}
spin_unlock_irqrestore(&mq->lock, flags);
@@ -1452,6 +1444,7 @@ static void __complete_background_work(struct smq_policy *mq,
clear_pending(mq, e);
if (success) {
e->oblock = work->oblock;
+ e->level = NR_CACHE_LEVELS - 1;
push(mq, e);
// h, q, a
} else {
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 1db375f50a13..d682a0511381 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -94,6 +94,9 @@ static void iot_io_begin(struct io_tracker *iot, sector_t len)
static void __iot_io_end(struct io_tracker *iot, sector_t len)
{
+ if (!len)
+ return;
+
iot->in_flight -= len;
if (!iot->in_flight)
iot->idle_time = jiffies;
@@ -474,7 +477,7 @@ struct cache {
spinlock_t invalidation_lock;
struct list_head invalidation_requests;
- struct io_tracker origin_tracker;
+ struct io_tracker tracker;
struct work_struct commit_ws;
struct batcher committer;
@@ -901,8 +904,7 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
static bool accountable_bio(struct cache *cache, struct bio *bio)
{
- return ((bio->bi_bdev == cache->origin_dev->bdev) &&
- bio_op(bio) != REQ_OP_DISCARD);
+ return bio_op(bio) != REQ_OP_DISCARD;
}
static void accounted_begin(struct cache *cache, struct bio *bio)
@@ -912,7 +914,7 @@ static void accounted_begin(struct cache *cache, struct bio *bio)
if (accountable_bio(cache, bio)) {
pb->len = bio_sectors(bio);
- iot_io_begin(&cache->origin_tracker, pb->len);
+ iot_io_begin(&cache->tracker, pb->len);
}
}
@@ -921,7 +923,7 @@ static void accounted_complete(struct cache *cache, struct bio *bio)
size_t pb_data_size = get_per_bio_data_size(cache);
struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
- iot_io_end(&cache->origin_tracker, pb->len);
+ iot_io_end(&cache->tracker, pb->len);
}
static void accounted_request(struct cache *cache, struct bio *bio)
@@ -1716,20 +1718,19 @@ static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
enum busy {
IDLE,
- MODERATE,
BUSY
};
static enum busy spare_migration_bandwidth(struct cache *cache)
{
- bool idle = iot_idle_for(&cache->origin_tracker, HZ);
+ bool idle = iot_idle_for(&cache->tracker, HZ);
sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
cache->sectors_per_block;
- if (current_volume <= cache->migration_threshold)
- return idle ? IDLE : MODERATE;
+ if (idle && current_volume <= cache->migration_threshold)
+ return IDLE;
else
- return idle ? MODERATE : BUSY;
+ return BUSY;
}
static void inc_hit_counter(struct cache *cache, struct bio *bio)
@@ -2045,8 +2046,6 @@ static void check_migrations(struct work_struct *ws)
for (;;) {
b = spare_migration_bandwidth(cache);
- if (b == BUSY)
- break;
r = policy_get_background_work(cache->policy, b == IDLE, &op);
if (r == -ENODATA)
@@ -2717,7 +2716,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
batcher_init(&cache->committer, commit_op, cache,
issue_op, cache, cache->wq);
- iot_init(&cache->origin_tracker);
+ iot_init(&cache->tracker);
init_rwsem(&cache->background_work_lock);
prevent_background_work(cache);
@@ -2941,7 +2940,7 @@ static void cache_postsuspend(struct dm_target *ti)
cancel_delayed_work(&cache->waker);
flush_workqueue(cache->wq);
- WARN_ON(cache->origin_tracker.in_flight);
+ WARN_ON(cache->tracker.in_flight);
/*
* If it's a flush suspend there won't be any deferred bios, so this
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index c7f7c8d76576..93b181088168 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -783,7 +783,8 @@ static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsi
for (i = 0; i < commit_sections; i++)
rw_section_mac(ic, commit_start + i, true);
}
- rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, commit_sections, &io_comp);
+ rw_journal(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, commit_start,
+ commit_sections, &io_comp);
} else {
unsigned to_end;
io_comp.in_flight = (atomic_t)ATOMIC_INIT(2);
@@ -1104,10 +1105,13 @@ static void schedule_autocommit(struct dm_integrity_c *ic)
static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io *dio)
{
struct bio *bio;
- spin_lock_irq(&ic->endio_wait.lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&ic->endio_wait.lock, flags);
bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
bio_list_add(&ic->flush_bio_list, bio);
- spin_unlock_irq(&ic->endio_wait.lock);
+ spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
+
queue_work(ic->commit_wq, &ic->commit_work);
}
@@ -2374,21 +2378,6 @@ static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic)
blk_queue_max_integrity_segments(disk->queue, UINT_MAX);
}
-/* FIXME: use new kvmalloc */
-static void *dm_integrity_kvmalloc(size_t size, gfp_t gfp)
-{
- void *ptr = NULL;
-
- if (size <= PAGE_SIZE)
- ptr = kmalloc(size, GFP_KERNEL | gfp);
- if (!ptr && size <= KMALLOC_MAX_SIZE)
- ptr = kmalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | gfp);
- if (!ptr)
- ptr = __vmalloc(size, GFP_KERNEL | gfp, PAGE_KERNEL);
-
- return ptr;
-}
-
static void dm_integrity_free_page_list(struct dm_integrity_c *ic, struct page_list *pl)
{
unsigned i;
@@ -2407,7 +2396,7 @@ static struct page_list *dm_integrity_alloc_page_list(struct dm_integrity_c *ic)
struct page_list *pl;
unsigned i;
- pl = dm_integrity_kvmalloc(page_list_desc_size, __GFP_ZERO);
+ pl = kvmalloc(page_list_desc_size, GFP_KERNEL | __GFP_ZERO);
if (!pl)
return NULL;
@@ -2437,7 +2426,7 @@ static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_int
struct scatterlist **sl;
unsigned i;
- sl = dm_integrity_kvmalloc(ic->journal_sections * sizeof(struct scatterlist *), __GFP_ZERO);
+ sl = kvmalloc(ic->journal_sections * sizeof(struct scatterlist *), GFP_KERNEL | __GFP_ZERO);
if (!sl)
return NULL;
@@ -2453,7 +2442,7 @@ static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_int
n_pages = (end_index - start_index + 1);
- s = dm_integrity_kvmalloc(n_pages * sizeof(struct scatterlist), 0);
+ s = kvmalloc(n_pages * sizeof(struct scatterlist), GFP_KERNEL);
if (!s) {
dm_integrity_free_journal_scatterlist(ic, sl);
return NULL;
@@ -2617,7 +2606,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
goto bad;
}
- sg = dm_integrity_kvmalloc((ic->journal_pages + 1) * sizeof(struct scatterlist), 0);
+ sg = kvmalloc((ic->journal_pages + 1) * sizeof(struct scatterlist), GFP_KERNEL);
if (!sg) {
*error = "Unable to allocate sg list";
r = -ENOMEM;
@@ -2673,7 +2662,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
r = -ENOMEM;
goto bad;
}
- ic->sk_requests = dm_integrity_kvmalloc(ic->journal_sections * sizeof(struct skcipher_request *), __GFP_ZERO);
+ ic->sk_requests = kvmalloc(ic->journal_sections * sizeof(struct skcipher_request *), GFP_KERNEL | __GFP_ZERO);
if (!ic->sk_requests) {
*error = "Unable to allocate sk requests";
r = -ENOMEM;
@@ -2740,7 +2729,7 @@ retest_commit_id:
r = -ENOMEM;
goto bad;
}
- ic->journal_tree = dm_integrity_kvmalloc(journal_tree_size, 0);
+ ic->journal_tree = kvmalloc(journal_tree_size, GFP_KERNEL);
if (!ic->journal_tree) {
*error = "Could not allocate memory for journal tree";
r = -ENOMEM;
@@ -3054,6 +3043,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->error = "The device is too small";
goto bad;
}
+ if (ti->len > ic->provided_data_sectors) {
+ r = -EINVAL;
+ ti->error = "Not enough provided sectors for requested mapping size";
+ goto bad;
+ }
if (!buffer_sectors)
buffer_sectors = 1;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 3702e502466d..8d5ca30f6551 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -317,8 +317,8 @@ static void do_region(int op, int op_flags, unsigned region,
else if (op == REQ_OP_WRITE_SAME)
special_cmd_max_sectors = q->limits.max_write_same_sectors;
if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES ||
- op == REQ_OP_WRITE_SAME) &&
- special_cmd_max_sectors == 0) {
+ op == REQ_OP_WRITE_SAME) && special_cmd_max_sectors == 0) {
+ atomic_inc(&io->count);
dec_count(io, region, -EOPNOTSUPP);
return;
}
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 0555b4410e05..41852ae287a5 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1710,12 +1710,13 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern
}
/*
- * Try to avoid low memory issues when a device is suspended.
+ * Use __GFP_HIGH to avoid low memory issues when a device is
+ * suspended and the ioctl is needed to resume it.
* Use kmalloc() rather than vmalloc() when we can.
*/
dmi = NULL;
noio_flag = memalloc_noio_save();
- dmi = kvmalloc(param_kernel->data_size, GFP_KERNEL);
+ dmi = kvmalloc(param_kernel->data_size, GFP_KERNEL | __GFP_HIGH);
memalloc_noio_restore(noio_flag);
if (!dmi) {
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 926a6bcb32c8..3df056b73b66 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -447,7 +447,7 @@ failed:
* it has been invoked.
*/
#define dm_report_EIO(m) \
-({ \
+do { \
struct mapped_device *md = dm_table_get_md((m)->ti->table); \
\
pr_debug("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d\n", \
@@ -455,8 +455,7 @@ failed:
test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags), \
test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \
dm_noflush_suspending((m)->ti)); \
- -EIO; \
-})
+} while (0)
/*
* Map cloned requests (request-based multipath)
@@ -481,7 +480,8 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
if (!pgpath) {
if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
return DM_MAPIO_DELAY_REQUEUE;
- return dm_report_EIO(m); /* Failed */
+ dm_report_EIO(m); /* Failed */
+ return DM_MAPIO_KILL;
} else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
if (pg_init_all_paths(m))
@@ -558,7 +558,8 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m
if (!pgpath) {
if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
return DM_MAPIO_REQUEUE;
- return dm_report_EIO(m);
+ dm_report_EIO(m);
+ return -EIO;
}
mpio->pgpath = pgpath;
@@ -1493,7 +1494,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
if (atomic_read(&m->nr_valid_paths) == 0 &&
!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
if (error == -EIO)
- error = dm_report_EIO(m);
+ dm_report_EIO(m);
/* complete with the original error */
r = DM_ENDIO_DONE;
}
@@ -1524,8 +1525,10 @@ static int do_end_io_bio(struct multipath *m, struct bio *clone,
fail_path(mpio->pgpath);
if (atomic_read(&m->nr_valid_paths) == 0 &&
- !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
- return dm_report_EIO(m);
+ !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
+ dm_report_EIO(m);
+ return -EIO;
+ }
/* Queue for the daemon to resubmit */
dm_bio_restore(get_bio_details_from_bio(clone), clone);
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 7d893228c40f..b4b75dad816a 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1927,7 +1927,7 @@ struct dm_raid_superblock {
/********************************************************************
* BELOW FOLLOW V1.9.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!!
*
- * FEATURE_FLAG_SUPPORTS_V190 in the features member indicates that those exist
+ * FEATURE_FLAG_SUPPORTS_V190 in the compat_features member indicates that those exist
*/
__le32 flags; /* Flags defining array states for reshaping */
@@ -2092,6 +2092,11 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->layout = cpu_to_le32(mddev->layout);
sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
+ /********************************************************************
+ * BELOW FOLLOW V1.9.0 EXTENSIONS TO THE PRISTINE SUPERBLOCK FORMAT!!!
+ *
+ * FEATURE_FLAG_SUPPORTS_V190 in the compat_features member indicates that those exist
+ */
sb->new_level = cpu_to_le32(mddev->new_level);
sb->new_layout = cpu_to_le32(mddev->new_layout);
sb->new_stripe_sectors = cpu_to_le32(mddev->new_chunk_sectors);
@@ -2438,8 +2443,14 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
if (!test_and_clear_bit(FirstUse, &rdev->flags)) {
- /* Retrieve device size stored in superblock to be prepared for shrink */
- rdev->sectors = le64_to_cpu(sb->sectors);
+ /*
+ * Retrieve rdev size stored in superblock to be prepared for shrink.
+ * Check extended superblock members are present otherwise the size
+ * will not be set!
+ */
+ if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190)
+ rdev->sectors = le64_to_cpu(sb->sectors);
+
rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
if (rdev->recovery_offset == MaxSector)
set_bit(In_sync, &rdev->flags);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index a95cbb80fb34..4da8858856fb 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -145,6 +145,7 @@ static void dispatch_bios(void *context, struct bio_list *bio_list)
struct dm_raid1_bio_record {
struct mirror *m;
+ /* if details->bi_bdev == NULL, details were not saved */
struct dm_bio_details details;
region_t write_region;
};
@@ -260,7 +261,7 @@ static int mirror_flush(struct dm_target *ti)
struct mirror *m;
struct dm_io_request io_req = {
.bi_op = REQ_OP_WRITE,
- .bi_op_flags = REQ_PREFLUSH,
+ .bi_op_flags = REQ_PREFLUSH | REQ_SYNC,
.mem.type = DM_IO_KMEM,
.mem.ptr.addr = NULL,
.client = ms->io_client,
@@ -1198,6 +1199,8 @@ static int mirror_map(struct dm_target *ti, struct bio *bio)
struct dm_raid1_bio_record *bio_record =
dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record));
+ bio_record->details.bi_bdev = NULL;
+
if (rw == WRITE) {
/* Save region for mirror_end_io() handler */
bio_record->write_region = dm_rh_bio_to_region(ms->rh, bio);
@@ -1256,12 +1259,22 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
}
if (error == -EOPNOTSUPP)
- return error;
+ goto out;
if ((error == -EWOULDBLOCK) && (bio->bi_opf & REQ_RAHEAD))
- return error;
+ goto out;
if (unlikely(error)) {
+ if (!bio_record->details.bi_bdev) {
+ /*
+ * There wasn't enough memory to record necessary
+ * information for a retry or there was no other
+ * mirror in-sync.
+ */
+ DMERR_LIMIT("Mirror read failed.");
+ return -EIO;
+ }
+
m = bio_record->m;
DMERR("Mirror read failed from %s. Trying alternative device.",
@@ -1277,6 +1290,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
bd = &bio_record->details;
dm_bio_restore(bd, bio);
+ bio_record->details.bi_bdev = NULL;
bio->bi_error = 0;
queue_bio(ms, bio, rw);
@@ -1285,6 +1299,9 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
DMERR("All replicated volumes dead, failing I/O");
}
+out:
+ bio_record->details.bi_bdev = NULL;
+
return error;
}
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 2af27026aa2e..b639fa7246ee 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -507,6 +507,7 @@ static int map_request(struct dm_rq_target_io *tio)
case DM_MAPIO_KILL:
/* The target wants to complete the I/O */
dm_kill_unmapped_request(rq, -EIO);
+ break;
default:
DMWARN("unimplemented target map return value: %d", r);
BUG();
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index b93476c3ba3f..c5534d294773 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -741,7 +741,8 @@ static void persistent_commit_exception(struct dm_exception_store *store,
/*
* Commit exceptions to disk.
*/
- if (ps->valid && area_io(ps, REQ_OP_WRITE, REQ_PREFLUSH | REQ_FUA))
+ if (ps->valid && area_io(ps, REQ_OP_WRITE,
+ REQ_PREFLUSH | REQ_FUA | REQ_SYNC))
ps->valid = 0;
/*
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 0f0251d0d337..d31d18d9727c 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -484,11 +484,11 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd)
if (r < 0)
return r;
- r = save_sm_roots(pmd);
+ r = dm_tm_pre_commit(pmd->tm);
if (r < 0)
return r;
- r = dm_tm_pre_commit(pmd->tm);
+ r = save_sm_roots(pmd);
if (r < 0)
return r;
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 17ad50daed08..28808e5ec0fd 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1094,6 +1094,19 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
return;
}
+ /*
+ * Increment the unmapped blocks. This prevents a race between the
+ * passdown io and reallocation of freed blocks.
+ */
+ r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
+ if (r) {
+ metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
+ bio_io_error(m->bio);
+ cell_defer_no_holder(tc, m->cell);
+ mempool_free(m, pool->mapping_pool);
+ return;
+ }
+
discard_parent = bio_alloc(GFP_NOIO, 1);
if (!discard_parent) {
DMWARN("%s: unable to allocate top level discard bio for passdown. Skipping passdown.",
@@ -1114,19 +1127,6 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
end_discard(&op, r);
}
}
-
- /*
- * Increment the unmapped blocks. This prevents a race between the
- * passdown io and reallocation of freed blocks.
- */
- r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
- if (r) {
- metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
- bio_io_error(m->bio);
- cell_defer_no_holder(tc, m->cell);
- mempool_free(m, pool->mapping_pool);
- return;
- }
}
static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 97de961a3bfc..1ec9b2c51c07 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -166,7 +166,7 @@ static int verity_hash_init(struct dm_verity *v, struct ahash_request *req,
return r;
}
- if (likely(v->version >= 1))
+ if (likely(v->salt_size && (v->version >= 1)))
r = verity_hash_update(v, req, v->salt, v->salt_size, res);
return r;
@@ -177,7 +177,7 @@ static int verity_hash_final(struct dm_verity *v, struct ahash_request *req,
{
int r;
- if (unlikely(!v->version)) {
+ if (unlikely(v->salt_size && (!v->version))) {
r = verity_hash_update(v, req, v->salt, v->salt_size, res);
if (r < 0) {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 6ef9500226c0..37ccd73c79ec 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1657,7 +1657,7 @@ static struct mapped_device *alloc_dev(int minor)
bio_init(&md->flush_bio, NULL, 0);
md->flush_bio.bi_bdev = md->bdev;
- md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+ md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
dm_stats_init(&md->stats);
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 7299ce2f08a8..03082e17c65c 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -1311,8 +1311,10 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
lock_comm(cinfo, 1);
ret = __sendmsg(cinfo, &cmsg);
- if (ret)
+ if (ret) {
+ unlock_comm(cinfo);
return ret;
+ }
cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE;
ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX);
cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 82f798be964f..87edc342ccb3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -765,7 +765,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
test_bit(FailFast, &rdev->flags) &&
!test_bit(LastDev, &rdev->flags))
ff = MD_FAILFAST;
- bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA | ff;
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff;
atomic_inc(&mddev->pending_writes);
submit_bio(bio);
@@ -5174,6 +5174,18 @@ static void mddev_delayed_delete(struct work_struct *ws)
static void no_op(struct percpu_ref *r) {}
+int mddev_init_writes_pending(struct mddev *mddev)
+{
+ if (mddev->writes_pending.percpu_count_ptr)
+ return 0;
+ if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0)
+ return -ENOMEM;
+ /* We want to start with the refcount at zero */
+ percpu_ref_put(&mddev->writes_pending);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
+
static int md_alloc(dev_t dev, char *name)
{
/*
@@ -5239,10 +5251,6 @@ static int md_alloc(dev_t dev, char *name)
blk_queue_make_request(mddev->queue, md_make_request);
blk_set_stacking_limits(&mddev->queue->limits);
- if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0)
- goto abort;
- /* We want to start with the refcount at zero */
- percpu_ref_put(&mddev->writes_pending);
disk = alloc_disk(1 << shift);
if (!disk) {
blk_cleanup_queue(mddev->queue);
@@ -8022,18 +8030,15 @@ EXPORT_SYMBOL(md_write_end);
* may proceed without blocking. It is important to call this before
* attempting a GFP_KERNEL allocation while holding the mddev lock.
* Must be called with mddev_lock held.
- *
- * In the ->external case MD_SB_CHANGE_PENDING can not be cleared until mddev->lock
- * is dropped, so return -EAGAIN after notifying userspace.
*/
-int md_allow_write(struct mddev *mddev)
+void md_allow_write(struct mddev *mddev)
{
if (!mddev->pers)
- return 0;
+ return;
if (mddev->ro)
- return 0;
+ return;
if (!mddev->pers->sync_request)
- return 0;
+ return;
spin_lock(&mddev->lock);
if (mddev->in_sync) {
@@ -8046,13 +8051,12 @@ int md_allow_write(struct mddev *mddev)
spin_unlock(&mddev->lock);
md_update_sb(mddev, 0);
sysfs_notify_dirent_safe(mddev->sysfs_state);
+ /* wait for the dirty state to be recorded in the metadata */
+ wait_event(mddev->sb_wait,
+ !test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) &&
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
} else
spin_unlock(&mddev->lock);
-
- if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
- return -EAGAIN;
- else
- return 0;
}
EXPORT_SYMBOL_GPL(md_allow_write);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 4e75d121bfcc..0fa1de42c42b 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -648,6 +648,7 @@ extern void md_unregister_thread(struct md_thread **threadp);
extern void md_wakeup_thread(struct md_thread *thread);
extern void md_check_recovery(struct mddev *mddev);
extern void md_reap_sync_thread(struct mddev *mddev);
+extern int mddev_init_writes_pending(struct mddev *mddev);
extern void md_write_start(struct mddev *mddev, struct bio *bi);
extern void md_write_inc(struct mddev *mddev, struct bio *bi);
extern void md_write_end(struct mddev *mddev);
@@ -665,7 +666,7 @@ extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
bool metadata_op);
extern void md_do_sync(struct md_thread *thread);
extern void md_new_event(struct mddev *mddev);
-extern int md_allow_write(struct mddev *mddev);
+extern void md_allow_write(struct mddev *mddev);
extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
extern int md_check_no_bitmap(struct mddev *mddev);
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
index ebb280a14325..32adf6b4a9c7 100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -142,10 +142,23 @@ static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b)
static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
{
+ int r;
+ uint32_t old_count;
enum allocation_event ev;
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
- return sm_ll_dec(&smd->ll, b, &ev);
+ r = sm_ll_dec(&smd->ll, b, &ev);
+ if (!r && (ev == SM_FREE)) {
+ /*
+ * It's only free if it's also free in the last
+ * transaction.
+ */
+ r = sm_ll_lookup(&smd->old_ll, b, &old_count);
+ if (!r && !old_count)
+ smd->nr_allocated_this_transaction--;
+ }
+
+ return r;
}
static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 84e58596594d..d6c0bc76e837 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -385,7 +385,7 @@ static int raid0_run(struct mddev *mddev)
blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
- blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
+ blk_queue_max_discard_sectors(mddev->queue, UINT_MAX);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
blk_queue_io_opt(mddev->queue,
@@ -459,6 +459,95 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
}
}
+static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
+{
+ struct r0conf *conf = mddev->private;
+ struct strip_zone *zone;
+ sector_t start = bio->bi_iter.bi_sector;
+ sector_t end;
+ unsigned int stripe_size;
+ sector_t first_stripe_index, last_stripe_index;
+ sector_t start_disk_offset;
+ unsigned int start_disk_index;
+ sector_t end_disk_offset;
+ unsigned int end_disk_index;
+ unsigned int disk;
+
+ zone = find_zone(conf, &start);
+
+ if (bio_end_sector(bio) > zone->zone_end) {
+ struct bio *split = bio_split(bio,
+ zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
+ mddev->bio_set);
+ bio_chain(split, bio);
+ generic_make_request(bio);
+ bio = split;
+ end = zone->zone_end;
+ } else
+ end = bio_end_sector(bio);
+
+ if (zone != conf->strip_zone)
+ end = end - zone[-1].zone_end;
+
+ /* Now start and end is the offset in zone */
+ stripe_size = zone->nb_dev * mddev->chunk_sectors;
+
+ first_stripe_index = start;
+ sector_div(first_stripe_index, stripe_size);
+ last_stripe_index = end;
+ sector_div(last_stripe_index, stripe_size);
+
+ start_disk_index = (int)(start - first_stripe_index * stripe_size) /
+ mddev->chunk_sectors;
+ start_disk_offset = ((int)(start - first_stripe_index * stripe_size) %
+ mddev->chunk_sectors) +
+ first_stripe_index * mddev->chunk_sectors;
+ end_disk_index = (int)(end - last_stripe_index * stripe_size) /
+ mddev->chunk_sectors;
+ end_disk_offset = ((int)(end - last_stripe_index * stripe_size) %
+ mddev->chunk_sectors) +
+ last_stripe_index * mddev->chunk_sectors;
+
+ for (disk = 0; disk < zone->nb_dev; disk++) {
+ sector_t dev_start, dev_end;
+ struct bio *discard_bio = NULL;
+ struct md_rdev *rdev;
+
+ if (disk < start_disk_index)
+ dev_start = (first_stripe_index + 1) *
+ mddev->chunk_sectors;
+ else if (disk > start_disk_index)
+ dev_start = first_stripe_index * mddev->chunk_sectors;
+ else
+ dev_start = start_disk_offset;
+
+ if (disk < end_disk_index)
+ dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
+ else if (disk > end_disk_index)
+ dev_end = last_stripe_index * mddev->chunk_sectors;
+ else
+ dev_end = end_disk_offset;
+
+ if (dev_end <= dev_start)
+ continue;
+
+ rdev = conf->devlist[(zone - conf->strip_zone) *
+ conf->strip_zone[0].nb_dev + disk];
+ if (__blkdev_issue_discard(rdev->bdev,
+ dev_start + zone->dev_start + rdev->data_offset,
+ dev_end - dev_start, GFP_NOIO, 0, &discard_bio) ||
+ !discard_bio)
+ continue;
+ bio_chain(discard_bio, bio);
+ if (mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(rdev->bdev),
+ discard_bio, disk_devt(mddev->gendisk),
+ bio->bi_iter.bi_sector);
+ generic_make_request(discard_bio);
+ }
+ bio_endio(bio);
+}
+
static void raid0_make_request(struct mddev *mddev, struct bio *bio)
{
struct strip_zone *zone;
@@ -473,6 +562,11 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
return;
}
+ if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
+ raid0_handle_discard(mddev, bio);
+ return;
+ }
+
bio_sector = bio->bi_iter.bi_sector;
sector = bio_sector;
chunk_sects = mddev->chunk_sectors;
@@ -498,19 +592,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
bio->bi_iter.bi_sector = sector + zone->dev_start +
tmp_dev->data_offset;
- if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
- /* Just ignore it */
- bio_endio(bio);
- } else {
- if (mddev->gendisk)
- trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
- bio, disk_devt(mddev->gendisk),
- bio_sector);
- mddev_check_writesame(mddev, bio);
- mddev_check_write_zeroes(mddev, bio);
- generic_make_request(bio);
- }
+ if (mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+ bio, disk_devt(mddev->gendisk),
+ bio_sector);
+ mddev_check_writesame(mddev, bio);
+ mddev_check_write_zeroes(mddev, bio);
+ generic_make_request(bio);
}
static void raid0_status(struct seq_file *seq, struct mddev *mddev)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7ed59351fe97..e1a7e3d4c5e4 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -666,8 +666,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
break;
}
continue;
- } else
+ } else {
+ if ((sectors > best_good_sectors) && (best_disk >= 0))
+ best_disk = -1;
best_good_sectors = sectors;
+ }
if (best_disk >= 0)
/* At least two disks to choose from so failfast is OK */
@@ -1529,17 +1532,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
plug = container_of(cb, struct raid1_plug_cb, cb);
else
plug = NULL;
- spin_lock_irqsave(&conf->device_lock, flags);
if (plug) {
bio_list_add(&plug->pending, mbio);
plug->pending_cnt++;
} else {
+ spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
conf->pending_count++;
- }
- spin_unlock_irqrestore(&conf->device_lock, flags);
- if (!plug)
+ spin_unlock_irqrestore(&conf->device_lock, flags);
md_wakeup_thread(mddev->thread);
+ }
}
r1_bio_write_done(r1_bio);
@@ -3061,6 +3063,8 @@ static int raid1_run(struct mddev *mddev)
mdname(mddev));
return -EIO;
}
+ if (mddev_init_writes_pending(mddev) < 0)
+ return -ENOMEM;
/*
* copy the already verified devices into our private RAID1
* bookkeeping area. [whatever we allocate in run(),
@@ -3197,7 +3201,7 @@ static int raid1_reshape(struct mddev *mddev)
struct r1conf *conf = mddev->private;
int cnt, raid_disks;
unsigned long flags;
- int d, d2, err;
+ int d, d2;
/* Cannot change chunk_size, layout, or level */
if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
@@ -3209,11 +3213,8 @@ static int raid1_reshape(struct mddev *mddev)
return -EINVAL;
}
- if (!mddev_is_clustered(mddev)) {
- err = md_allow_write(mddev);
- if (err)
- return err;
- }
+ if (!mddev_is_clustered(mddev))
+ md_allow_write(mddev);
raid_disks = mddev->raid_disks + mddev->delta_disks;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6b86a0032cf8..797ed60abd5e 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1282,17 +1282,16 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
plug = container_of(cb, struct raid10_plug_cb, cb);
else
plug = NULL;
- spin_lock_irqsave(&conf->device_lock, flags);
if (plug) {
bio_list_add(&plug->pending, mbio);
plug->pending_cnt++;
} else {
+ spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
conf->pending_count++;
- }
- spin_unlock_irqrestore(&conf->device_lock, flags);
- if (!plug)
+ spin_unlock_irqrestore(&conf->device_lock, flags);
md_wakeup_thread(mddev->thread);
+ }
}
static void raid10_write_request(struct mddev *mddev, struct bio *bio,
@@ -3612,6 +3611,9 @@ static int raid10_run(struct mddev *mddev)
int first = 1;
bool discard_supported = false;
+ if (mddev_init_writes_pending(mddev) < 0)
+ return -ENOMEM;
+
if (mddev->private == NULL) {
conf = setup_conf(mddev);
if (IS_ERR(conf))
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 26ba09282e7c..0a7af8b0a80a 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -24,6 +24,7 @@
#include "md.h"
#include "raid5.h"
#include "bitmap.h"
+#include "raid5-log.h"
/*
* metadata/data stored in disk with 4k size unit (a block) regardless
@@ -622,20 +623,30 @@ static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
spin_unlock_irqrestore(&log->io_list_lock, flags);
+ /*
+ * In case of journal device failures, submit_bio will get error
+ * and calls endio, then active stripes will continue write
+ * process. Therefore, it is not necessary to check Faulty bit
+ * of journal device here.
+ *
+ * We can't check split_bio after current_bio is submitted. If
+ * io->split_bio is null, after current_bio is submitted, current_bio
+ * might already be completed and the io_unit is freed. We submit
+ * split_bio first to avoid the issue.
+ */
+ if (io->split_bio) {
+ if (io->has_flush)
+ io->split_bio->bi_opf |= REQ_PREFLUSH;
+ if (io->has_fua)
+ io->split_bio->bi_opf |= REQ_FUA;
+ submit_bio(io->split_bio);
+ }
+
if (io->has_flush)
io->current_bio->bi_opf |= REQ_PREFLUSH;
if (io->has_fua)
io->current_bio->bi_opf |= REQ_FUA;
submit_bio(io->current_bio);
-
- if (!io->split_bio)
- return;
-
- if (io->has_flush)
- io->split_bio->bi_opf |= REQ_PREFLUSH;
- if (io->has_fua)
- io->split_bio->bi_opf |= REQ_FUA;
- submit_bio(io->split_bio);
}
/* deferred io_unit will be dispatched here */
@@ -670,6 +681,11 @@ static void r5c_disable_writeback_async(struct work_struct *work)
return;
pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
mdname(mddev));
+
+ /* wait superblock change before suspend */
+ wait_event(mddev->sb_wait,
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
+
mddev_suspend(mddev);
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
mddev_resume(mddev);
@@ -1766,7 +1782,7 @@ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
mb, PAGE_SIZE));
if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
- REQ_FUA, false)) {
+ REQ_SYNC | REQ_FUA, false)) {
__free_page(page);
return -EIO;
}
@@ -2372,7 +2388,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
mb, PAGE_SIZE));
sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
- REQ_OP_WRITE, REQ_FUA, false);
+ REQ_OP_WRITE, REQ_SYNC | REQ_FUA, false);
sh->log_start = ctx->pos;
list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
atomic_inc(&log->stripe_in_journal_count);
@@ -2621,8 +2637,11 @@ int r5c_try_caching_write(struct r5conf *conf,
* When run in degraded mode, array is set to write-through mode.
* This check helps drain pending write safely in the transition to
* write-through mode.
+ *
+ * When a stripe is syncing, the write is also handled in write
+ * through mode.
*/
- if (s->failed) {
+ if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) {
r5c_make_stripe_write_out(sh);
return -EAGAIN;
}
@@ -2825,6 +2844,9 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
}
r5l_append_flush_payload(log, sh->sector);
+ /* stripe is flused to raid disks, we can do resync now */
+ if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
+ set_bit(STRIPE_HANDLE, &sh->state);
}
int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
@@ -2973,7 +2995,7 @@ ioerr:
return ret;
}
-void r5c_update_on_rdev_error(struct mddev *mddev)
+void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
{
struct r5conf *conf = mddev->private;
struct r5l_log *log = conf->log;
@@ -2981,7 +3003,8 @@ void r5c_update_on_rdev_error(struct mddev *mddev)
if (!log)
return;
- if (raid5_calc_degraded(conf) > 0 &&
+ if ((raid5_calc_degraded(conf) > 0 ||
+ test_bit(Journal, &rdev->flags)) &&
conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
schedule_work(&log->disable_writeback_work);
}
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 27097101ccca..328d67aedda4 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -28,7 +28,8 @@ extern void r5c_flush_cache(struct r5conf *conf, int num);
extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
extern void r5c_check_cached_full_stripe(struct r5conf *conf);
extern struct md_sysfs_entry r5c_journal_mode;
-extern void r5c_update_on_rdev_error(struct mddev *mddev);
+extern void r5c_update_on_rdev_error(struct mddev *mddev,
+ struct md_rdev *rdev);
extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
extern struct dma_async_tx_descriptor *
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 5d25bebf3328..ccce92e68d7f 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -907,8 +907,8 @@ static int ppl_write_empty_header(struct ppl_log *log)
pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE));
if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
- PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_FUA, 0,
- false)) {
+ PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_SYNC |
+ REQ_FUA, 0, false)) {
md_error(rdev->mddev, rdev);
ret = -EIO;
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2e38cfac5b1d..ec0f951ae19f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -103,8 +103,7 @@ static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
{
int i;
- local_irq_disable();
- spin_lock(conf->hash_locks);
+ spin_lock_irq(conf->hash_locks);
for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
spin_lock(&conf->device_lock);
@@ -114,9 +113,9 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
{
int i;
spin_unlock(&conf->device_lock);
- for (i = NR_STRIPE_HASH_LOCKS; i; i--)
- spin_unlock(conf->hash_locks + i - 1);
- local_irq_enable();
+ for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
+ spin_unlock(conf->hash_locks + i);
+ spin_unlock_irq(conf->hash_locks);
}
/* Find first data disk in a raid6 stripe */
@@ -234,11 +233,15 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
if (test_bit(R5_InJournal, &sh->dev[i].flags))
injournal++;
/*
- * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
- * data in journal, so they are not released to cached lists
+ * In the following cases, the stripe cannot be released to cached
+ * lists. Therefore, we make the stripe write out and set
+ * STRIPE_HANDLE:
+ * 1. when quiesce in r5c write back;
+ * 2. when resync is requested fot the stripe.
*/
- if (conf->quiesce && r5c_is_writeback(conf->log) &&
- !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
+ if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
+ (conf->quiesce && r5c_is_writeback(conf->log) &&
+ !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
if (test_bit(STRIPE_R5C_CACHING, &sh->state))
r5c_make_stripe_write_out(sh);
set_bit(STRIPE_HANDLE, &sh->state);
@@ -714,12 +717,11 @@ static bool is_full_stripe_write(struct stripe_head *sh)
static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
{
- local_irq_disable();
if (sh1 > sh2) {
- spin_lock(&sh2->stripe_lock);
+ spin_lock_irq(&sh2->stripe_lock);
spin_lock_nested(&sh1->stripe_lock, 1);
} else {
- spin_lock(&sh1->stripe_lock);
+ spin_lock_irq(&sh1->stripe_lock);
spin_lock_nested(&sh2->stripe_lock, 1);
}
}
@@ -727,8 +729,7 @@ static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
{
spin_unlock(&sh1->stripe_lock);
- spin_unlock(&sh2->stripe_lock);
- local_irq_enable();
+ spin_unlock_irq(&sh2->stripe_lock);
}
/* Only freshly new full stripe normal write stripe can be added to a batch list */
@@ -2312,14 +2313,12 @@ static int resize_stripes(struct r5conf *conf, int newsize)
struct stripe_head *osh, *nsh;
LIST_HEAD(newstripes);
struct disk_info *ndisks;
- int err;
+ int err = 0;
struct kmem_cache *sc;
int i;
int hash, cnt;
- err = md_allow_write(conf->mddev);
- if (err)
- return err;
+ md_allow_write(conf->mddev);
/* Step 1 */
sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
@@ -2694,7 +2693,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
bdevname(rdev->bdev, b),
mdname(mddev),
conf->raid_disks - mddev->degraded);
- r5c_update_on_rdev_error(mddev);
+ r5c_update_on_rdev_error(mddev, rdev);
}
/*
@@ -3055,6 +3054,11 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
* When LOG_CRITICAL, stripes with injournal == 0 will be sent to
* no_space_stripes list.
*
+ * 3. during journal failure
+ * In journal failure, we try to flush all cached data to raid disks
+ * based on data in stripe cache. The array is read-only to upper
+ * layers, so we would skip all pending writes.
+ *
*/
static inline bool delay_towrite(struct r5conf *conf,
struct r5dev *dev,
@@ -3068,6 +3072,9 @@ static inline bool delay_towrite(struct r5conf *conf,
if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
s->injournal > 0)
return true;
+ /* case 3 above */
+ if (s->log_failed && s->injournal)
+ return true;
return false;
}
@@ -4078,10 +4085,15 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
set_bit(STRIPE_INSYNC, &sh->state);
else {
atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
- if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+ if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
- else {
+ pr_warn_ratelimited("%s: mismatch sector in range "
+ "%llu-%llu\n", mdname(conf->mddev),
+ (unsigned long long) sh->sector,
+ (unsigned long long) sh->sector +
+ STRIPE_SECTORS);
+ } else {
sh->check_state = check_state_compute_run;
set_bit(STRIPE_COMPUTE_RUN, &sh->state);
set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
@@ -4230,10 +4242,15 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
}
} else {
atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
- if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
+ if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
- else {
+ pr_warn_ratelimited("%s: mismatch sector in range "
+ "%llu-%llu\n", mdname(conf->mddev),
+ (unsigned long long) sh->sector,
+ (unsigned long long) sh->sector +
+ STRIPE_SECTORS);
+ } else {
int *target = &sh->ops.target;
sh->ops.target = -1;
@@ -4653,8 +4670,13 @@ static void handle_stripe(struct stripe_head *sh)
if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
spin_lock(&sh->stripe_lock);
- /* Cannot process 'sync' concurrently with 'discard' */
- if (!test_bit(STRIPE_DISCARD, &sh->state) &&
+ /*
+ * Cannot process 'sync' concurrently with 'discard'.
+ * Flush data in r5cache before 'sync'.
+ */
+ if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
+ !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
+ !test_bit(STRIPE_DISCARD, &sh->state) &&
test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
set_bit(STRIPE_SYNCING, &sh->state);
clear_bit(STRIPE_INSYNC, &sh->state);
@@ -4701,10 +4723,15 @@ static void handle_stripe(struct stripe_head *sh)
" to_write=%d failed=%d failed_num=%d,%d\n",
s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
s.failed_num[0], s.failed_num[1]);
- /* check if the array has lost more than max_degraded devices and,
+ /*
+ * check if the array has lost more than max_degraded devices and,
* if so, some requests might need to be failed.
+ *
+ * When journal device failed (log_failed), we will only process
+ * the stripe if there is data need write to raid disks
*/
- if (s.failed > conf->max_degraded || s.log_failed) {
+ if (s.failed > conf->max_degraded ||
+ (s.log_failed && s.injournal == 0)) {
sh->check_state = 0;
sh->reconstruct_state = 0;
break_stripe_batch_list(sh, 0);
@@ -5277,8 +5304,10 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
struct stripe_head *sh, *tmp;
struct list_head *handle_list = NULL;
struct r5worker_group *wg;
- bool second_try = !r5c_is_writeback(conf->log);
- bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state);
+ bool second_try = !r5c_is_writeback(conf->log) &&
+ !r5l_log_disk_error(conf);
+ bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
+ r5l_log_disk_error(conf);
again:
wg = NULL;
@@ -6313,7 +6342,6 @@ int
raid5_set_cache_size(struct mddev *mddev, int size)
{
struct r5conf *conf = mddev->private;
- int err;
if (size <= 16 || size > 32768)
return -EINVAL;
@@ -6325,10 +6353,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
;
mutex_unlock(&conf->cache_size_mutex);
-
- err = md_allow_write(mddev);
- if (err)
- return err;
+ md_allow_write(mddev);
mutex_lock(&conf->cache_size_mutex);
while (size > conf->max_nr_stripes)
@@ -7093,6 +7118,9 @@ static int raid5_run(struct mddev *mddev)
long long min_offset_diff = 0;
int first = 1;
+ if (mddev_init_writes_pending(mddev) < 0)
+ return -ENOMEM;
+
if (mddev->recovery_cp != MaxSector)
pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
mdname(mddev));
@@ -7530,7 +7558,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
* neilb: there is no locking about new writes here,
* so this cannot be safe.
*/
- if (atomic_read(&conf->active_stripes)) {
+ if (atomic_read(&conf->active_stripes) ||
+ atomic_read(&conf->r5c_cached_full_stripes) ||
+ atomic_read(&conf->r5c_cached_partial_stripes)) {
return -EBUSY;
}
log_exit(conf);