Diffstat (limited to 'drivers/md')
-rw-r--r-- drivers/md/Kconfig | 10
-rw-r--r-- drivers/md/Makefile | 1
-rw-r--r-- drivers/md/bcache/Kconfig | 1
-rw-r--r-- drivers/md/bcache/alloc.c | 5
-rw-r--r-- drivers/md/bcache/bset.c | 16
-rw-r--r-- drivers/md/bcache/bset.h | 34
-rw-r--r-- drivers/md/bcache/btree.c | 12
-rw-r--r-- drivers/md/bcache/journal.c | 42
-rw-r--r-- drivers/md/bcache/request.c | 41
-rw-r--r-- drivers/md/bcache/request.h | 2
-rw-r--r-- drivers/md/bcache/super.c | 84
-rw-r--r-- drivers/md/bcache/sysfs.c | 9
-rw-r--r-- drivers/md/bcache/util.h | 26
-rw-r--r-- drivers/md/dm-bufio.c | 15
-rw-r--r-- drivers/md/dm-cache-metadata.c | 9
-rw-r--r-- drivers/md/dm-crypt.c | 32
-rw-r--r-- drivers/md/dm-delay.c | 3
-rw-r--r-- drivers/md/dm-dust.c | 515
-rw-r--r-- drivers/md/dm-era-target.c | 1
-rw-r--r-- drivers/md/dm-exception-store.h | 31
-rw-r--r-- drivers/md/dm-init.c | 18
-rw-r--r-- drivers/md/dm-integrity.c | 727
-rw-r--r-- drivers/md/dm-ioctl.c | 6
-rw-r--r-- drivers/md/dm-log-writes.c | 23
-rw-r--r-- drivers/md/dm-mpath.c | 19
-rw-r--r-- drivers/md/dm-rq.c | 8
-rw-r--r-- drivers/md/dm-snap.c | 359
-rw-r--r-- drivers/md/dm-table.c | 19
-rw-r--r-- drivers/md/dm-target.c | 3
-rw-r--r-- drivers/md/dm-thin-metadata.c | 139
-rw-r--r-- drivers/md/dm-uevent.c | 15
-rw-r--r-- drivers/md/dm-uevent.h | 15
-rw-r--r-- drivers/md/dm-verity-fec.c | 6
-rw-r--r-- drivers/md/dm-verity-fec.h | 6
-rw-r--r-- drivers/md/dm-verity-target.c | 7
-rw-r--r-- drivers/md/dm-verity.h | 3
-rw-r--r-- drivers/md/dm-writecache.c | 29
-rw-r--r-- drivers/md/dm-zoned-metadata.c | 5
-rw-r--r-- drivers/md/dm-zoned-target.c | 3
-rw-r--r-- drivers/md/dm.c | 36
-rw-r--r-- drivers/md/dm.h | 1
-rw-r--r-- drivers/md/md-bitmap.c | 9
-rw-r--r-- drivers/md/md-cluster.c | 7
-rw-r--r-- drivers/md/md-faulty.c | 11
-rw-r--r-- drivers/md/md-linear.c | 9
-rw-r--r-- drivers/md/md-multipath.c | 10
-rw-r--r-- drivers/md/md.c | 244
-rw-r--r-- drivers/md/md.h | 34
-rw-r--r-- drivers/md/persistent-data/Kconfig | 1
-rw-r--r-- drivers/md/persistent-data/dm-block-manager.c | 19
-rw-r--r-- drivers/md/persistent-data/dm-space-map-common.c | 2
-rw-r--r-- drivers/md/raid0.c | 9
-rw-r--r-- drivers/md/raid1.c | 16
-rw-r--r-- drivers/md/raid10.c | 11
-rw-r--r-- drivers/md/raid5-cache.c | 11
-rw-r--r-- drivers/md/raid5-ppl.c | 10
-rw-r--r-- drivers/md/raid5.c | 26
57 files changed, 2021 insertions, 744 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 2557f198e175..45254b3ef715 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Block device driver configuration
#
@@ -436,6 +437,15 @@ config DM_DELAY
If unsure, say N.
+config DM_DUST
+ tristate "Bad sector simulation target"
+ depends on BLK_DEV_DM
+ ---help---
+ A target that simulates bad sector behavior.
+ Useful for testing.
+
+ If unsure, say N.
+
config DM_INIT
bool "DM \"dm-mod.create=\" parameter support"
depends on BLK_DEV_DM=y
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index a52b703e588e..be7a6eb92abc 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_DM_BUFIO) += dm-bufio.o
obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o
obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
obj-$(CONFIG_DM_DELAY) += dm-delay.o
+obj-$(CONFIG_DM_DUST) += dm-dust.o
obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o
obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index f6e0a8b3a61e..6dfa653d30db 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config BCACHE
tristate "Block device as cache"
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 5002838ea476..f8986effcb50 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -327,10 +327,11 @@ static int bch_allocator_thread(void *arg)
* possibly issue discards to them, then we add the bucket to
* the free list:
*/
- while (!fifo_empty(&ca->free_inc)) {
+ while (1) {
long bucket;
- fifo_pop(&ca->free_inc, bucket);
+ if (!fifo_pop(&ca->free_inc, bucket))
+ break;
if (ca->discard) {
mutex_unlock(&ca->set->bucket_lock);
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 8f07fa6e1739..268f1b685084 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -887,12 +887,22 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
struct bset *i = bset_tree_last(b)->data;
struct bkey *m, *prev = NULL;
struct btree_iter iter;
+ struct bkey preceding_key_on_stack = ZERO_KEY;
+ struct bkey *preceding_key_p = &preceding_key_on_stack;
BUG_ON(b->ops->is_extents && !KEY_SIZE(k));
- m = bch_btree_iter_init(b, &iter, b->ops->is_extents
- ? PRECEDING_KEY(&START_KEY(k))
- : PRECEDING_KEY(k));
+ /*
+ * If k has preceding key, preceding_key_p will be set to address
+ * of k's preceding key; otherwise preceding_key_p will be set
+ * to NULL inside preceding_key().
+ */
+ if (b->ops->is_extents)
+ preceding_key(&START_KEY(k), &preceding_key_p);
+ else
+ preceding_key(k, &preceding_key_p);
+
+ m = bch_btree_iter_init(b, &iter, preceding_key_p);
if (b->ops->insert_fixup(b, k, &iter, replace_key))
return status;
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index bac76aabca6d..c71365e7c1fa 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -434,20 +434,26 @@ static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
return __bch_cut_back(where, k);
}
-#define PRECEDING_KEY(_k) \
-({ \
- struct bkey *_ret = NULL; \
- \
- if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \
- _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \
- \
- if (!_ret->low) \
- _ret->high--; \
- _ret->low--; \
- } \
- \
- _ret; \
-})
+/*
+ * Pointer '*preceding_key_p' points to a memory object to store preceding
+ * key of k. If the preceding key does not exist, set '*preceding_key_p' to
+ * NULL. So the caller of preceding_key() needs to take care of memory
+ * which '*preceding_key_p' pointed to before calling preceding_key().
+ * Currently the only caller of preceding_key() is bch_btree_insert_key(),
+ * and it points to an on-stack variable, so the memory release is handled
+ * by stackframe itself.
+ */
+static inline void preceding_key(struct bkey *k, struct bkey **preceding_key_p)
+{
+ if (KEY_INODE(k) || KEY_OFFSET(k)) {
+ (**preceding_key_p) = KEY(KEY_INODE(k), KEY_OFFSET(k), 0);
+ if (!(*preceding_key_p)->low)
+ (*preceding_key_p)->high--;
+ (*preceding_key_p)->low--;
+ } else {
+ (*preceding_key_p) = NULL;
+ }
+}
static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k)
{
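
As an illustration of the new contract (this sketch is not part of the patch and assumes the rest of bcache's bset.h/bcache.h definitions, e.g. KEY() and ZERO_KEY): the caller supplies the storage plus a pointer to it, and preceding_key() either fills the storage and leaves the pointer alone, or sets the pointer to NULL when no predecessor exists.

	struct bkey preceding_key_on_stack = ZERO_KEY;
	struct bkey *preceding_key_p = &preceding_key_on_stack;

	/* (inode 1, offset 128) has a predecessor: the on-stack storage
	 * now holds it, and preceding_key_p still points at that storage. */
	preceding_key(&KEY(1, 128, 0), &preceding_key_p);

	/* The zero key has no predecessor: preceding_key_p is set to NULL
	 * and the on-stack storage is left untouched. */
	preceding_key_p = &preceding_key_on_stack;
	preceding_key(&ZERO_KEY, &preceding_key_p);

This mirrors bch_btree_insert_key() above, which is currently the only caller.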
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 64def336f053..773f5fdad25f 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -429,14 +429,14 @@ static void do_btree_node_write(struct btree *b)
bset_sector_offset(&b->keys, i));
if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
- int j;
struct bio_vec *bv;
- void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
+ void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, b->bio, j, iter_all)
- memcpy(page_address(bv->bv_page),
- base + j * PAGE_SIZE, PAGE_SIZE);
+ bio_for_each_segment_all(bv, b->bio, iter_all) {
+ memcpy(page_address(bv->bv_page), addr, PAGE_SIZE);
+ addr += PAGE_SIZE;
+ }
bch_submit_bbio(b->bio, b->c, &k.key, 0);
@@ -1476,11 +1476,11 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
out_nocoalesce:
closure_sync(&cl);
- bch_keylist_free(&keylist);
while ((k = bch_keylist_pop(&keylist)))
if (!bkey_cmp(k, &ZERO_KEY))
atomic_dec(&b->c->prio_blocked);
+ bch_keylist_free(&keylist);
for (i = 0; i < nodes; i++)
if (!IS_ERR_OR_NULL(new_nodes[i])) {
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index b2fd412715b1..12dae9348147 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -147,7 +147,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
{
#define read_bucket(b) \
({ \
- int ret = journal_read_bucket(ca, list, b); \
+ ret = journal_read_bucket(ca, list, b); \
__set_bit(b, bitmap); \
if (ret < 0) \
return ret; \
@@ -156,6 +156,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
struct cache *ca;
unsigned int iter;
+ int ret = 0;
for_each_cache(ca, c, iter) {
struct journal_device *ja = &ca->journal;
@@ -267,7 +268,7 @@ bsearch:
struct journal_replay,
list)->j.seq;
- return 0;
+ return ret;
#undef read_bucket
}
@@ -317,6 +318,18 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list)
}
}
+static bool is_discard_enabled(struct cache_set *s)
+{
+ struct cache *ca;
+ unsigned int i;
+
+ for_each_cache(ca, s, i)
+ if (ca->discard)
+ return true;
+
+ return false;
+}
+
int bch_journal_replay(struct cache_set *s, struct list_head *list)
{
int ret = 0, keys = 0, entries = 0;
@@ -330,9 +343,17 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
list_for_each_entry(i, list, list) {
BUG_ON(i->pin && atomic_read(i->pin) != 1);
- cache_set_err_on(n != i->j.seq, s,
-"bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)",
- n, i->j.seq - 1, start, end);
+ if (n != i->j.seq) {
+ if (n == start && is_discard_enabled(s))
+ pr_info("bcache: journal entries %llu-%llu may be discarded! (replaying %llu-%llu)",
+ n, i->j.seq - 1, start, end);
+ else {
+ pr_err("bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)",
+ n, i->j.seq - 1, start, end);
+ ret = -EIO;
+ goto err;
+ }
+ }
for (k = i->j.start;
k < bset_bkey_last(&i->j);
@@ -540,11 +561,11 @@ static void journal_reclaim(struct cache_set *c)
ca->sb.nr_this_dev);
}
- bkey_init(k);
- SET_KEY_PTRS(k, n);
-
- if (n)
+ if (n) {
+ bkey_init(k);
+ SET_KEY_PTRS(k, n);
c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
+ }
out:
if (!journal_full(&c->journal))
__closure_wake_up(&c->journal.wait);
@@ -671,6 +692,9 @@ static void journal_write_unlocked(struct closure *cl)
ca->journal.seq[ca->journal.cur_idx] = w->data->seq;
}
+ /* If KEY_PTRS(k) == 0, this jset gets lost in air */
+ BUG_ON(i == 0);
+
atomic_dec_bug(&fifo_back(&c->journal.pin));
bch_journal_next(&c->journal);
journal_reclaim(c);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index f101bfe8657a..41adcd1546f1 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -329,12 +329,13 @@ void bch_data_insert(struct closure *cl)
bch_data_insert_start(cl);
}
-/* Congested? */
-
-unsigned int bch_get_congested(struct cache_set *c)
+/*
+ * Congested? Return 0 (not congested) or the limit (in sectors)
+ * beyond which we should bypass the cache due to congestion.
+ */
+unsigned int bch_get_congested(const struct cache_set *c)
{
int i;
- long rand;
if (!c->congested_read_threshold_us &&
!c->congested_write_threshold_us)
@@ -353,8 +354,7 @@ unsigned int bch_get_congested(struct cache_set *c)
if (i > 0)
i = fract_exp_two(i, 6);
- rand = get_random_int();
- i -= bitmap_weight(&rand, BITS_PER_LONG);
+ i -= hweight32(get_random_u32());
return i > 0 ? i : 1;
}
@@ -376,7 +376,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
{
struct cache_set *c = dc->disk.c;
unsigned int mode = cache_mode(dc);
- unsigned int sectors, congested = bch_get_congested(c);
+ unsigned int sectors, congested;
struct task_struct *task = current;
struct io *i;
@@ -412,6 +412,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
goto rescale;
}
+ congested = bch_get_congested(c);
if (!congested && !dc->sequential_cutoff)
goto rescale;
@@ -706,14 +707,14 @@ static void search_free(struct closure *cl)
{
struct search *s = container_of(cl, struct search, cl);
- atomic_dec(&s->d->c->search_inflight);
+ atomic_dec(&s->iop.c->search_inflight);
if (s->iop.bio)
bio_put(s->iop.bio);
bio_complete(s);
closure_debug_destroy(cl);
- mempool_free(s, &s->d->c->search);
+ mempool_free(s, &s->iop.c->search);
}
static inline struct search *search_alloc(struct bio *bio,
@@ -756,13 +757,13 @@ static void cached_dev_bio_complete(struct closure *cl)
struct search *s = container_of(cl, struct search, cl);
struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
- search_free(cl);
cached_dev_put(dc);
+ search_free(cl);
}
/* Process reads */
-static void cached_dev_cache_miss_done(struct closure *cl)
+static void cached_dev_read_error_done(struct closure *cl)
{
struct search *s = container_of(cl, struct search, cl);
@@ -800,7 +801,22 @@ static void cached_dev_read_error(struct closure *cl)
closure_bio_submit(s->iop.c, bio, cl);
}
- continue_at(cl, cached_dev_cache_miss_done, NULL);
+ continue_at(cl, cached_dev_read_error_done, NULL);
+}
+
+static void cached_dev_cache_miss_done(struct closure *cl)
+{
+ struct search *s = container_of(cl, struct search, cl);
+ struct bcache_device *d = s->d;
+
+ if (s->iop.replace_collision)
+ bch_mark_cache_miss_collision(s->iop.c, s->d);
+
+ if (s->iop.bio)
+ bio_free_pages(s->iop.bio);
+
+ cached_dev_bio_complete(cl);
+ closure_put(&d->cl);
}
static void cached_dev_read_done(struct closure *cl)
@@ -833,6 +849,7 @@ static void cached_dev_read_done(struct closure *cl)
if (verify(dc) && s->recoverable && !s->read_dirty_data)
bch_data_verify(dc, s->orig_bio);
+ closure_get(&dc->disk.cl);
bio_complete(s);
if (s->iop.bio &&
diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h
index 721bf336ed1a..c64dbd7a91aa 100644
--- a/drivers/md/bcache/request.h
+++ b/drivers/md/bcache/request.h
@@ -33,7 +33,7 @@ struct data_insert_op {
BKEY_PADDED(replace_key);
};
-unsigned int bch_get_congested(struct cache_set *c);
+unsigned int bch_get_congested(const struct cache_set *c);
void bch_data_insert(struct closure *cl);
void bch_cached_dev_request_init(struct cached_dev *dc);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index a697a3a923cd..1b63ac876169 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -662,6 +662,11 @@ static const struct block_device_operations bcache_ops = {
void bcache_device_stop(struct bcache_device *d)
{
if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
+ /*
+ * closure_fn set to
+ * - cached device: cached_dev_flush()
+ * - flash dev: flash_dev_flush()
+ */
closure_queue(&d->cl);
}
@@ -906,21 +911,18 @@ static int cached_dev_status_update(void *arg)
void bch_cached_dev_run(struct cached_dev *dc)
{
struct bcache_device *d = &dc->disk;
- char buf[SB_LABEL_SIZE + 1];
+ char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL);
char *env[] = {
"DRIVER=bcache",
kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
- NULL,
+ kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf ? : ""),
NULL,
};
- memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
- buf[SB_LABEL_SIZE] = '\0';
- env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
-
if (atomic_xchg(&dc->running, 1)) {
kfree(env[1]);
kfree(env[2]);
+ kfree(buf);
return;
}
@@ -944,6 +946,7 @@ void bch_cached_dev_run(struct cached_dev *dc)
kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
kfree(env[1]);
kfree(env[2]);
+ kfree(buf);
if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
@@ -1174,6 +1177,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
return 0;
}
+/* when dc->disk.kobj released */
void bch_cached_dev_release(struct kobject *kobj)
{
struct cached_dev *dc = container_of(kobj, struct cached_dev,
@@ -1280,7 +1284,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
/* Cached device - bcache superblock */
-static void register_bdev(struct cache_sb *sb, struct page *sb_page,
+static int register_bdev(struct cache_sb *sb, struct page *sb_page,
struct block_device *bdev,
struct cached_dev *dc)
{
@@ -1318,14 +1322,16 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
bch_cached_dev_run(dc);
- return;
+ return 0;
err:
pr_notice("error %s: %s", dc->backing_dev_name, err);
bcache_device_stop(&dc->disk);
+ return -EIO;
}
/* Flash only volumes */
+/* When d->kobj released */
void bch_flash_dev_release(struct kobject *kobj)
{
struct bcache_device *d = container_of(kobj, struct bcache_device,
@@ -1496,6 +1502,7 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
return true;
}
+/* When c->kobj released */
void bch_cache_set_release(struct kobject *kobj)
{
struct cache_set *c = container_of(kobj, struct cache_set, kobj);
@@ -1516,6 +1523,7 @@ static void cache_set_free(struct closure *cl)
bch_btree_cache_free(c);
bch_journal_free(c);
+ mutex_lock(&bch_register_lock);
for_each_cache(ca, c, i)
if (ca) {
ca->set = NULL;
@@ -1534,7 +1542,6 @@ static void cache_set_free(struct closure *cl)
mempool_exit(&c->search);
kfree(c->devices);
- mutex_lock(&bch_register_lock);
list_del(&c->list);
mutex_unlock(&bch_register_lock);
@@ -1673,6 +1680,7 @@ static void __cache_set_unregister(struct closure *cl)
void bch_cache_set_stop(struct cache_set *c)
{
if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
+ /* closure_fn set to __cache_set_unregister() */
closure_queue(&c->caching);
}
@@ -1775,13 +1783,15 @@ err:
return NULL;
}
-static void run_cache_set(struct cache_set *c)
+static int run_cache_set(struct cache_set *c)
{
const char *err = "cannot allocate memory";
struct cached_dev *dc, *t;
struct cache *ca;
struct closure cl;
unsigned int i;
+ LIST_HEAD(journal);
+ struct journal_replay *l;
closure_init_stack(&cl);
@@ -1790,7 +1800,6 @@ static void run_cache_set(struct cache_set *c)
set_gc_sectors(c);
if (CACHE_SYNC(&c->sb)) {
- LIST_HEAD(journal);
struct bkey *k;
struct jset *j;
@@ -1869,7 +1878,9 @@ static void run_cache_set(struct cache_set *c)
if (j->version < BCACHE_JSET_VERSION_UUID)
__uuid_write(c);
- bch_journal_replay(c, &journal);
+ err = "bcache: replay journal failed";
+ if (bch_journal_replay(c, &journal))
+ goto err;
} else {
pr_notice("invalidating existing data");
@@ -1937,11 +1948,19 @@ static void run_cache_set(struct cache_set *c)
flash_devs_run(c);
set_bit(CACHE_SET_RUNNING, &c->flags);
- return;
+ return 0;
err:
+ while (!list_empty(&journal)) {
+ l = list_first_entry(&journal, struct journal_replay, list);
+ list_del(&l->list);
+ kfree(l);
+ }
+
closure_sync(&cl);
/* XXX: test this, it's broken */
bch_cache_set_error(c, "%s", err);
+
+ return -EIO;
}
static bool can_attach_cache(struct cache *ca, struct cache_set *c)
@@ -2005,8 +2024,11 @@ found:
ca->set->cache[ca->sb.nr_this_dev] = ca;
c->cache_by_alloc[c->caches_loaded++] = ca;
- if (c->caches_loaded == c->sb.nr_in_set)
- run_cache_set(c);
+ if (c->caches_loaded == c->sb.nr_in_set) {
+ err = "failed to run cache set";
+ if (run_cache_set(c) < 0)
+ goto err;
+ }
return NULL;
err:
@@ -2016,6 +2038,7 @@ err:
/* Cache device */
+/* When ca->kobj released */
void bch_cache_release(struct kobject *kobj)
{
struct cache *ca = container_of(kobj, struct cache, kobj);
@@ -2179,6 +2202,12 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
ret = cache_alloc(ca);
if (ret != 0) {
+ /*
+ * If we failed here, it means ca->kobj is not initialized yet,
+ * kobject_put() won't be called and there is no chance to
+ * call blkdev_put() to bdev in bch_cache_release(). So we
+ * explicitly call blkdev_put() here.
+ */
blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
if (ret == -ENOMEM)
err = "cache_alloc(): -ENOMEM";
@@ -2262,7 +2291,7 @@ static bool bch_is_open(struct block_device *bdev)
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
const char *buffer, size_t size)
{
- ssize_t ret = size;
+ ssize_t ret = -EINVAL;
const char *err = "cannot allocate memory";
char *path = NULL;
struct cache_sb *sb = NULL;
@@ -2296,7 +2325,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
if (!IS_ERR(bdev))
bdput(bdev);
if (attr == &ksysfs_register_quiet)
- goto out;
+ goto quiet_out;
}
goto err;
}
@@ -2317,17 +2346,23 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
goto err_close;
mutex_lock(&bch_register_lock);
- register_bdev(sb, sb_page, bdev, dc);
+ ret = register_bdev(sb, sb_page, bdev, dc);
mutex_unlock(&bch_register_lock);
+ /* blkdev_put() will be called in cached_dev_free() */
+ if (ret < 0)
+ goto err;
} else {
struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
goto err_close;
+ /* blkdev_put() will be called in bch_cache_release() */
if (register_cache(sb, sb_page, bdev, ca) != 0)
goto err;
}
+quiet_out:
+ ret = size;
out:
if (sb_page)
put_page(sb_page);
@@ -2340,7 +2375,6 @@ err_close:
blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
err:
pr_info("error %s: %s", path, err);
- ret = -EINVAL;
goto out;
}
@@ -2370,10 +2404,19 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
bcache_device_stop(&dc->disk);
+ mutex_unlock(&bch_register_lock);
+
+ /*
+ * Give an early chance for other kthreads and
+ * kworkers to stop themselves
+ */
+ schedule();
+
/* What's a condition variable? */
while (1) {
- long timeout = start + 2 * HZ - jiffies;
+ long timeout = start + 10 * HZ - jiffies;
+ mutex_lock(&bch_register_lock);
stopped = list_empty(&bch_cache_sets) &&
list_empty(&uncached_devices);
@@ -2385,7 +2428,6 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
mutex_unlock(&bch_register_lock);
schedule_timeout(timeout);
- mutex_lock(&bch_register_lock);
}
finish_wait(&unregister_wait, &wait);
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 17bae9c14ca0..bfb437ffb13c 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -431,8 +431,13 @@ STORE(bch_cached_dev)
bch_writeback_queue(dc);
}
+ /*
+ * Only set BCACHE_DEV_WB_RUNNING when cached device attached to
+ * a cache set, otherwise it doesn't make sense.
+ */
if (attr == &sysfs_writeback_percent)
- if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
+ if ((dc->disk.c != NULL) &&
+ (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)))
schedule_delayed_work(&dc->writeback_rate_update,
dc->writeback_rate_update_seconds * HZ);
@@ -996,8 +1001,6 @@ SHOW(__bch_cache)
!cached[n - 1])
--n;
- unused = ca->sb.nbuckets - n;
-
while (cached < p + n &&
*cached == BTREE_PRIO)
cached++, n--;
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 00aab6abcfe4..1fbced94e4cc 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -560,17 +560,29 @@ static inline uint64_t bch_crc64_update(uint64_t crc,
return crc;
}
-/* Does linear interpolation between powers of two */
+/*
+ * A stepwise-linear pseudo-exponential. This returns 1 << (x >>
+ * frac_bits), with the less-significant bits filled in by linear
+ * interpolation.
+ *
+ * This can also be interpreted as a floating-point number format,
+ * where the low frac_bits are the mantissa (with implicit leading
+ * 1 bit), and the more significant bits are the exponent.
+ * The return value is 1.mantissa * 2^exponent.
+ *
+ * The way this is used, fract_bits is 6 and the largest possible
+ * input is CONGESTED_MAX-1 = 1023 (exponent 16, mantissa 0x1.fc),
+ * so the maximum output is 0x1fc00.
+ */
static inline unsigned int fract_exp_two(unsigned int x,
unsigned int fract_bits)
{
- unsigned int fract = x & ~(~0 << fract_bits);
-
- x >>= fract_bits;
- x = 1 << x;
- x += (x * fract) >> fract_bits;
+ unsigned int mantissa = 1 << fract_bits; /* Implicit bit */
- return x;
+ mantissa += x & (mantissa - 1);
+ x >>= fract_bits; /* The exponent */
+ /* Largest intermediate value 0x7f0000 */
+ return mantissa << x >> fract_bits;
}
void bch_bio_map(struct bio *bio, void *base);
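
Because the fract_exp_two() rewrite only restructures the arithmetic, the old and new expressions should agree bit-for-bit for every value bch_get_congested() can feed it. A small userspace sanity check, not part of the patch, assuming CONGESTED_MAX is 1024 as in bcache.h:

	#include <assert.h>
	#include <stdio.h>

	#define CONGESTED_MAX 1024		/* assumed, from bcache.h */

	/* Old form: explicit linear interpolation between powers of two. */
	static unsigned int fract_exp_two_old(unsigned int x, unsigned int fract_bits)
	{
		unsigned int fract = x & ~(~0U << fract_bits);

		x >>= fract_bits;
		x = 1 << x;
		x += (x * fract) >> fract_bits;
		return x;
	}

	/* New form: implicit-one mantissa shifted by the exponent. */
	static unsigned int fract_exp_two_new(unsigned int x, unsigned int fract_bits)
	{
		unsigned int mantissa = 1 << fract_bits;

		mantissa += x & (mantissa - 1);
		x >>= fract_bits;
		return mantissa << x >> fract_bits;
	}

	int main(void)
	{
		unsigned int x;

		for (x = 0; x < CONGESTED_MAX; x++)
			assert(fract_exp_two_old(x, 6) == fract_exp_two_new(x, 6));

		/* Worked example: x = 200 -> mantissa 64 + 8, exponent 3,
		 * so (64 + 8) << 3 >> 6 = 9, i.e. ~2^(200/64) interpolated. */
		printf("fract_exp_two(200, 6) = %u\n", fract_exp_two_new(200, 6));
		return 0;
	}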
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 1ecef76225a1..2a48ea3f1b30 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -150,7 +150,7 @@ struct dm_buffer {
void (*end_io)(struct dm_buffer *, blk_status_t);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
#define MAX_STACK 10
- struct stack_trace stack_trace;
+ unsigned int stack_len;
unsigned long stack_entries[MAX_STACK];
#endif
};
@@ -232,11 +232,7 @@ static DEFINE_MUTEX(dm_bufio_clients_lock);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
static void buffer_record_stack(struct dm_buffer *b)
{
- b->stack_trace.nr_entries = 0;
- b->stack_trace.max_entries = MAX_STACK;
- b->stack_trace.entries = b->stack_entries;
- b->stack_trace.skip = 2;
- save_stack_trace(&b->stack_trace);
+ b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2);
}
#endif
@@ -438,7 +434,7 @@ static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
adjust_total_allocated(b->data_mode, (long)c->block_size);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
- memset(&b->stack_trace, 0, sizeof(b->stack_trace));
+ b->stack_len = 0;
#endif
return b;
}
@@ -1520,8 +1516,9 @@ static void drop_buffers(struct dm_bufio_client *c)
DMERR("leaked buffer %llx, hold count %u, list %d",
(unsigned long long)b->block, b->hold_count, i);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
- print_stack_trace(&b->stack_trace, 1);
- b->hold_count = 0; /* mark unclaimed to avoid BUG_ON below */
+ stack_trace_print(b->stack_entries, b->stack_len, 1);
+ /* mark unclaimed to avoid BUG_ON below */
+ b->hold_count = 0;
#endif
}
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 6fc93834da44..151aa95775be 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -1167,11 +1167,18 @@ static int __load_discards(struct dm_cache_metadata *cmd,
if (r)
return r;
- for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
+ for (b = 0; ; b++) {
r = fn(context, cmd->discard_block_size, to_dblock(b),
dm_bitset_cursor_get_value(&c));
if (r)
break;
+
+ if (b >= (from_dblock(cmd->discard_nr_blocks) - 1))
+ break;
+
+ r = dm_bitset_cursor_next(&c);
+ if (r)
+ break;
}
dm_bitset_cursor_end(&c);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index dd6565798778..1b16d34bb785 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -332,7 +332,6 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
int err;
desc->tfm = essiv->hash_tfm;
- desc->flags = 0;
err = crypto_shash_digest(desc, cc->key, cc->key_size, essiv->salt);
shash_desc_zero(desc);
@@ -606,7 +605,6 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
int i, r;
desc->tfm = lmk->hash_tfm;
- desc->flags = 0;
r = crypto_shash_init(desc);
if (r)
@@ -768,7 +766,6 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc,
/* calculate crc32 for every 32bit part and xor it */
desc->tfm = tcw->crc32_tfm;
- desc->flags = 0;
for (i = 0; i < 4; i++) {
r = crypto_shash_init(desc);
if (r)
@@ -949,6 +946,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti)
{
#ifdef CONFIG_BLK_DEV_INTEGRITY
struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk);
+ struct mapped_device *md = dm_table_get_md(ti->table);
/* From now we require underlying device with our integrity profile */
if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) {
@@ -968,7 +966,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti)
if (crypt_integrity_aead(cc)) {
cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size;
- DMINFO("Integrity AEAD, tag size %u, IV size %u.",
+ DMDEBUG("%s: Integrity AEAD, tag size %u, IV size %u.", dm_device_name(md),
cc->integrity_tag_size, cc->integrity_iv_size);
if (crypto_aead_setauthsize(any_tfm_aead(cc), cc->integrity_tag_size)) {
@@ -976,7 +974,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti)
return -EINVAL;
}
} else if (cc->integrity_iv_size)
- DMINFO("Additional per-sector space %u bytes for IV.",
+ DMDEBUG("%s: Additional per-sector space %u bytes for IV.", dm_device_name(md),
cc->integrity_iv_size);
if ((cc->integrity_tag_size + cc->integrity_iv_size) != bi->tag_size) {
@@ -1034,11 +1032,11 @@ static u8 *org_iv_of_dmreq(struct crypt_config *cc,
return iv_of_dmreq(cc, dmreq) + cc->iv_size;
}
-static uint64_t *org_sector_of_dmreq(struct crypt_config *cc,
+static __le64 *org_sector_of_dmreq(struct crypt_config *cc,
struct dm_crypt_request *dmreq)
{
u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size + cc->iv_size;
- return (uint64_t*) ptr;
+ return (__le64 *) ptr;
}
static unsigned int *org_tag_of_dmreq(struct crypt_config *cc,
@@ -1074,7 +1072,7 @@ static int crypt_convert_block_aead(struct crypt_config *cc,
struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out);
struct dm_crypt_request *dmreq;
u8 *iv, *org_iv, *tag_iv, *tag;
- uint64_t *sector;
+ __le64 *sector;
int r = 0;
BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size);
@@ -1146,9 +1144,11 @@ static int crypt_convert_block_aead(struct crypt_config *cc,
r = crypto_aead_decrypt(req);
}
- if (r == -EBADMSG)
- DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu",
+ if (r == -EBADMSG) {
+ char b[BDEVNAME_SIZE];
+ DMERR_LIMIT("%s: INTEGRITY AEAD ERROR, sector %llu", bio_devname(ctx->bio_in, b),
(unsigned long long)le64_to_cpu(*sector));
+ }
if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
r = cc->iv_gen_ops->post(cc, org_iv, dmreq);
@@ -1169,7 +1169,7 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc,
struct scatterlist *sg_in, *sg_out;
struct dm_crypt_request *dmreq;
u8 *iv, *org_iv, *tag_iv;
- uint64_t *sector;
+ __le64 *sector;
int r = 0;
/* Reject unexpected unaligned bio. */
@@ -1445,11 +1445,10 @@ out:
static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
{
- unsigned int i;
struct bio_vec *bv;
struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bv, clone, i, iter_all) {
+ bio_for_each_segment_all(bv, clone, iter_all) {
BUG_ON(!bv->bv_page);
mempool_free(bv->bv_page, &cc->page_pool);
}
@@ -1792,7 +1791,8 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq);
if (error == -EBADMSG) {
- DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu",
+ char b[BDEVNAME_SIZE];
+ DMERR_LIMIT("%s: INTEGRITY AEAD ERROR, sector %llu", bio_devname(ctx->bio_in, b),
(unsigned long long)le64_to_cpu(*org_sector_of_dmreq(cc, dmreq)));
io->error = BLK_STS_PROTECTION;
} else if (error < 0)
@@ -1891,7 +1891,7 @@ static int crypt_alloc_tfms_skcipher(struct crypt_config *cc, char *ciphermode)
* algorithm implementation is used. Help people debug performance
* problems by logging the ->cra_driver_name.
*/
- DMINFO("%s using implementation \"%s\"", ciphermode,
+ DMDEBUG_LIMIT("%s using implementation \"%s\"", ciphermode,
crypto_skcipher_alg(any_tfm(cc))->base.cra_driver_name);
return 0;
}
@@ -1911,7 +1911,7 @@ static int crypt_alloc_tfms_aead(struct crypt_config *cc, char *ciphermode)
return err;
}
- DMINFO("%s using implementation \"%s\"", ciphermode,
+ DMDEBUG_LIMIT("%s using implementation \"%s\"", ciphermode,
crypto_aead_alg(any_tfm_aead(cc))->base.cra_driver_name);
return 0;
}
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index fddffe251bf6..f496213f8b67 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -121,7 +121,8 @@ static void delay_dtr(struct dm_target *ti)
{
struct delay_c *dc = ti->private;
- destroy_workqueue(dc->kdelayd_wq);
+ if (dc->kdelayd_wq)
+ destroy_workqueue(dc->kdelayd_wq);
if (dc->read.dev)
dm_put_device(ti, dc->read.dev);
diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c
new file mode 100644
index 000000000000..845f376a72d9
--- /dev/null
+++ b/drivers/md/dm-dust.c
@@ -0,0 +1,515 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018 Red Hat, Inc.
+ *
+ * This is a test "dust" device, which fails reads on specified
+ * sectors, emulating the behavior of a hard disk drive sending
+ * a "Read Medium Error" sense.
+ *
+ */
+
+#include <linux/device-mapper.h>
+#include <linux/module.h>
+#include <linux/rbtree.h>
+
+#define DM_MSG_PREFIX "dust"
+
+struct badblock {
+ struct rb_node node;
+ sector_t bb;
+};
+
+struct dust_device {
+ struct dm_dev *dev;
+ struct rb_root badblocklist;
+ unsigned long long badblock_count;
+ spinlock_t dust_lock;
+ unsigned int blksz;
+ unsigned int sect_per_block;
+ sector_t start;
+ bool fail_read_on_bb:1;
+ bool quiet_mode:1;
+};
+
+static struct badblock *dust_rb_search(struct rb_root *root, sector_t blk)
+{
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct badblock *bblk = rb_entry(node, struct badblock, node);
+
+ if (bblk->bb > blk)
+ node = node->rb_left;
+ else if (bblk->bb < blk)
+ node = node->rb_right;
+ else
+ return bblk;
+ }
+
+ return NULL;
+}
+
+static bool dust_rb_insert(struct rb_root *root, struct badblock *new)
+{
+ struct badblock *bblk;
+ struct rb_node **link = &root->rb_node, *parent = NULL;
+ sector_t value = new->bb;
+
+ while (*link) {
+ parent = *link;
+ bblk = rb_entry(parent, struct badblock, node);
+
+ if (bblk->bb > value)
+ link = &(*link)->rb_left;
+ else if (bblk->bb < value)
+ link = &(*link)->rb_right;
+ else
+ return false;
+ }
+
+ rb_link_node(&new->node, parent, link);
+ rb_insert_color(&new->node, root);
+
+ return true;
+}
+
+static int dust_remove_block(struct dust_device *dd, unsigned long long block)
+{
+ struct badblock *bblock;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->dust_lock, flags);
+ bblock = dust_rb_search(&dd->badblocklist, block * dd->sect_per_block);
+
+ if (bblock == NULL) {
+ if (!dd->quiet_mode) {
+ DMERR("%s: block %llu not found in badblocklist",
+ __func__, block);
+ }
+ spin_unlock_irqrestore(&dd->dust_lock, flags);
+ return -EINVAL;
+ }
+
+ rb_erase(&bblock->node, &dd->badblocklist);
+ dd->badblock_count--;
+ if (!dd->quiet_mode)
+ DMINFO("%s: badblock removed at block %llu", __func__, block);
+ kfree(bblock);
+ spin_unlock_irqrestore(&dd->dust_lock, flags);
+
+ return 0;
+}
+
+static int dust_add_block(struct dust_device *dd, unsigned long long block)
+{
+ struct badblock *bblock;
+ unsigned long flags;
+
+ bblock = kmalloc(sizeof(*bblock), GFP_KERNEL);
+ if (bblock == NULL) {
+ if (!dd->quiet_mode)
+ DMERR("%s: badblock allocation failed", __func__);
+ return -ENOMEM;
+ }
+
+ spin_lock_irqsave(&dd->dust_lock, flags);
+ bblock->bb = block * dd->sect_per_block;
+ if (!dust_rb_insert(&dd->badblocklist, bblock)) {
+ if (!dd->quiet_mode) {
+ DMERR("%s: block %llu already in badblocklist",
+ __func__, block);
+ }
+ spin_unlock_irqrestore(&dd->dust_lock, flags);
+ kfree(bblock);
+ return -EINVAL;
+ }
+
+ dd->badblock_count++;
+ if (!dd->quiet_mode)
+ DMINFO("%s: badblock added at block %llu", __func__, block);
+ spin_unlock_irqrestore(&dd->dust_lock, flags);
+
+ return 0;
+}
+
+static int dust_query_block(struct dust_device *dd, unsigned long long block)
+{
+ struct badblock *bblock;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->dust_lock, flags);
+ bblock = dust_rb_search(&dd->badblocklist, block * dd->sect_per_block);
+ if (bblock != NULL)
+ DMINFO("%s: block %llu found in badblocklist", __func__, block);
+ else
+ DMINFO("%s: block %llu not found in badblocklist", __func__, block);
+ spin_unlock_irqrestore(&dd->dust_lock, flags);
+
+ return 0;
+}
+
+static int __dust_map_read(struct dust_device *dd, sector_t thisblock)
+{
+ struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock);
+
+ if (bblk)
+ return DM_MAPIO_KILL;
+
+ return DM_MAPIO_REMAPPED;
+}
+
+static int dust_map_read(struct dust_device *dd, sector_t thisblock,
+ bool fail_read_on_bb)
+{
+ unsigned long flags;
+ int ret = DM_MAPIO_REMAPPED;
+
+ if (fail_read_on_bb) {
+ spin_lock_irqsave(&dd->dust_lock, flags);
+ ret = __dust_map_read(dd, thisblock);
+ spin_unlock_irqrestore(&dd->dust_lock, flags);
+ }
+
+ return ret;
+}
+
+static void __dust_map_write(struct dust_device *dd, sector_t thisblock)
+{
+ struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock);
+
+ if (bblk) {
+ rb_erase(&bblk->node, &dd->badblocklist);
+ dd->badblock_count--;
+ kfree(bblk);
+ if (!dd->quiet_mode) {
+ sector_div(thisblock, dd->sect_per_block);
+ DMINFO("block %llu removed from badblocklist by write",
+ (unsigned long long)thisblock);
+ }
+ }
+}
+
+static int dust_map_write(struct dust_device *dd, sector_t thisblock,
+ bool fail_read_on_bb)
+{
+ unsigned long flags;
+
+ if (fail_read_on_bb) {
+ spin_lock_irqsave(&dd->dust_lock, flags);
+ __dust_map_write(dd, thisblock);
+ spin_unlock_irqrestore(&dd->dust_lock, flags);
+ }
+
+ return DM_MAPIO_REMAPPED;
+}
+
+static int dust_map(struct dm_target *ti, struct bio *bio)
+{
+ struct dust_device *dd = ti->private;
+ int ret;
+
+ bio_set_dev(bio, dd->dev->bdev);
+ bio->bi_iter.bi_sector = dd->start + dm_target_offset(ti, bio->bi_iter.bi_sector);
+
+ if (bio_data_dir(bio) == READ)
+ ret = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb);
+ else
+ ret = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb);
+
+ return ret;
+}
+
+static bool __dust_clear_badblocks(struct rb_root *tree,
+ unsigned long long count)
+{
+ struct rb_node *node = NULL, *nnode = NULL;
+
+ nnode = rb_first(tree);
+ if (nnode == NULL) {
+ BUG_ON(count != 0);
+ return false;
+ }
+
+ while (nnode) {
+ node = nnode;
+ nnode = rb_next(node);
+ rb_erase(node, tree);
+ count--;
+ kfree(node);
+ }
+ BUG_ON(count != 0);
+ BUG_ON(tree->rb_node != NULL);
+
+ return true;
+}
+
+static int dust_clear_badblocks(struct dust_device *dd)
+{
+ unsigned long flags;
+ struct rb_root badblocklist;
+ unsigned long long badblock_count;
+
+ spin_lock_irqsave(&dd->dust_lock, flags);
+ badblocklist = dd->badblocklist;
+ badblock_count = dd->badblock_count;
+ dd->badblocklist = RB_ROOT;
+ dd->badblock_count = 0;
+ spin_unlock_irqrestore(&dd->dust_lock, flags);
+
+ if (!__dust_clear_badblocks(&badblocklist, badblock_count))
+ DMINFO("%s: no badblocks found", __func__);
+ else
+ DMINFO("%s: badblocks cleared", __func__);
+
+ return 0;
+}
+
+/*
+ * Target parameters:
+ *
+ * <device_path> <offset> <blksz>
+ *
+ * device_path: path to the block device
+ * offset: offset to data area from start of device_path
+ * blksz: block size (minimum 512, maximum 1073741824, must be a power of 2)
+ */
+static int dust_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+ struct dust_device *dd;
+ unsigned long long tmp;
+ char dummy;
+ unsigned int blksz;
+ unsigned int sect_per_block;
+ sector_t DUST_MAX_BLKSZ_SECTORS = 2097152;
+ sector_t max_block_sectors = min(ti->len, DUST_MAX_BLKSZ_SECTORS);
+
+ if (argc != 3) {
+ ti->error = "Invalid argument count";
+ return -EINVAL;
+ }
+
+ if (kstrtouint(argv[2], 10, &blksz) || !blksz) {
+ ti->error = "Invalid block size parameter";
+ return -EINVAL;
+ }
+
+ if (blksz < 512) {
+ ti->error = "Block size must be at least 512";
+ return -EINVAL;
+ }
+
+ if (!is_power_of_2(blksz)) {
+ ti->error = "Block size must be a power of 2";
+ return -EINVAL;
+ }
+
+ if (to_sector(blksz) > max_block_sectors) {
+ ti->error = "Block size is too large";
+ return -EINVAL;
+ }
+
+ sect_per_block = (blksz >> SECTOR_SHIFT);
+
+ if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 || tmp != (sector_t)tmp) {
+ ti->error = "Invalid device offset sector";
+ return -EINVAL;
+ }
+
+ dd = kzalloc(sizeof(struct dust_device), GFP_KERNEL);
+ if (dd == NULL) {
+ ti->error = "Cannot allocate context";
+ return -ENOMEM;
+ }
+
+ if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dd->dev)) {
+ ti->error = "Device lookup failed";
+ kfree(dd);
+ return -EINVAL;
+ }
+
+ dd->sect_per_block = sect_per_block;
+ dd->blksz = blksz;
+ dd->start = tmp;
+
+ /*
+ * Whether to fail a read on a "bad" block.
+ * Defaults to false; enabled later by message.
+ */
+ dd->fail_read_on_bb = false;
+
+ /*
+ * Initialize bad block list rbtree.
+ */
+ dd->badblocklist = RB_ROOT;
+ dd->badblock_count = 0;
+ spin_lock_init(&dd->dust_lock);
+
+ dd->quiet_mode = false;
+
+ BUG_ON(dm_set_target_max_io_len(ti, dd->sect_per_block) != 0);
+
+ ti->num_discard_bios = 1;
+ ti->num_flush_bios = 1;
+ ti->private = dd;
+
+ return 0;
+}
+
+static void dust_dtr(struct dm_target *ti)
+{
+ struct dust_device *dd = ti->private;
+
+ __dust_clear_badblocks(&dd->badblocklist, dd->badblock_count);
+ dm_put_device(ti, dd->dev);
+ kfree(dd);
+}
+
+static int dust_message(struct dm_target *ti, unsigned int argc, char **argv,
+ char *result_buf, unsigned int maxlen)
+{
+ struct dust_device *dd = ti->private;
+ sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT;
+ bool invalid_msg = false;
+ int result = -EINVAL;
+ unsigned long long tmp, block;
+ unsigned long flags;
+ char dummy;
+
+ if (argc == 1) {
+ if (!strcasecmp(argv[0], "addbadblock") ||
+ !strcasecmp(argv[0], "removebadblock") ||
+ !strcasecmp(argv[0], "queryblock")) {
+ DMERR("%s requires an additional argument", argv[0]);
+ } else if (!strcasecmp(argv[0], "disable")) {
+ DMINFO("disabling read failures on bad sectors");
+ dd->fail_read_on_bb = false;
+ result = 0;
+ } else if (!strcasecmp(argv[0], "enable")) {
+ DMINFO("enabling read failures on bad sectors");
+ dd->fail_read_on_bb = true;
+ result = 0;
+ } else if (!strcasecmp(argv[0], "countbadblocks")) {
+ spin_lock_irqsave(&dd->dust_lock, flags);
+ DMINFO("countbadblocks: %llu badblock(s) found",
+ dd->badblock_count);
+ spin_unlock_irqrestore(&dd->dust_lock, flags);
+ result = 0;
+ } else if (!strcasecmp(argv[0], "clearbadblocks")) {
+ result = dust_clear_badblocks(dd);
+ } else if (!strcasecmp(argv[0], "quiet")) {
+ if (!dd->quiet_mode)
+ dd->quiet_mode = true;
+ else
+ dd->quiet_mode = false;
+ result = 0;
+ } else {
+ invalid_msg = true;
+ }
+ } else if (argc == 2) {
+ if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1)
+ return result;
+
+ block = tmp;
+ sector_div(size, dd->sect_per_block);
+ if (block > size) {
+ DMERR("selected block value out of range");
+ return result;
+ }
+
+ if (!strcasecmp(argv[0], "addbadblock"))
+ result = dust_add_block(dd, block);
+ else if (!strcasecmp(argv[0], "removebadblock"))
+ result = dust_remove_block(dd, block);
+ else if (!strcasecmp(argv[0], "queryblock"))
+ result = dust_query_block(dd, block);
+ else
+ invalid_msg = true;
+
+ } else
+ DMERR("invalid number of arguments '%d'", argc);
+
+ if (invalid_msg)
+ DMERR("unrecognized message '%s' received", argv[0]);
+
+ return result;
+}
+
+static void dust_status(struct dm_target *ti, status_type_t type,
+ unsigned int status_flags, char *result, unsigned int maxlen)
+{
+ struct dust_device *dd = ti->private;
+ unsigned int sz = 0;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%s %s %s", dd->dev->name,
+ dd->fail_read_on_bb ? "fail_read_on_bad_block" : "bypass",
+ dd->quiet_mode ? "quiet" : "verbose");
+ break;
+
+ case STATUSTYPE_TABLE:
+ DMEMIT("%s %llu %u", dd->dev->name,
+ (unsigned long long)dd->start, dd->blksz);
+ break;
+ }
+}
+
+static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
+{
+ struct dust_device *dd = ti->private;
+ struct dm_dev *dev = dd->dev;
+
+ *bdev = dev->bdev;
+
+ /*
+ * Only pass ioctls through if the device sizes match exactly.
+ */
+ if (dd->start ||
+ ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
+ return 1;
+
+ return 0;
+}
+
+static int dust_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn,
+ void *data)
+{
+ struct dust_device *dd = ti->private;
+
+ return fn(ti, dd->dev, dd->start, ti->len, data);
+}
+
+static struct target_type dust_target = {
+ .name = "dust",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .ctr = dust_ctr,
+ .dtr = dust_dtr,
+ .iterate_devices = dust_iterate_devices,
+ .map = dust_map,
+ .message = dust_message,
+ .status = dust_status,
+ .prepare_ioctl = dust_prepare_ioctl,
+};
+
+static int __init dm_dust_init(void)
+{
+ int result = dm_register_target(&dust_target);
+
+ if (result < 0)
+ DMERR("dm_register_target failed %d", result);
+
+ return result;
+}
+
+static void __exit dm_dust_exit(void)
+{
+ dm_unregister_target(&dust_target);
+}
+
+module_init(dm_dust_init);
+module_exit(dm_dust_exit);
+
+MODULE_DESCRIPTION(DM_NAME " dust test target");
+MODULE_AUTHOR("Bryan Gurney <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
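
The new target is driven entirely through the constructor arguments and the messages defined above. A usage sketch with dmsetup (not part of the patch; /dev/vdb1 and its 33552384-sector size are placeholders for a scratch device):

	# Table line: <start> <len> dust <device_path> <offset> <blksz>
	dmsetup create dust1 --table '0 33552384 dust /dev/vdb1 0 512'

	# Mark block 60 bad, then start failing reads that hit bad blocks.
	dmsetup message dust1 0 addbadblock 60
	dmsetup message dust1 0 enable

	# Reads of block 60 via /dev/mapper/dust1 now fail with an I/O error;
	# writing the block removes it from the bad-block list again.
	dmsetup message dust1 0 countbadblocks
	dmsetup status dust1		# shows fail_read_on_bad_block vs. bypass
	dmsetup message dust1 0 disable
	dmsetup remove dust1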
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
index 8e48920a3ffa..bdb84b8e7162 100644
--- a/drivers/md/dm-era-target.c
+++ b/drivers/md/dm-era-target.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include "dm.h"
#include "persistent-data/dm-transaction-manager.h"
#include "persistent-data/dm-bitset.h"
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 12b5216c2cfe..3f4139ac1f60 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -11,6 +11,7 @@
#define _LINUX_DM_EXCEPTION_STORE
#include <linux/blkdev.h>
+#include <linux/list_bl.h>
#include <linux/device-mapper.h>
/*
@@ -27,7 +28,7 @@ typedef sector_t chunk_t;
* chunk within the device.
*/
struct dm_exception {
- struct list_head hash_list;
+ struct hlist_bl_node hash_list;
chunk_t old_chunk;
chunk_t new_chunk;
@@ -135,9 +136,8 @@ struct dm_dev *dm_snap_cow(struct dm_snapshot *snap);
/*
* Funtions to manipulate consecutive chunks
*/
-# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
-# define DM_CHUNK_CONSECUTIVE_BITS 8
-# define DM_CHUNK_NUMBER_BITS 56
+#define DM_CHUNK_CONSECUTIVE_BITS 8
+#define DM_CHUNK_NUMBER_BITS 56
static inline chunk_t dm_chunk_number(chunk_t chunk)
{
@@ -163,29 +163,6 @@ static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS);
}
-# else
-# define DM_CHUNK_CONSECUTIVE_BITS 0
-
-static inline chunk_t dm_chunk_number(chunk_t chunk)
-{
- return chunk;
-}
-
-static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
-{
- return 0;
-}
-
-static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
-{
-}
-
-static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
-{
-}
-
-# endif
-
/*
* Return the number of sectors in the device.
*/
diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c
index 4b76f84424c3..728733a514c7 100644
--- a/drivers/md/dm-init.c
+++ b/drivers/md/dm-init.c
@@ -140,8 +140,8 @@ static char __init *dm_parse_table_entry(struct dm_device *dev, char *str)
return ERR_PTR(-EINVAL);
}
/* target_args */
- dev->target_args_array[n] = kstrndup(field[3], GFP_KERNEL,
- DM_MAX_STR_SIZE);
+ dev->target_args_array[n] = kstrndup(field[3], DM_MAX_STR_SIZE,
+ GFP_KERNEL);
if (!dev->target_args_array[n])
return ERR_PTR(-ENOMEM);
@@ -160,7 +160,7 @@ static int __init dm_parse_table(struct dm_device *dev, char *str)
while (table_entry) {
DMDEBUG("parsing table \"%s\"", str);
- if (++dev->dmi.target_count >= DM_MAX_TARGETS) {
+ if (++dev->dmi.target_count > DM_MAX_TARGETS) {
DMERR("too many targets %u > %d",
dev->dmi.target_count, DM_MAX_TARGETS);
return -EINVAL;
@@ -242,9 +242,9 @@ static int __init dm_parse_devices(struct list_head *devices, char *str)
return -ENOMEM;
list_add_tail(&dev->list, devices);
- if (++ndev >= DM_MAX_DEVICES) {
- DMERR("too many targets %u > %d",
- dev->dmi.target_count, DM_MAX_TARGETS);
+ if (++ndev > DM_MAX_DEVICES) {
+ DMERR("too many devices %lu > %d",
+ ndev, DM_MAX_DEVICES);
return -EINVAL;
}
@@ -272,10 +272,10 @@ static int __init dm_init_init(void)
return 0;
if (strlen(create) >= DM_MAX_STR_SIZE) {
- DMERR("Argument is too big. Limit is %d\n", DM_MAX_STR_SIZE);
+ DMERR("Argument is too big. Limit is %d", DM_MAX_STR_SIZE);
return -EINVAL;
}
- str = kstrndup(create, GFP_KERNEL, DM_MAX_STR_SIZE);
+ str = kstrndup(create, DM_MAX_STR_SIZE, GFP_KERNEL);
if (!str)
return -ENOMEM;
@@ -283,7 +283,7 @@ static int __init dm_init_init(void)
if (r)
goto out;
- DMINFO("waiting for all devices to be available before creating mapped devices\n");
+ DMINFO("waiting for all devices to be available before creating mapped devices");
wait_for_device_probe();
list_for_each_entry(dev, &devices, list) {
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 7c678f50aaa3..44e76cda087a 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -15,6 +15,7 @@
#include <linux/rbtree.h>
#include <linux/delay.h>
#include <linux/random.h>
+#include <linux/reboot.h>
#include <crypto/hash.h>
#include <crypto/skcipher.h>
#include <linux/async_tx.h>
@@ -24,6 +25,7 @@
#define DEFAULT_INTERLEAVE_SECTORS 32768
#define DEFAULT_JOURNAL_SIZE_FACTOR 7
+#define DEFAULT_SECTORS_PER_BITMAP_BIT 32768
#define DEFAULT_BUFFER_SECTORS 128
#define DEFAULT_JOURNAL_WATERMARK 50
#define DEFAULT_SYNC_MSEC 10000
@@ -33,6 +35,8 @@
#define METADATA_WORKQUEUE_MAX_ACTIVE 16
#define RECALC_SECTORS 8192
#define RECALC_WRITE_SUPER 16
+#define BITMAP_BLOCK_SIZE 4096 /* don't change it */
+#define BITMAP_FLUSH_INTERVAL (10 * HZ)
/*
* Warning - DEBUG_PRINT prints security-sensitive data to the log,
@@ -48,6 +52,7 @@
#define SB_MAGIC "integrt"
#define SB_VERSION_1 1
#define SB_VERSION_2 2
+#define SB_VERSION_3 3
#define SB_SECTORS 8
#define MAX_SECTORS_PER_BLOCK 8
@@ -60,12 +65,14 @@ struct superblock {
__u64 provided_data_sectors; /* userspace uses this value */
__u32 flags;
__u8 log2_sectors_per_block;
- __u8 pad[3];
+ __u8 log2_blocks_per_bitmap_bit;
+ __u8 pad[2];
__u64 recalc_sector;
};
#define SB_FLAG_HAVE_JOURNAL_MAC 0x1
#define SB_FLAG_RECALCULATING 0x2
+#define SB_FLAG_DIRTY_BITMAP 0x4
#define JOURNAL_ENTRY_ROUNDUP 8
@@ -88,14 +95,10 @@ struct journal_entry {
#if BITS_PER_LONG == 64
#define journal_entry_set_sector(je, x) do { smp_wmb(); WRITE_ONCE((je)->u.sector, cpu_to_le64(x)); } while (0)
-#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector)
-#elif defined(CONFIG_LBDAF)
-#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32((x) >> 32)); } while (0)
-#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector)
#else
-#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32(0)); } while (0)
-#define journal_entry_get_sector(je) le32_to_cpu((je)->u.s.sector_lo)
+#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32((x) >> 32)); } while (0)
#endif
+#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector)
#define journal_entry_is_unused(je) ((je)->u.s.sector_hi == cpu_to_le32(-1))
#define journal_entry_set_unused(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-1)); } while (0)
#define journal_entry_is_inprogress(je) ((je)->u.s.sector_hi == cpu_to_le32(-2))
@@ -155,9 +158,18 @@ struct dm_integrity_c {
struct workqueue_struct *metadata_wq;
struct superblock *sb;
unsigned journal_pages;
+ unsigned n_bitmap_blocks;
+
struct page_list *journal;
struct page_list *journal_io;
struct page_list *journal_xor;
+ struct page_list *recalc_bitmap;
+ struct page_list *may_write_bitmap;
+ struct bitmap_block_status *bbs;
+ unsigned bitmap_flush_interval;
+ int synchronous_mode;
+ struct bio_list synchronous_bios;
+ struct delayed_work bitmap_flush_work;
struct crypto_skcipher *journal_crypt;
struct scatterlist **journal_scatterlist;
@@ -184,6 +196,7 @@ struct dm_integrity_c {
__s8 log2_metadata_run;
__u8 log2_buffer_sectors;
__u8 sectors_per_block;
+ __u8 log2_blocks_per_bitmap_bit;
unsigned char mode;
int suspending;
@@ -236,17 +249,20 @@ struct dm_integrity_c {
bool journal_uptodate;
bool just_formatted;
+ bool recalculate_flag;
struct alg_spec internal_hash_alg;
struct alg_spec journal_crypt_alg;
struct alg_spec journal_mac_alg;
atomic64_t number_of_mismatches;
+
+ struct notifier_block reboot_notifier;
};
struct dm_integrity_range {
sector_t logical_sector;
- unsigned n_sectors;
+ sector_t n_sectors;
bool waiting;
union {
struct rb_node node;
@@ -292,6 +308,16 @@ struct journal_io {
struct journal_completion *comp;
};
+struct bitmap_block_status {
+ struct work_struct work;
+ struct dm_integrity_c *ic;
+ unsigned idx;
+ unsigned long *bitmap;
+ struct bio_list bio_queue;
+ spinlock_t bio_queue_lock;
+
+};
+
static struct kmem_cache *journal_io_cache;
#define JOURNAL_IO_MEMPOOL 32
@@ -427,7 +453,9 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr)
static void sb_set_version(struct dm_integrity_c *ic)
{
- if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
+ if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP))
+ ic->sb->version = SB_VERSION_3;
+ else if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
ic->sb->version = SB_VERSION_2;
else
ic->sb->version = SB_VERSION_1;
@@ -451,6 +479,137 @@ static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags)
return dm_io(&io_req, 1, &io_loc, NULL);
}
+#define BITMAP_OP_TEST_ALL_SET 0
+#define BITMAP_OP_TEST_ALL_CLEAR 1
+#define BITMAP_OP_SET 2
+#define BITMAP_OP_CLEAR 3
+
+static bool block_bitmap_op(struct dm_integrity_c *ic, struct page_list *bitmap,
+ sector_t sector, sector_t n_sectors, int mode)
+{
+ unsigned long bit, end_bit, this_end_bit, page, end_page;
+ unsigned long *data;
+
+ if (unlikely(((sector | n_sectors) & ((1 << ic->sb->log2_sectors_per_block) - 1)) != 0)) {
+ DMCRIT("invalid bitmap access (%llx,%llx,%d,%d,%d)",
+ (unsigned long long)sector,
+ (unsigned long long)n_sectors,
+ ic->sb->log2_sectors_per_block,
+ ic->log2_blocks_per_bitmap_bit,
+ mode);
+ BUG();
+ }
+
+ if (unlikely(!n_sectors))
+ return true;
+
+ bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
+ end_bit = (sector + n_sectors - 1) >>
+ (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
+
+ page = bit / (PAGE_SIZE * 8);
+ bit %= PAGE_SIZE * 8;
+
+ end_page = end_bit / (PAGE_SIZE * 8);
+ end_bit %= PAGE_SIZE * 8;
+
+repeat:
+ if (page < end_page) {
+ this_end_bit = PAGE_SIZE * 8 - 1;
+ } else {
+ this_end_bit = end_bit;
+ }
+
+ data = lowmem_page_address(bitmap[page].page);
+
+ if (mode == BITMAP_OP_TEST_ALL_SET) {
+ while (bit <= this_end_bit) {
+ if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
+ do {
+ if (data[bit / BITS_PER_LONG] != -1)
+ return false;
+ bit += BITS_PER_LONG;
+ } while (this_end_bit >= bit + BITS_PER_LONG - 1);
+ continue;
+ }
+ if (!test_bit(bit, data))
+ return false;
+ bit++;
+ }
+ } else if (mode == BITMAP_OP_TEST_ALL_CLEAR) {
+ while (bit <= this_end_bit) {
+ if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
+ do {
+ if (data[bit / BITS_PER_LONG] != 0)
+ return false;
+ bit += BITS_PER_LONG;
+ } while (this_end_bit >= bit + BITS_PER_LONG - 1);
+ continue;
+ }
+ if (test_bit(bit, data))
+ return false;
+ bit++;
+ }
+ } else if (mode == BITMAP_OP_SET) {
+ while (bit <= this_end_bit) {
+ if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
+ do {
+ data[bit / BITS_PER_LONG] = -1;
+ bit += BITS_PER_LONG;
+ } while (this_end_bit >= bit + BITS_PER_LONG - 1);
+ continue;
+ }
+ __set_bit(bit, data);
+ bit++;
+ }
+ } else if (mode == BITMAP_OP_CLEAR) {
+ if (!bit && this_end_bit == PAGE_SIZE * 8 - 1)
+ clear_page(data);
+ else while (bit <= this_end_bit) {
+ if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) {
+ do {
+ data[bit / BITS_PER_LONG] = 0;
+ bit += BITS_PER_LONG;
+ } while (this_end_bit >= bit + BITS_PER_LONG - 1);
+ continue;
+ }
+ __clear_bit(bit, data);
+ bit++;
+ }
+ } else {
+ BUG();
+ }
+
+ if (unlikely(page < end_page)) {
+ bit = 0;
+ page++;
+ goto repeat;
+ }
+
+ return true;
+}
+
+static void block_bitmap_copy(struct dm_integrity_c *ic, struct page_list *dst, struct page_list *src)
+{
+ unsigned n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE);
+ unsigned i;
+
+ for (i = 0; i < n_bitmap_pages; i++) {
+ unsigned long *dst_data = lowmem_page_address(dst[i].page);
+ unsigned long *src_data = lowmem_page_address(src[i].page);
+ copy_page(dst_data, src_data);
+ }
+}
+
+static struct bitmap_block_status *sector_to_bitmap_block(struct dm_integrity_c *ic, sector_t sector)
+{
+ unsigned bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
+ unsigned bitmap_block = bit / (BITMAP_BLOCK_SIZE * 8);
+
+ BUG_ON(bitmap_block >= ic->n_bitmap_blocks);
+ return &ic->bbs[bitmap_block];
+}
+
static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset,
bool e, const char *function)
{
@@ -459,8 +618,8 @@ static void access_journal_check(struct dm_integrity_c *ic, unsigned section, un
if (unlikely(section >= ic->journal_sections) ||
unlikely(offset >= limit)) {
- printk(KERN_CRIT "%s: invalid access at (%u,%u), limit (%u,%u)\n",
- function, section, offset, ic->journal_sections, limit);
+ DMCRIT("%s: invalid access at (%u,%u), limit (%u,%u)",
+ function, section, offset, ic->journal_sections, limit);
BUG();
}
#endif
@@ -532,7 +691,6 @@ static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result
unsigned j, size;
desc->tfm = ic->journal_mac;
- desc->flags = 0;
r = crypto_shash_init(desc);
if (unlikely(r)) {
@@ -761,12 +919,12 @@ static void complete_journal_io(unsigned long error, void *context)
complete_journal_op(comp);
}
-static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section,
- unsigned n_sections, struct journal_completion *comp)
+static void rw_journal_sectors(struct dm_integrity_c *ic, int op, int op_flags,
+ unsigned sector, unsigned n_sectors, struct journal_completion *comp)
{
struct dm_io_request io_req;
struct dm_io_region io_loc;
- unsigned sector, n_sectors, pl_index, pl_offset;
+ unsigned pl_index, pl_offset;
int r;
if (unlikely(dm_integrity_failed(ic))) {
@@ -775,9 +933,6 @@ static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned
return;
}
- sector = section * ic->journal_section_sectors;
- n_sectors = n_sections * ic->journal_section_sectors;
-
pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
@@ -810,6 +965,17 @@ static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned
}
}
+static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section,
+ unsigned n_sections, struct journal_completion *comp)
+{
+ unsigned sector, n_sectors;
+
+ sector = section * ic->journal_section_sectors;
+ n_sectors = n_sections * ic->journal_section_sectors;
+
+ rw_journal_sectors(ic, op, op_flags, sector, n_sectors, comp);
+}
+
static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections)
{
struct journal_completion io_comp;
@@ -993,6 +1159,12 @@ static void wait_and_add_new_range(struct dm_integrity_c *ic, struct dm_integrit
} while (unlikely(new_range->waiting));
}
+static void add_new_range_and_wait(struct dm_integrity_c *ic, struct dm_integrity_range *new_range)
+{
+ if (unlikely(!add_new_range(ic, new_range, true)))
+ wait_and_add_new_range(ic, new_range);
+}
+
static void init_journal_node(struct journal_node *node)
{
RB_CLEAR_NODE(&node->node);
@@ -1209,6 +1381,14 @@ static void do_endio(struct dm_integrity_c *ic, struct bio *bio)
int r = dm_integrity_failed(ic);
if (unlikely(r) && !bio->bi_status)
bio->bi_status = errno_to_blk_status(r);
+ if (unlikely(ic->synchronous_mode) && bio_op(bio) == REQ_OP_WRITE) {
+ unsigned long flags;
+ spin_lock_irqsave(&ic->endio_wait.lock, flags);
+ bio_list_add(&ic->synchronous_bios, bio);
+ queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
+ spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
+ return;
+ }
bio_endio(bio);
}
@@ -1276,7 +1456,6 @@ static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector
unsigned digest_size;
req->tfm = ic->internal_hash;
- req->flags = 0;
r = crypto_shash_init(req);
if (unlikely(r < 0)) {
@@ -1483,7 +1662,8 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
else
wanted_tag_size *= ic->tag_size;
if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) {
- DMERR("Invalid integrity data size %u, expected %u", bip->bip_iter.bi_size, wanted_tag_size);
+ DMERR("Invalid integrity data size %u, expected %u",
+ bip->bip_iter.bi_size, wanted_tag_size);
return DM_MAPIO_KILL;
}
}
@@ -1687,7 +1867,7 @@ retry:
unsigned ws, we, range_sectors;
dio->range.n_sectors = min(dio->range.n_sectors,
- ic->free_sectors << ic->sb->log2_sectors_per_block);
+ (sector_t)ic->free_sectors << ic->sb->log2_sectors_per_block);
if (unlikely(!dio->range.n_sectors)) {
if (from_map)
goto offload_to_thread;
@@ -1770,6 +1950,20 @@ offload_to_thread:
goto journal_read_write;
}
+ if (ic->mode == 'B' && dio->write) {
+ if (!block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
+ dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) {
+ struct bitmap_block_status *bbs;
+
+ bbs = sector_to_bitmap_block(ic, dio->range.logical_sector);
+ spin_lock(&bbs->bio_queue_lock);
+ bio_list_add(&bbs->bio_queue, bio);
+ spin_unlock(&bbs->bio_queue_lock);
+ queue_work(ic->writer_wq, &bbs->work);
+ return;
+ }
+ }
+
dio->in_flight = (atomic_t)ATOMIC_INIT(2);
if (need_sync_io) {
@@ -1796,10 +1990,15 @@ offload_to_thread:
if (need_sync_io) {
wait_for_completion_io(&read_comp);
- if (unlikely(ic->recalc_wq != NULL) &&
- ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
+ if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
dio->range.logical_sector + dio->range.n_sectors > le64_to_cpu(ic->sb->recalc_sector))
goto skip_check;
+ if (ic->mode == 'B') {
+ if (!block_bitmap_op(ic, ic->recalc_bitmap, dio->range.logical_sector,
+ dio->range.n_sectors, BITMAP_OP_TEST_ALL_CLEAR))
+ goto skip_check;
+ }
+
if (likely(!bio->bi_status))
integrity_metadata(&dio->work);
else
@@ -1837,8 +2036,16 @@ static void pad_uncommitted(struct dm_integrity_c *ic)
wraparound_section(ic, &ic->free_section);
ic->n_uncommitted_sections++;
}
- WARN_ON(ic->journal_sections * ic->journal_section_entries !=
- (ic->n_uncommitted_sections + ic->n_committed_sections) * ic->journal_section_entries + ic->free_sectors);
+ if (WARN_ON(ic->journal_sections * ic->journal_section_entries !=
+ (ic->n_uncommitted_sections + ic->n_committed_sections) *
+ ic->journal_section_entries + ic->free_sectors)) {
+ DMCRIT("journal_sections %u, journal_section_entries %u, "
+ "n_uncommitted_sections %u, n_committed_sections %u, "
+ "journal_section_entries %u, free_sectors %u",
+ ic->journal_sections, ic->journal_section_entries,
+ ic->n_uncommitted_sections, ic->n_committed_sections,
+ ic->journal_section_entries, ic->free_sectors);
+ }
}
static void integrity_commit(struct work_struct *w)
@@ -1987,8 +2194,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start,
io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block;
spin_lock_irq(&ic->endio_wait.lock);
- if (unlikely(!add_new_range(ic, &io->range, true)))
- wait_and_add_new_range(ic, &io->range);
+ add_new_range_and_wait(ic, &io->range);
if (likely(!from_replay)) {
struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries];
@@ -2126,11 +2332,14 @@ static void integrity_recalc(struct work_struct *w)
sector_t area, offset;
sector_t metadata_block;
unsigned metadata_offset;
+ sector_t logical_sector, n_sectors;
__u8 *t;
unsigned i;
int r;
unsigned super_counter = 0;
+ DEBUG_print("start recalculation... (position %llx)\n", le64_to_cpu(ic->sb->recalc_sector));
+
spin_lock_irq(&ic->endio_wait.lock);
next_chunk:
@@ -2139,21 +2348,49 @@ next_chunk:
goto unlock_ret;
range.logical_sector = le64_to_cpu(ic->sb->recalc_sector);
- if (unlikely(range.logical_sector >= ic->provided_data_sectors))
+ if (unlikely(range.logical_sector >= ic->provided_data_sectors)) {
+ if (ic->mode == 'B') {
+ DEBUG_print("queue_delayed_work: bitmap_flush_work\n");
+ queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
+ }
goto unlock_ret;
+ }
get_area_and_offset(ic, range.logical_sector, &area, &offset);
range.n_sectors = min((sector_t)RECALC_SECTORS, ic->provided_data_sectors - range.logical_sector);
if (!ic->meta_dev)
- range.n_sectors = min(range.n_sectors, (1U << ic->sb->log2_interleave_sectors) - (unsigned)offset);
-
- if (unlikely(!add_new_range(ic, &range, true)))
- wait_and_add_new_range(ic, &range);
+ range.n_sectors = min(range.n_sectors, ((sector_t)1U << ic->sb->log2_interleave_sectors) - (unsigned)offset);
+ add_new_range_and_wait(ic, &range);
spin_unlock_irq(&ic->endio_wait.lock);
+ logical_sector = range.logical_sector;
+ n_sectors = range.n_sectors;
+
+ if (ic->mode == 'B') {
+ if (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, n_sectors, BITMAP_OP_TEST_ALL_CLEAR)) {
+ goto advance_and_next;
+ }
+ while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector,
+ ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) {
+ logical_sector += ic->sectors_per_block;
+ n_sectors -= ic->sectors_per_block;
+ cond_resched();
+ }
+ while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector + n_sectors - ic->sectors_per_block,
+ ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) {
+ n_sectors -= ic->sectors_per_block;
+ cond_resched();
+ }
+ get_area_and_offset(ic, logical_sector, &area, &offset);
+ }
+
+ DEBUG_print("recalculating: %lx, %lx\n", logical_sector, n_sectors);
if (unlikely(++super_counter == RECALC_WRITE_SUPER)) {
recalc_write_super(ic);
+ if (ic->mode == 'B') {
+ queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval);
+ }
super_counter = 0;
}
@@ -2168,7 +2405,7 @@ next_chunk:
io_req.client = ic->io;
io_loc.bdev = ic->dev->bdev;
io_loc.sector = get_data_sector(ic, area, offset);
- io_loc.count = range.n_sectors;
+ io_loc.count = n_sectors;
r = dm_io(&io_req, 1, &io_loc, NULL);
if (unlikely(r)) {
@@ -2177,8 +2414,8 @@ next_chunk:
}
t = ic->recalc_tags;
- for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) {
- integrity_sector_checksum(ic, range.logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t);
+ for (i = 0; i < n_sectors; i += ic->sectors_per_block) {
+ integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t);
t += ic->tag_size;
}
@@ -2190,6 +2427,9 @@ next_chunk:
goto err;
}
+advance_and_next:
+ cond_resched();
+
spin_lock_irq(&ic->endio_wait.lock);
remove_range_unlocked(ic, &range);
ic->sb->recalc_sector = cpu_to_le64(range.logical_sector + range.n_sectors);
@@ -2205,6 +2445,103 @@ unlock_ret:
recalc_write_super(ic);
}
+static void bitmap_block_work(struct work_struct *w)
+{
+ struct bitmap_block_status *bbs = container_of(w, struct bitmap_block_status, work);
+ struct dm_integrity_c *ic = bbs->ic;
+ struct bio *bio;
+ struct bio_list bio_queue;
+ struct bio_list waiting;
+
+ bio_list_init(&waiting);
+
+ spin_lock(&bbs->bio_queue_lock);
+ bio_queue = bbs->bio_queue;
+ bio_list_init(&bbs->bio_queue);
+ spin_unlock(&bbs->bio_queue_lock);
+
+ while ((bio = bio_list_pop(&bio_queue))) {
+ struct dm_integrity_io *dio;
+
+ dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
+
+ if (block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
+ dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) {
+ remove_range(ic, &dio->range);
+ INIT_WORK(&dio->work, integrity_bio_wait);
+ queue_work(ic->wait_wq, &dio->work);
+ } else {
+ block_bitmap_op(ic, ic->journal, dio->range.logical_sector,
+ dio->range.n_sectors, BITMAP_OP_SET);
+ bio_list_add(&waiting, bio);
+ }
+ }
+
+ if (bio_list_empty(&waiting))
+ return;
+
+ rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC,
+ bbs->idx * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT),
+ BITMAP_BLOCK_SIZE >> SECTOR_SHIFT, NULL);
+
+ while ((bio = bio_list_pop(&waiting))) {
+ struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
+
+ block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector,
+ dio->range.n_sectors, BITMAP_OP_SET);
+
+ remove_range(ic, &dio->range);
+ INIT_WORK(&dio->work, integrity_bio_wait);
+ queue_work(ic->wait_wq, &dio->work);
+ }
+
+ queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval);
+}
+
+static void bitmap_flush_work(struct work_struct *work)
+{
+ struct dm_integrity_c *ic = container_of(work, struct dm_integrity_c, bitmap_flush_work.work);
+ struct dm_integrity_range range;
+ unsigned long limit;
+ struct bio *bio;
+
+ dm_integrity_flush_buffers(ic);
+
+ range.logical_sector = 0;
+ range.n_sectors = ic->provided_data_sectors;
+
+ spin_lock_irq(&ic->endio_wait.lock);
+ add_new_range_and_wait(ic, &range);
+ spin_unlock_irq(&ic->endio_wait.lock);
+
+ dm_integrity_flush_buffers(ic);
+ if (ic->meta_dev)
+ blkdev_issue_flush(ic->dev->bdev, GFP_NOIO, NULL);
+
+ limit = ic->provided_data_sectors;
+ if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
+ limit = le64_to_cpu(ic->sb->recalc_sector)
+ >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit)
+ << (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit);
+ }
+ /*DEBUG_print("zeroing journal\n");*/
+ block_bitmap_op(ic, ic->journal, 0, limit, BITMAP_OP_CLEAR);
+ block_bitmap_op(ic, ic->may_write_bitmap, 0, limit, BITMAP_OP_CLEAR);
+
+ rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
+ ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
+
+ spin_lock_irq(&ic->endio_wait.lock);
+ remove_range_unlocked(ic, &range);
+ while (unlikely((bio = bio_list_pop(&ic->synchronous_bios)) != NULL)) {
+ bio_endio(bio);
+ spin_unlock_irq(&ic->endio_wait.lock);
+ spin_lock_irq(&ic->endio_wait.lock);
+ }
+ spin_unlock_irq(&ic->endio_wait.lock);
+}
+
+
static void init_journal(struct dm_integrity_c *ic, unsigned start_section,
unsigned n_sections, unsigned char commit_seq)
{
@@ -2401,9 +2738,37 @@ clear_journal:
init_journal_node(&ic->journal_tree[i]);
}
+static void dm_integrity_enter_synchronous_mode(struct dm_integrity_c *ic)
+{
+ DEBUG_print("dm_integrity_enter_synchronous_mode\n");
+
+ if (ic->mode == 'B') {
+ ic->bitmap_flush_interval = msecs_to_jiffies(10) + 1;
+ ic->synchronous_mode = 1;
+
+ cancel_delayed_work_sync(&ic->bitmap_flush_work);
+ queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0);
+ flush_workqueue(ic->commit_wq);
+ }
+}
+
+static int dm_integrity_reboot(struct notifier_block *n, unsigned long code, void *x)
+{
+ struct dm_integrity_c *ic = container_of(n, struct dm_integrity_c, reboot_notifier);
+
+ DEBUG_print("dm_integrity_reboot\n");
+
+ dm_integrity_enter_synchronous_mode(ic);
+
+ return NOTIFY_DONE;
+}
+
static void dm_integrity_postsuspend(struct dm_target *ti)
{
struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
+ int r;
+
+ WARN_ON(unregister_reboot_notifier(&ic->reboot_notifier));
del_timer_sync(&ic->autocommit_timer);
@@ -2412,6 +2777,9 @@ static void dm_integrity_postsuspend(struct dm_target *ti)
if (ic->recalc_wq)
drain_workqueue(ic->recalc_wq);
+ if (ic->mode == 'B')
+ cancel_delayed_work_sync(&ic->bitmap_flush_work);
+
queue_work(ic->commit_wq, &ic->commit_work);
drain_workqueue(ic->commit_wq);
@@ -2422,6 +2790,18 @@ static void dm_integrity_postsuspend(struct dm_target *ti)
dm_integrity_flush_buffers(ic);
}
+ if (ic->mode == 'B') {
+ dm_integrity_flush_buffers(ic);
+#if 1
+ /* set to 0 to test bitmap replay code */
+ init_journal(ic, 0, ic->journal_sections, 0);
+ ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
+ r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
+ if (unlikely(r))
+ dm_integrity_io_error(ic, "writing superblock", r);
+#endif
+ }
+
WRITE_ONCE(ic->suspending, 0);
BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
@@ -2432,11 +2812,70 @@ static void dm_integrity_postsuspend(struct dm_target *ti)
static void dm_integrity_resume(struct dm_target *ti)
{
struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private;
+ int r;
+ DEBUG_print("resume\n");
+
+ if (ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) {
+ DEBUG_print("resume dirty_bitmap\n");
+ rw_journal_sectors(ic, REQ_OP_READ, 0, 0,
+ ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
+ if (ic->mode == 'B') {
+ if (ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) {
+ block_bitmap_copy(ic, ic->recalc_bitmap, ic->journal);
+ block_bitmap_copy(ic, ic->may_write_bitmap, ic->journal);
+ if (!block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors,
+ BITMAP_OP_TEST_ALL_CLEAR)) {
+ ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
+ ic->sb->recalc_sector = cpu_to_le64(0);
+ }
+ } else {
+ DEBUG_print("non-matching blocks_per_bitmap_bit: %u, %u\n",
+ ic->sb->log2_blocks_per_bitmap_bit, ic->log2_blocks_per_bitmap_bit);
+ ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit;
+ block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET);
+ block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET);
+ block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_SET);
+ rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
+ ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
+ ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
+ ic->sb->recalc_sector = cpu_to_le64(0);
+ }
+ } else {
+ if (!(ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit &&
+ block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_TEST_ALL_CLEAR))) {
+ ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
+ ic->sb->recalc_sector = cpu_to_le64(0);
+ }
+ init_journal(ic, 0, ic->journal_sections, 0);
+ replay_journal(ic);
+ ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
+ }
+ r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
+ if (unlikely(r))
+ dm_integrity_io_error(ic, "writing superblock", r);
+ } else {
+ replay_journal(ic);
+ if (ic->mode == 'B') {
+ int mode;
+ ic->sb->flags |= cpu_to_le32(SB_FLAG_DIRTY_BITMAP);
+ ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit;
+ r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA);
+ if (unlikely(r))
+ dm_integrity_io_error(ic, "writing superblock", r);
+
+ mode = ic->recalculate_flag ? BITMAP_OP_SET : BITMAP_OP_CLEAR;
+ block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, mode);
+ block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, mode);
+ block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, mode);
+ rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0,
+ ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL);
+ }
+ }
- replay_journal(ic);
-
- if (ic->recalc_wq && ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
+ DEBUG_print("testing recalc: %x\n", ic->sb->flags);
+ if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
__u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector);
+ DEBUG_print("recalc pos: %lx / %lx\n", (long)recalc_pos, ic->provided_data_sectors);
if (recalc_pos < ic->provided_data_sectors) {
queue_work(ic->recalc_wq, &ic->recalc_work);
} else if (recalc_pos > ic->provided_data_sectors) {
@@ -2444,6 +2883,16 @@ static void dm_integrity_resume(struct dm_target *ti)
recalc_write_super(ic);
}
}
+
+ ic->reboot_notifier.notifier_call = dm_integrity_reboot;
+ ic->reboot_notifier.next = NULL;
+ ic->reboot_notifier.priority = INT_MAX - 1; /* be notified after md and before hardware drivers */
+ WARN_ON(register_reboot_notifier(&ic->reboot_notifier));
+
+#if 0
+ /* set to 1 to stress test synchronous mode */
+ dm_integrity_enter_synchronous_mode(ic);
+#endif
}
static void dm_integrity_status(struct dm_target *ti, status_type_t type,
@@ -2468,10 +2917,14 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
__u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100;
watermark_percentage += ic->journal_entries / 2;
do_div(watermark_percentage, ic->journal_entries);
- arg_count = 5;
+ arg_count = 3;
arg_count += !!ic->meta_dev;
arg_count += ic->sectors_per_block != 1;
arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING));
+ arg_count += ic->mode == 'J';
+ arg_count += ic->mode == 'J';
+ arg_count += ic->mode == 'B';
+ arg_count += ic->mode == 'B';
arg_count += !!ic->internal_hash_alg.alg_string;
arg_count += !!ic->journal_crypt_alg.alg_string;
arg_count += !!ic->journal_mac_alg.alg_string;
@@ -2481,13 +2934,19 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
DMEMIT(" meta_device:%s", ic->meta_dev->name);
if (ic->sectors_per_block != 1)
DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT);
- if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))
+ if (ic->recalculate_flag)
DMEMIT(" recalculate");
DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS);
DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors);
- DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage);
- DMEMIT(" commit_time:%u", ic->autocommit_msec);
+ if (ic->mode == 'J') {
+ DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage);
+ DMEMIT(" commit_time:%u", ic->autocommit_msec);
+ }
+ if (ic->mode == 'B') {
+ DMEMIT(" sectors_per_bit:%llu", (unsigned long long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit);
+ DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval));
+ }
#define EMIT_ALG(a, n) \
do { \
@@ -2568,7 +3027,7 @@ static int calculate_device_limits(struct dm_integrity_c *ic)
if (last_sector < ic->start || last_sector >= ic->meta_device_sectors)
return -EINVAL;
} else {
- __u64 meta_size = ic->provided_data_sectors * ic->tag_size;
+ __u64 meta_size = (ic->provided_data_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size;
meta_size = (meta_size + ((1U << (ic->log2_buffer_sectors + SECTOR_SHIFT)) - 1))
>> (ic->log2_buffer_sectors + SECTOR_SHIFT);
meta_size <<= ic->log2_buffer_sectors;
@@ -2665,37 +3124,37 @@ static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic)
blk_queue_max_integrity_segments(disk->queue, UINT_MAX);
}
-static void dm_integrity_free_page_list(struct dm_integrity_c *ic, struct page_list *pl)
+static void dm_integrity_free_page_list(struct page_list *pl)
{
unsigned i;
if (!pl)
return;
- for (i = 0; i < ic->journal_pages; i++)
- if (pl[i].page)
- __free_page(pl[i].page);
+ for (i = 0; pl[i].page; i++)
+ __free_page(pl[i].page);
kvfree(pl);
}
-static struct page_list *dm_integrity_alloc_page_list(struct dm_integrity_c *ic)
+static struct page_list *dm_integrity_alloc_page_list(unsigned n_pages)
{
- size_t page_list_desc_size = ic->journal_pages * sizeof(struct page_list);
struct page_list *pl;
unsigned i;
- pl = kvmalloc(page_list_desc_size, GFP_KERNEL | __GFP_ZERO);
+ pl = kvmalloc_array(n_pages + 1, sizeof(struct page_list), GFP_KERNEL | __GFP_ZERO);
if (!pl)
return NULL;
- for (i = 0; i < ic->journal_pages; i++) {
+ for (i = 0; i < n_pages; i++) {
pl[i].page = alloc_page(GFP_KERNEL);
if (!pl[i].page) {
- dm_integrity_free_page_list(ic, pl);
+ dm_integrity_free_page_list(pl);
return NULL;
}
if (i)
pl[i - 1].next = &pl[i];
}
+ pl[i].page = NULL;
+ pl[i].next = NULL;
return pl;
}
@@ -2708,7 +3167,8 @@ static void dm_integrity_free_journal_scatterlist(struct dm_integrity_c *ic, str
kvfree(sl);
}
-static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, struct page_list *pl)
+static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic,
+ struct page_list *pl)
{
struct scatterlist **sl;
unsigned i;
@@ -2727,7 +3187,8 @@ static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_int
unsigned idx;
page_list_location(ic, i, 0, &start_index, &start_offset);
- page_list_location(ic, i, ic->journal_section_sectors - 1, &end_index, &end_offset);
+ page_list_location(ic, i, ic->journal_section_sectors - 1,
+ &end_index, &end_offset);
n_pages = (end_index - start_index + 1);
@@ -2848,7 +3309,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
}
ic->journal_pages = journal_pages;
- ic->journal = dm_integrity_alloc_page_list(ic);
+ ic->journal = dm_integrity_alloc_page_list(ic->journal_pages);
if (!ic->journal) {
*error = "Could not allocate memory for journal";
r = -ENOMEM;
@@ -2880,7 +3341,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
DEBUG_print("cipher %s, block size %u iv size %u\n",
ic->journal_crypt_alg.alg_string, blocksize, ivsize);
- ic->journal_io = dm_integrity_alloc_page_list(ic);
+ ic->journal_io = dm_integrity_alloc_page_list(ic->journal_pages);
if (!ic->journal_io) {
*error = "Could not allocate memory for journal io";
r = -ENOMEM;
@@ -2904,7 +3365,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
goto bad;
}
- ic->journal_xor = dm_integrity_alloc_page_list(ic);
+ ic->journal_xor = dm_integrity_alloc_page_list(ic->journal_pages);
if (!ic->journal_xor) {
*error = "Could not allocate memory for journal xor";
r = -ENOMEM;
@@ -2928,7 +3389,8 @@ static int create_journal(struct dm_integrity_c *ic, char **error)
sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids);
memset(crypt_iv, 0x00, ivsize);
- skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv);
+ skcipher_request_set_crypt(req, sg, sg,
+ PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv);
init_completion(&comp.comp);
comp.in_flight = (atomic_t)ATOMIC_INIT(1);
if (do_crypt(true, req, &comp))
@@ -3069,7 +3531,7 @@ bad:
* device
* offset from the start of the device
* tag size
- * D - direct writes, J - journal writes, R - recovery mode
+ * D - direct writes, J - journal writes, B - bitmap mode, R - recovery mode
* number of optional arguments
* optional arguments:
* journal_sectors
@@ -3077,10 +3539,14 @@ bad:
* buffer_sectors
* journal_watermark
* commit_time
+ * meta_device
+ * block_size
+ * sectors_per_bit
+ * bitmap_flush_interval
* internal_hash
* journal_crypt
* journal_mac
- * block_size
+ * recalculate
*/
static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
@@ -3093,10 +3559,13 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
{0, 9, "Invalid number of feature args"},
};
unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec;
- bool recalculate;
bool should_write_sb;
__u64 threshold;
unsigned long long start;
+ __s8 log2_sectors_per_bitmap_bit = -1;
+ __s8 log2_blocks_per_bitmap_bit;
+ __u64 bits_in_journal;
+ __u64 n_bitmap_bits;
#define DIRECT_ARGUMENTS 4
@@ -3120,6 +3589,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
init_waitqueue_head(&ic->copy_to_journal_wait);
init_completion(&ic->crypto_backoff);
atomic64_set(&ic->number_of_mismatches, 0);
+ ic->bitmap_flush_interval = BITMAP_FLUSH_INTERVAL;
r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev);
if (r) {
@@ -3142,10 +3612,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
}
}
- if (!strcmp(argv[3], "J") || !strcmp(argv[3], "D") || !strcmp(argv[3], "R"))
+ if (!strcmp(argv[3], "J") || !strcmp(argv[3], "B") ||
+ !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) {
ic->mode = argv[3][0];
- else {
- ti->error = "Invalid mode (expecting J, D, R)";
+ } else {
+ ti->error = "Invalid mode (expecting J, B, D, R)";
r = -EINVAL;
goto bad;
}
@@ -3155,7 +3626,6 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
buffer_sectors = DEFAULT_BUFFER_SECTORS;
journal_watermark = DEFAULT_JOURNAL_WATERMARK;
sync_msec = DEFAULT_SYNC_MSEC;
- recalculate = false;
ic->sectors_per_block = 1;
as.argc = argc - DIRECT_ARGUMENTS;
@@ -3167,6 +3637,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
while (extra_args--) {
const char *opt_string;
unsigned val;
+ unsigned long long llval;
opt_string = dm_shift_arg(&as);
if (!opt_string) {
r = -EINVAL;
@@ -3188,7 +3659,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
dm_put_device(ti, ic->meta_dev);
ic->meta_dev = NULL;
}
- r = dm_get_device(ti, strchr(opt_string, ':') + 1, dm_table_get_mode(ti->table), &ic->meta_dev);
+ r = dm_get_device(ti, strchr(opt_string, ':') + 1,
+ dm_table_get_mode(ti->table), &ic->meta_dev);
if (r) {
ti->error = "Device lookup failed";
goto bad;
@@ -3202,6 +3674,14 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
}
ic->sectors_per_block = val >> SECTOR_SHIFT;
+ } else if (sscanf(opt_string, "sectors_per_bit:%llu%c", &llval, &dummy) == 1) {
+ log2_sectors_per_bitmap_bit = !llval ? 0 : __ilog2_u64(llval);
+ } else if (sscanf(opt_string, "bitmap_flush_interval:%u%c", &val, &dummy) == 1) {
+ if (val >= (uint64_t)UINT_MAX * 1000 / HZ) {
+ r = -EINVAL;
+ ti->error = "Invalid bitmap_flush_interval argument";
+ }
+ ic->bitmap_flush_interval = msecs_to_jiffies(val);
} else if (!strncmp(opt_string, "internal_hash:", strlen("internal_hash:"))) {
r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error,
"Invalid internal_hash argument");
@@ -3218,7 +3698,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (r)
goto bad;
} else if (!strcmp(opt_string, "recalculate")) {
- recalculate = true;
+ ic->recalculate_flag = true;
} else {
r = -EINVAL;
ti->error = "Invalid argument";
@@ -3234,7 +3714,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (!journal_sectors) {
journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS,
- ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR);
+ ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR);
}
if (!buffer_sectors)
@@ -3269,6 +3749,12 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
else
ic->log2_tag_size = -1;
+ if (ic->mode == 'B' && !ic->internal_hash) {
+ r = -EINVAL;
+ ti->error = "Bitmap mode can be only used with internal hash";
+ goto bad;
+ }
+
ic->autocommit_jiffies = msecs_to_jiffies(sync_msec);
ic->autocommit_msec = sync_msec;
timer_setup(&ic->autocommit_timer, autocommit_fn, 0);
@@ -3314,7 +3800,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
}
INIT_WORK(&ic->commit_work, integrity_commit);
- if (ic->mode == 'J') {
+ if (ic->mode == 'J' || ic->mode == 'B') {
ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1);
if (!ic->writer_wq) {
ti->error = "Cannot allocate workqueue";
@@ -3355,7 +3841,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
should_write_sb = true;
}
- if (!ic->sb->version || ic->sb->version > SB_VERSION_2) {
+ if (!ic->sb->version || ic->sb->version > SB_VERSION_3) {
r = -EINVAL;
ti->error = "Unknown version";
goto bad;
@@ -3415,6 +3901,27 @@ try_smaller_buffer:
ti->error = "The device is too small";
goto bad;
}
+
+ if (log2_sectors_per_bitmap_bit < 0)
+ log2_sectors_per_bitmap_bit = __fls(DEFAULT_SECTORS_PER_BITMAP_BIT);
+ if (log2_sectors_per_bitmap_bit < ic->sb->log2_sectors_per_block)
+ log2_sectors_per_bitmap_bit = ic->sb->log2_sectors_per_block;
+
+ bits_in_journal = ((__u64)ic->journal_section_sectors * ic->journal_sections) << (SECTOR_SHIFT + 3);
+ if (bits_in_journal > UINT_MAX)
+ bits_in_journal = UINT_MAX;
+ while (bits_in_journal < (ic->provided_data_sectors + ((sector_t)1 << log2_sectors_per_bitmap_bit) - 1) >> log2_sectors_per_bitmap_bit)
+ log2_sectors_per_bitmap_bit++;
+
+ log2_blocks_per_bitmap_bit = log2_sectors_per_bitmap_bit - ic->sb->log2_sectors_per_block;
+ ic->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit;
+ if (should_write_sb) {
+ ic->sb->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit;
+ }
+ n_bitmap_bits = ((ic->provided_data_sectors >> ic->sb->log2_sectors_per_block)
+ + (((sector_t)1 << log2_blocks_per_bitmap_bit) - 1)) >> log2_blocks_per_bitmap_bit;
+ ic->n_bitmap_blocks = DIV_ROUND_UP(n_bitmap_bits, BITMAP_BLOCK_SIZE * 8);
+
if (!ic->meta_dev)
ic->log2_buffer_sectors = min(ic->log2_buffer_sectors, (__u8)__ffs(ic->metadata_run));
@@ -3439,25 +3946,21 @@ try_smaller_buffer:
DEBUG_print(" journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections));
DEBUG_print(" journal_entries %u\n", ic->journal_entries);
DEBUG_print(" log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors);
- DEBUG_print(" device_sectors 0x%llx\n", (unsigned long long)ic->device_sectors);
+ DEBUG_print(" data_device_sectors 0x%llx\n", i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT);
DEBUG_print(" initial_sectors 0x%x\n", ic->initial_sectors);
DEBUG_print(" metadata_run 0x%x\n", ic->metadata_run);
DEBUG_print(" log2_metadata_run %d\n", ic->log2_metadata_run);
DEBUG_print(" provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors,
(unsigned long long)ic->provided_data_sectors);
DEBUG_print(" log2_buffer_sectors %u\n", ic->log2_buffer_sectors);
+ DEBUG_print(" bits_in_journal %llu\n", (unsigned long long)bits_in_journal);
- if (recalculate && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) {
+ if (ic->recalculate_flag && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) {
ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING);
ic->sb->recalc_sector = cpu_to_le64(0);
}
- if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) {
- if (!ic->internal_hash) {
- r = -EINVAL;
- ti->error = "Recalculate is only valid with internal hash";
- goto bad;
- }
+ if (ic->internal_hash) {
ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1);
if (!ic->recalc_wq ) {
ti->error = "Cannot allocate workqueue";
@@ -3494,6 +3997,45 @@ try_smaller_buffer:
r = create_journal(ic, &ti->error);
if (r)
goto bad;
+
+ }
+
+ if (ic->mode == 'B') {
+ unsigned i;
+ unsigned n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE);
+
+ ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
+ if (!ic->recalc_bitmap) {
+ r = -ENOMEM;
+ goto bad;
+ }
+ ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
+ if (!ic->may_write_bitmap) {
+ r = -ENOMEM;
+ goto bad;
+ }
+ ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL);
+ if (!ic->bbs) {
+ r = -ENOMEM;
+ goto bad;
+ }
+ INIT_DELAYED_WORK(&ic->bitmap_flush_work, bitmap_flush_work);
+ for (i = 0; i < ic->n_bitmap_blocks; i++) {
+ struct bitmap_block_status *bbs = &ic->bbs[i];
+ unsigned sector, pl_index, pl_offset;
+
+ INIT_WORK(&bbs->work, bitmap_block_work);
+ bbs->ic = ic;
+ bbs->idx = i;
+ bio_list_init(&bbs->bio_queue);
+ spin_lock_init(&bbs->bio_queue_lock);
+
+ sector = i * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT);
+ pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
+ pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
+
+ bbs->bitmap = lowmem_page_address(ic->journal[pl_index].page) + pl_offset;
+ }
}
if (should_write_sb) {
@@ -3518,6 +4060,17 @@ try_smaller_buffer:
if (r)
goto bad;
}
+ if (ic->mode == 'B') {
+ unsigned max_io_len = ((sector_t)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit) * (BITMAP_BLOCK_SIZE * 8);
+ if (!max_io_len)
+ max_io_len = 1U << 31;
+ DEBUG_print("max_io_len: old %u, new %u\n", ti->max_io_len, max_io_len);
+ if (!ti->max_io_len || ti->max_io_len > max_io_len) {
+ r = dm_set_target_max_io_len(ti, max_io_len);
+ if (r)
+ goto bad;
+ }
+ }
if (!ic->internal_hash)
dm_integrity_set(ti, ic);
@@ -3526,6 +4079,7 @@ try_smaller_buffer:
ti->flush_supported = true;
return 0;
+
bad:
dm_integrity_dtr(ti);
return r;
@@ -3548,10 +4102,9 @@ static void dm_integrity_dtr(struct dm_target *ti)
destroy_workqueue(ic->writer_wq);
if (ic->recalc_wq)
destroy_workqueue(ic->recalc_wq);
- if (ic->recalc_buffer)
- vfree(ic->recalc_buffer);
- if (ic->recalc_tags)
- kvfree(ic->recalc_tags);
+ vfree(ic->recalc_buffer);
+ kvfree(ic->recalc_tags);
+ kvfree(ic->bbs);
if (ic->bufio)
dm_bufio_client_destroy(ic->bufio);
mempool_exit(&ic->journal_io_mempool);
@@ -3561,9 +4114,11 @@ static void dm_integrity_dtr(struct dm_target *ti)
dm_put_device(ti, ic->dev);
if (ic->meta_dev)
dm_put_device(ti, ic->meta_dev);
- dm_integrity_free_page_list(ic, ic->journal);
- dm_integrity_free_page_list(ic, ic->journal_io);
- dm_integrity_free_page_list(ic, ic->journal_xor);
+ dm_integrity_free_page_list(ic->journal);
+ dm_integrity_free_page_list(ic->journal_io);
+ dm_integrity_free_page_list(ic->journal_xor);
+ dm_integrity_free_page_list(ic->recalc_bitmap);
+ dm_integrity_free_page_list(ic->may_write_bitmap);
if (ic->journal_scatterlist)
dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist);
if (ic->journal_io_scatterlist)
@@ -3601,7 +4156,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
static struct target_type integrity_target = {
.name = "integrity",
- .version = {1, 2, 0},
+ .version = {1, 3, 0},
.module = THIS_MODULE,
.features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
.ctr = dm_integrity_ctr,
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index c740153b4e52..1e03bc89e20f 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -2069,7 +2069,7 @@ int __init dm_early_create(struct dm_ioctl *dmi,
/* alloc table */
r = dm_table_create(&t, get_mode(dmi), dmi->target_count, md);
if (r)
- goto err_destroy_dm;
+ goto err_hash_remove;
/* add targets */
for (i = 0; i < dmi->target_count; i++) {
@@ -2116,6 +2116,10 @@ int __init dm_early_create(struct dm_ioctl *dmi,
err_destroy_table:
dm_table_destroy(t);
+err_hash_remove:
+ (void) __hash_remove(__get_name_cell(dmi->name));
+ /* release reference from __get_name_cell */
+ dm_put(md);
err_destroy_dm:
dm_put(md);
dm_destroy(md);
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 9ea2b0291f20..e549392e0ea5 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -60,6 +60,7 @@
#define WRITE_LOG_VERSION 1ULL
#define WRITE_LOG_MAGIC 0x6a736677736872ULL
+#define WRITE_LOG_SUPER_SECTOR 0
/*
* The disk format for this is braindead simple.
@@ -115,6 +116,7 @@ struct log_writes_c {
struct list_head logging_blocks;
wait_queue_head_t wait;
struct task_struct *log_kthread;
+ struct completion super_done;
};
struct pending_block {
@@ -180,6 +182,14 @@ static void log_end_io(struct bio *bio)
bio_put(bio);
}
+static void log_end_super(struct bio *bio)
+{
+ struct log_writes_c *lc = bio->bi_private;
+
+ complete(&lc->super_done);
+ log_end_io(bio);
+}
+
/*
* Meant to be called if there is an error, it will free all the pages
* associated with the block.
@@ -215,7 +225,8 @@ static int write_metadata(struct log_writes_c *lc, void *entry,
bio->bi_iter.bi_size = 0;
bio->bi_iter.bi_sector = sector;
bio_set_dev(bio, lc->logdev->bdev);
- bio->bi_end_io = log_end_io;
+ bio->bi_end_io = (sector == WRITE_LOG_SUPER_SECTOR) ?
+ log_end_super : log_end_io;
bio->bi_private = lc;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
@@ -418,11 +429,18 @@ static int log_super(struct log_writes_c *lc)
super.nr_entries = cpu_to_le64(lc->logged_entries);
super.sectorsize = cpu_to_le32(lc->sectorsize);
- if (write_metadata(lc, &super, sizeof(super), NULL, 0, 0)) {
+ if (write_metadata(lc, &super, sizeof(super), NULL, 0,
+ WRITE_LOG_SUPER_SECTOR)) {
DMERR("Couldn't write super");
return -1;
}
+ /*
+ * Super sector should be written in-order, otherwise the
+ * nr_entries could be rewritten incorrectly by an old bio.
+ */
+ wait_for_completion_io(&lc->super_done);
+
return 0;
}
@@ -531,6 +549,7 @@ static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
INIT_LIST_HEAD(&lc->unflushed_blocks);
INIT_LIST_HEAD(&lc->logging_blocks);
init_waitqueue_head(&lc->wait);
+ init_completion(&lc->super_done);
atomic_set(&lc->io_blocks, 0);
atomic_set(&lc->pending_blocks, 0);
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 2ee5e357a0a7..dbcc1e41cd57 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -544,8 +544,23 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
return DM_MAPIO_REMAPPED;
}
-static void multipath_release_clone(struct request *clone)
+static void multipath_release_clone(struct request *clone,
+ union map_info *map_context)
{
+ if (unlikely(map_context)) {
+ /*
+ * non-NULL map_context means caller is still map
+ * method; must undo multipath_clone_and_map()
+ */
+ struct dm_mpath_io *mpio = get_mpio(map_context);
+ struct pgpath *pgpath = mpio->pgpath;
+
+ if (pgpath && pgpath->pg->ps.type->end_io)
+ pgpath->pg->ps.type->end_io(&pgpath->pg->ps,
+ &pgpath->path,
+ mpio->nr_bytes);
+ }
+
blk_put_request(clone);
}
@@ -882,6 +897,7 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
if (attached_handler_name || m->hw_handler_name) {
INIT_DELAYED_WORK(&p->activate_path, activate_path_work);
r = setup_scsi_dh(p->path.dev->bdev, m, &attached_handler_name, &ti->error);
+ kfree(attached_handler_name);
if (r) {
dm_put_device(ti, p->path.dev);
goto bad;
@@ -896,7 +912,6 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
return p;
bad:
- kfree(attached_handler_name);
free_pgpath(p);
return ERR_PTR(r);
}
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index b66745bd08bb..5f7063f05ae0 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -168,7 +168,7 @@ static void dm_end_request(struct request *clone, blk_status_t error)
struct request *rq = tio->orig;
blk_rq_unprep_clone(clone);
- tio->ti->type->release_clone_rq(clone);
+ tio->ti->type->release_clone_rq(clone, NULL);
rq_end_stats(md, rq);
blk_mq_end_request(rq, error);
@@ -201,7 +201,7 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_
rq_end_stats(md, rq);
if (tio->clone) {
blk_rq_unprep_clone(tio->clone);
- tio->ti->type->release_clone_rq(tio->clone);
+ tio->ti->type->release_clone_rq(tio->clone, NULL);
}
dm_mq_delay_requeue_request(rq, delay_ms);
@@ -398,7 +398,7 @@ static int map_request(struct dm_rq_target_io *tio)
case DM_MAPIO_REMAPPED:
if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
/* -ENOMEM */
- ti->type->release_clone_rq(clone);
+ ti->type->release_clone_rq(clone, &tio->info);
return DM_MAPIO_REQUEUE;
}
@@ -408,7 +408,7 @@ static int map_request(struct dm_rq_target_io *tio)
ret = dm_dispatch_clone_request(clone, rq);
if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
blk_rq_unprep_clone(clone);
- tio->ti->type->release_clone_rq(clone);
+ tio->ti->type->release_clone_rq(clone, &tio->info);
tio->clone = NULL;
return DM_MAPIO_REQUEUE;
}
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index a168963b757d..3107f2b1988b 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -13,6 +13,7 @@
#include <linux/init.h>
#include <linux/kdev_t.h>
#include <linux/list.h>
+#include <linux/list_bl.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -44,11 +45,11 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
struct dm_exception_table {
uint32_t hash_mask;
unsigned hash_shift;
- struct list_head *table;
+ struct hlist_bl_head *table;
};
struct dm_snapshot {
- struct mutex lock;
+ struct rw_semaphore lock;
struct dm_dev *origin;
struct dm_dev *cow;
@@ -76,7 +77,9 @@ struct dm_snapshot {
atomic_t pending_exceptions_count;
- /* Protected by "lock" */
+ spinlock_t pe_allocation_lock;
+
+ /* Protected by "pe_allocation_lock" */
sector_t exception_start_sequence;
/* Protected by kcopyd single-threaded callback */
@@ -457,9 +460,9 @@ static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
continue;
- mutex_lock(&s->lock);
+ down_read(&s->lock);
active = s->active;
- mutex_unlock(&s->lock);
+ up_read(&s->lock);
if (active) {
if (snap_src)
@@ -618,6 +621,36 @@ static void unregister_snapshot(struct dm_snapshot *s)
* The lowest hash_shift bits of the chunk number are ignored, allowing
* some consecutive chunks to be grouped together.
*/
+static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk);
+
+/* Lock to protect access to the completed and pending exception hash tables. */
+struct dm_exception_table_lock {
+ struct hlist_bl_head *complete_slot;
+ struct hlist_bl_head *pending_slot;
+};
+
+static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk,
+ struct dm_exception_table_lock *lock)
+{
+ struct dm_exception_table *complete = &s->complete;
+ struct dm_exception_table *pending = &s->pending;
+
+ lock->complete_slot = &complete->table[exception_hash(complete, chunk)];
+ lock->pending_slot = &pending->table[exception_hash(pending, chunk)];
+}
+
+static void dm_exception_table_lock(struct dm_exception_table_lock *lock)
+{
+ hlist_bl_lock(lock->complete_slot);
+ hlist_bl_lock(lock->pending_slot);
+}
+
+static void dm_exception_table_unlock(struct dm_exception_table_lock *lock)
+{
+ hlist_bl_unlock(lock->pending_slot);
+ hlist_bl_unlock(lock->complete_slot);
+}
+
static int dm_exception_table_init(struct dm_exception_table *et,
uint32_t size, unsigned hash_shift)
{
@@ -625,12 +658,12 @@ static int dm_exception_table_init(struct dm_exception_table *et,
et->hash_shift = hash_shift;
et->hash_mask = size - 1;
- et->table = dm_vcalloc(size, sizeof(struct list_head));
+ et->table = dm_vcalloc(size, sizeof(struct hlist_bl_head));
if (!et->table)
return -ENOMEM;
for (i = 0; i < size; i++)
- INIT_LIST_HEAD(et->table + i);
+ INIT_HLIST_BL_HEAD(et->table + i);
return 0;
}
@@ -638,15 +671,16 @@ static int dm_exception_table_init(struct dm_exception_table *et,
static void dm_exception_table_exit(struct dm_exception_table *et,
struct kmem_cache *mem)
{
- struct list_head *slot;
- struct dm_exception *ex, *next;
+ struct hlist_bl_head *slot;
+ struct dm_exception *ex;
+ struct hlist_bl_node *pos, *n;
int i, size;
size = et->hash_mask + 1;
for (i = 0; i < size; i++) {
slot = et->table + i;
- list_for_each_entry_safe (ex, next, slot, hash_list)
+ hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list)
kmem_cache_free(mem, ex);
}
@@ -660,7 +694,7 @@ static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
static void dm_remove_exception(struct dm_exception *e)
{
- list_del(&e->hash_list);
+ hlist_bl_del(&e->hash_list);
}
/*
@@ -670,11 +704,12 @@ static void dm_remove_exception(struct dm_exception *e)
static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
chunk_t chunk)
{
- struct list_head *slot;
+ struct hlist_bl_head *slot;
+ struct hlist_bl_node *pos;
struct dm_exception *e;
slot = &et->table[exception_hash(et, chunk)];
- list_for_each_entry (e, slot, hash_list)
+ hlist_bl_for_each_entry(e, pos, slot, hash_list)
if (chunk >= e->old_chunk &&
chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
return e;
@@ -721,7 +756,8 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
static void dm_insert_exception(struct dm_exception_table *eh,
struct dm_exception *new_e)
{
- struct list_head *l;
+ struct hlist_bl_head *l;
+ struct hlist_bl_node *pos;
struct dm_exception *e = NULL;
l = &eh->table[exception_hash(eh, new_e->old_chunk)];
@@ -731,7 +767,7 @@ static void dm_insert_exception(struct dm_exception_table *eh,
goto out;
/* List is ordered by old_chunk */
- list_for_each_entry_reverse(e, l, hash_list) {
+ hlist_bl_for_each_entry(e, pos, l, hash_list) {
/* Insert after an existing chunk? */
if (new_e->old_chunk == (e->old_chunk +
dm_consecutive_chunk_count(e) + 1) &&
@@ -752,12 +788,24 @@ static void dm_insert_exception(struct dm_exception_table *eh,
return;
}
- if (new_e->old_chunk > e->old_chunk)
+ if (new_e->old_chunk < e->old_chunk)
break;
}
out:
- list_add(&new_e->hash_list, e ? &e->hash_list : l);
+ if (!e) {
+ /*
+ * Either the table doesn't support consecutive chunks or slot
+ * l is empty.
+ */
+ hlist_bl_add_head(&new_e->hash_list, l);
+ } else if (new_e->old_chunk < e->old_chunk) {
+ /* Add before an existing exception */
+ hlist_bl_add_before(&new_e->hash_list, &e->hash_list);
+ } else {
+ /* Add to l's tail: e is the last exception in this slot */
+ hlist_bl_add_behind(&new_e->hash_list, &e->hash_list);
+ }
}
/*
@@ -766,6 +814,7 @@ out:
*/
static int dm_add_exception(void *context, chunk_t old, chunk_t new)
{
+ struct dm_exception_table_lock lock;
struct dm_snapshot *s = context;
struct dm_exception *e;
@@ -778,7 +827,17 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new)
/* Consecutive_count is implicitly initialised to zero */
e->new_chunk = new;
+ /*
+ * Although there is no need to lock access to the exception tables
+ * here, if we don't then hlist_bl_add_head(), called by
+ * dm_insert_exception(), will complain about accessing the
+ * corresponding list without locking it first.
+ */
+ dm_exception_table_lock_init(s, old, &lock);
+
+ dm_exception_table_lock(&lock);
dm_insert_exception(&s->complete, e);
+ dm_exception_table_unlock(&lock);
return 0;
}
@@ -807,7 +866,7 @@ static int calc_max_buckets(void)
{
/* use a fixed size of 2MB */
unsigned long mem = 2 * 1024 * 1024;
- mem /= sizeof(struct list_head);
+ mem /= sizeof(struct hlist_bl_head);
return mem;
}
@@ -927,7 +986,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s)
int r;
chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
- mutex_lock(&s->lock);
+ down_write(&s->lock);
/*
* Process chunks (and associated exceptions) in reverse order
@@ -942,7 +1001,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s)
b = __release_queued_bios_after_merge(s);
out:
- mutex_unlock(&s->lock);
+ up_write(&s->lock);
if (b)
flush_bios(b);
@@ -1001,9 +1060,9 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
if (linear_chunks < 0) {
DMERR("Read error in exception store: "
"shutting down merge");
- mutex_lock(&s->lock);
+ down_write(&s->lock);
s->merge_failed = 1;
- mutex_unlock(&s->lock);
+ up_write(&s->lock);
}
goto shut;
}
@@ -1044,10 +1103,10 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s)
previous_count = read_pending_exceptions_done_count();
}
- mutex_lock(&s->lock);
+ down_write(&s->lock);
s->first_merging_chunk = old_chunk;
s->num_merging_chunks = linear_chunks;
- mutex_unlock(&s->lock);
+ up_write(&s->lock);
/* Wait until writes to all 'linear_chunks' drain */
for (i = 0; i < linear_chunks; i++)
@@ -1089,10 +1148,10 @@ static void merge_callback(int read_err, unsigned long write_err, void *context)
return;
shut:
- mutex_lock(&s->lock);
+ down_write(&s->lock);
s->merge_failed = 1;
b = __release_queued_bios_after_merge(s);
- mutex_unlock(&s->lock);
+ up_write(&s->lock);
error_bios(b);
merge_shutdown(s);
@@ -1188,10 +1247,11 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
s->snapshot_overflowed = 0;
s->active = 0;
atomic_set(&s->pending_exceptions_count, 0);
+ spin_lock_init(&s->pe_allocation_lock);
s->exception_start_sequence = 0;
s->exception_complete_sequence = 0;
s->out_of_order_tree = RB_ROOT;
- mutex_init(&s->lock);
+ init_rwsem(&s->lock);
INIT_LIST_HEAD(&s->list);
spin_lock_init(&s->pe_lock);
s->state_bits = 0;
@@ -1357,9 +1417,9 @@ static void snapshot_dtr(struct dm_target *ti)
/* Check whether exception handover must be cancelled */
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
if (snap_src && snap_dest && (s == snap_src)) {
- mutex_lock(&snap_dest->lock);
+ down_write(&snap_dest->lock);
snap_dest->valid = 0;
- mutex_unlock(&snap_dest->lock);
+ up_write(&snap_dest->lock);
DMERR("Cancelling snapshot handover.");
}
up_read(&_origins_lock);
@@ -1390,8 +1450,6 @@ static void snapshot_dtr(struct dm_target *ti)
dm_exception_store_destroy(s->store);
- mutex_destroy(&s->lock);
-
dm_put_device(ti, s->cow);
dm_put_device(ti, s->origin);
@@ -1467,6 +1525,13 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err)
dm_table_event(s->ti->table);
}
+static void invalidate_snapshot(struct dm_snapshot *s, int err)
+{
+ down_write(&s->lock);
+ __invalidate_snapshot(s, err);
+ up_write(&s->lock);
+}
+
static void pending_complete(void *context, int success)
{
struct dm_snap_pending_exception *pe = context;
@@ -1475,43 +1540,63 @@ static void pending_complete(void *context, int success)
struct bio *origin_bios = NULL;
struct bio *snapshot_bios = NULL;
struct bio *full_bio = NULL;
+ struct dm_exception_table_lock lock;
int error = 0;
+ dm_exception_table_lock_init(s, pe->e.old_chunk, &lock);
+
if (!success) {
/* Read/write error - snapshot is unusable */
- mutex_lock(&s->lock);
- __invalidate_snapshot(s, -EIO);
+ invalidate_snapshot(s, -EIO);
error = 1;
+
+ dm_exception_table_lock(&lock);
goto out;
}
e = alloc_completed_exception(GFP_NOIO);
if (!e) {
- mutex_lock(&s->lock);
- __invalidate_snapshot(s, -ENOMEM);
+ invalidate_snapshot(s, -ENOMEM);
error = 1;
+
+ dm_exception_table_lock(&lock);
goto out;
}
*e = pe->e;
- mutex_lock(&s->lock);
+ down_read(&s->lock);
+ dm_exception_table_lock(&lock);
if (!s->valid) {
+ up_read(&s->lock);
free_completed_exception(e);
error = 1;
+
goto out;
}
- /* Check for conflicting reads */
- __check_for_conflicting_io(s, pe->e.old_chunk);
-
/*
- * Add a proper exception, and remove the
- * in-flight exception from the list.
+ * Add a proper exception. After inserting the completed exception all
+ * subsequent snapshot reads to this chunk will be redirected to the
+ * COW device. This ensures that we do not starve. Moreover, as long
+ * as the pending exception exists, neither origin writes nor snapshot
+ * merging can overwrite the chunk in origin.
*/
dm_insert_exception(&s->complete, e);
+ up_read(&s->lock);
+
+ /* Wait for conflicting reads to drain */
+ if (__chunk_is_tracked(s, pe->e.old_chunk)) {
+ dm_exception_table_unlock(&lock);
+ __check_for_conflicting_io(s, pe->e.old_chunk);
+ dm_exception_table_lock(&lock);
+ }
out:
+ /* Remove the in-flight exception from the list */
dm_remove_exception(&pe->e);
+
+ dm_exception_table_unlock(&lock);
+
snapshot_bios = bio_list_get(&pe->snapshot_bios);
origin_bios = bio_list_get(&pe->origin_bios);
full_bio = pe->full_bio;
@@ -1519,8 +1604,6 @@ out:
full_bio->bi_end_io = pe->full_bio_end_io;
increment_pending_exceptions_done_count();
- mutex_unlock(&s->lock);
-
/* Submit any pending write bios */
if (error) {
if (full_bio)
@@ -1660,43 +1743,59 @@ __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
}
/*
- * Looks to see if this snapshot already has a pending exception
- * for this chunk, otherwise it allocates a new one and inserts
- * it into the pending table.
+ * Inserts a pending exception into the pending table.
*
- * NOTE: a write lock must be held on snap->lock before calling
- * this.
+ * NOTE: a write lock must be held on the chunk's pending exception table slot
+ * before calling this.
*/
static struct dm_snap_pending_exception *
-__find_pending_exception(struct dm_snapshot *s,
- struct dm_snap_pending_exception *pe, chunk_t chunk)
+__insert_pending_exception(struct dm_snapshot *s,
+ struct dm_snap_pending_exception *pe, chunk_t chunk)
{
- struct dm_snap_pending_exception *pe2;
-
- pe2 = __lookup_pending_exception(s, chunk);
- if (pe2) {
- free_pending_exception(pe);
- return pe2;
- }
-
pe->e.old_chunk = chunk;
bio_list_init(&pe->origin_bios);
bio_list_init(&pe->snapshot_bios);
pe->started = 0;
pe->full_bio = NULL;
+ spin_lock(&s->pe_allocation_lock);
if (s->store->type->prepare_exception(s->store, &pe->e)) {
+ spin_unlock(&s->pe_allocation_lock);
free_pending_exception(pe);
return NULL;
}
pe->exception_sequence = s->exception_start_sequence++;
+ spin_unlock(&s->pe_allocation_lock);
dm_insert_exception(&s->pending, &pe->e);
return pe;
}
+/*
+ * Looks to see if this snapshot already has a pending exception
+ * for this chunk, otherwise it allocates a new one and inserts
+ * it into the pending table.
+ *
+ * NOTE: a write lock must be held on the chunk's pending exception table slot
+ * before calling this.
+ */
+static struct dm_snap_pending_exception *
+__find_pending_exception(struct dm_snapshot *s,
+ struct dm_snap_pending_exception *pe, chunk_t chunk)
+{
+ struct dm_snap_pending_exception *pe2;
+
+ pe2 = __lookup_pending_exception(s, chunk);
+ if (pe2) {
+ free_pending_exception(pe);
+ return pe2;
+ }
+
+ return __insert_pending_exception(s, pe, chunk);
+}
+
static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
struct bio *bio, chunk_t chunk)
{
@@ -1714,6 +1813,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
int r = DM_MAPIO_REMAPPED;
chunk_t chunk;
struct dm_snap_pending_exception *pe = NULL;
+ struct dm_exception_table_lock lock;
init_tracked_chunk(bio);
@@ -1723,13 +1823,15 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
}
chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
+ dm_exception_table_lock_init(s, chunk, &lock);
/* Full snapshots are not usable */
/* To get here the table must be live so s->active is always set. */
if (!s->valid)
return DM_MAPIO_KILL;
- mutex_lock(&s->lock);
+ down_read(&s->lock);
+ dm_exception_table_lock(&lock);
if (!s->valid || (unlikely(s->snapshot_overflowed) &&
bio_data_dir(bio) == WRITE)) {
@@ -1752,15 +1854,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
if (bio_data_dir(bio) == WRITE) {
pe = __lookup_pending_exception(s, chunk);
if (!pe) {
- mutex_unlock(&s->lock);
+ dm_exception_table_unlock(&lock);
pe = alloc_pending_exception(s);
- mutex_lock(&s->lock);
-
- if (!s->valid || s->snapshot_overflowed) {
- free_pending_exception(pe);
- r = DM_MAPIO_KILL;
- goto out_unlock;
- }
+ dm_exception_table_lock(&lock);
e = dm_lookup_exception(&s->complete, chunk);
if (e) {
@@ -1771,13 +1867,22 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
pe = __find_pending_exception(s, pe, chunk);
if (!pe) {
+ dm_exception_table_unlock(&lock);
+ up_read(&s->lock);
+
+ down_write(&s->lock);
+
if (s->store->userspace_supports_overflow) {
- s->snapshot_overflowed = 1;
- DMERR("Snapshot overflowed: Unable to allocate exception.");
+ if (s->valid && !s->snapshot_overflowed) {
+ s->snapshot_overflowed = 1;
+ DMERR("Snapshot overflowed: Unable to allocate exception.");
+ }
} else
__invalidate_snapshot(s, -ENOMEM);
+ up_write(&s->lock);
+
r = DM_MAPIO_KILL;
- goto out_unlock;
+ goto out;
}
}
@@ -1789,7 +1894,10 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
bio->bi_iter.bi_size ==
(s->store->chunk_size << SECTOR_SHIFT)) {
pe->started = 1;
- mutex_unlock(&s->lock);
+
+ dm_exception_table_unlock(&lock);
+ up_read(&s->lock);
+
start_full_bio(pe, bio);
goto out;
}
@@ -1797,9 +1905,12 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
bio_list_add(&pe->snapshot_bios, bio);
if (!pe->started) {
- /* this is protected by snap->lock */
+ /* this is protected by the exception table lock */
pe->started = 1;
- mutex_unlock(&s->lock);
+
+ dm_exception_table_unlock(&lock);
+ up_read(&s->lock);
+
start_copy(pe);
goto out;
}
@@ -1809,7 +1920,8 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio)
}
out_unlock:
- mutex_unlock(&s->lock);
+ dm_exception_table_unlock(&lock);
+ up_read(&s->lock);
out:
return r;
}
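With the per-chunk exception table locks in place, s->lock becomes a rw_semaphore that is taken for read on the common I/O path and promoted to a write lock only when snapshot-wide state changes (invalidation, overflow). A condensed sketch of the ordering snapshot_map() now follows; the function name is hypothetical, the locking calls are the ones used above:

static void snapshot_map_locking_sketch(struct dm_snapshot *s, chunk_t chunk)
{
	struct dm_exception_table_lock lock;

	dm_exception_table_lock_init(s, chunk, &lock);

	down_read(&s->lock);             /* snapshot-wide state, shared      */
	dm_exception_table_lock(&lock);  /* only this chunk's hash buckets   */
	/* ... look up or insert exceptions for 'chunk' ... */
	dm_exception_table_unlock(&lock);
	up_read(&s->lock);

	/* error paths that must change snapshot-wide state drop both locks
	 * first, then retake the semaphore exclusively: */
	down_write(&s->lock);
	/* e.g. __invalidate_snapshot(s, -ENOMEM) or set snapshot_overflowed */
	up_write(&s->lock);
}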
@@ -1845,7 +1957,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
- mutex_lock(&s->lock);
+ down_write(&s->lock);
/* Full merging snapshots are redirected to the origin */
if (!s->valid)
@@ -1876,12 +1988,12 @@ redirect_to_origin:
bio_set_dev(bio, s->origin->bdev);
if (bio_data_dir(bio) == WRITE) {
- mutex_unlock(&s->lock);
+ up_write(&s->lock);
return do_origin(s->origin, bio);
}
out_unlock:
- mutex_unlock(&s->lock);
+ up_write(&s->lock);
return r;
}
@@ -1913,7 +2025,7 @@ static int snapshot_preresume(struct dm_target *ti)
down_read(&_origins_lock);
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
if (snap_src && snap_dest) {
- mutex_lock(&snap_src->lock);
+ down_read(&snap_src->lock);
if (s == snap_src) {
DMERR("Unable to resume snapshot source until "
"handover completes.");
@@ -1923,7 +2035,7 @@ static int snapshot_preresume(struct dm_target *ti)
"source is suspended.");
r = -EINVAL;
}
- mutex_unlock(&snap_src->lock);
+ up_read(&snap_src->lock);
}
up_read(&_origins_lock);
@@ -1969,11 +2081,11 @@ static void snapshot_resume(struct dm_target *ti)
(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
if (snap_src && snap_dest) {
- mutex_lock(&snap_src->lock);
- mutex_lock_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
+ down_write(&snap_src->lock);
+ down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
__handover_exceptions(snap_src, snap_dest);
- mutex_unlock(&snap_dest->lock);
- mutex_unlock(&snap_src->lock);
+ up_write(&snap_dest->lock);
+ up_write(&snap_src->lock);
}
up_read(&_origins_lock);
@@ -1988,9 +2100,9 @@ static void snapshot_resume(struct dm_target *ti)
/* Now we have correct chunk size, reregister */
reregister_snapshot(s);
- mutex_lock(&s->lock);
+ down_write(&s->lock);
s->active = 1;
- mutex_unlock(&s->lock);
+ up_write(&s->lock);
}
static uint32_t get_origin_minimum_chunksize(struct block_device *bdev)
@@ -2030,7 +2142,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type,
switch (type) {
case STATUSTYPE_INFO:
- mutex_lock(&snap->lock);
+ down_write(&snap->lock);
if (!snap->valid)
DMEMIT("Invalid");
@@ -2055,7 +2167,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type,
DMEMIT("Unknown");
}
- mutex_unlock(&snap->lock);
+ up_write(&snap->lock);
break;
@@ -2107,9 +2219,10 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
int r = DM_MAPIO_REMAPPED;
struct dm_snapshot *snap;
struct dm_exception *e;
- struct dm_snap_pending_exception *pe;
+ struct dm_snap_pending_exception *pe, *pe2;
struct dm_snap_pending_exception *pe_to_start_now = NULL;
struct dm_snap_pending_exception *pe_to_start_last = NULL;
+ struct dm_exception_table_lock lock;
chunk_t chunk;
/* Do all the snapshots on this origin */
@@ -2121,52 +2234,59 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
if (dm_target_is_snapshot_merge(snap->ti))
continue;
- mutex_lock(&snap->lock);
-
- /* Only deal with valid and active snapshots */
- if (!snap->valid || !snap->active)
- goto next_snapshot;
-
/* Nothing to do if writing beyond end of snapshot */
if (sector >= dm_table_get_size(snap->ti->table))
- goto next_snapshot;
+ continue;
/*
* Remember, different snapshots can have
* different chunk sizes.
*/
chunk = sector_to_chunk(snap->store, sector);
+ dm_exception_table_lock_init(snap, chunk, &lock);
- /*
- * Check exception table to see if block
- * is already remapped in this snapshot
- * and trigger an exception if not.
- */
- e = dm_lookup_exception(&snap->complete, chunk);
- if (e)
+ down_read(&snap->lock);
+ dm_exception_table_lock(&lock);
+
+ /* Only deal with valid and active snapshots */
+ if (!snap->valid || !snap->active)
goto next_snapshot;
pe = __lookup_pending_exception(snap, chunk);
if (!pe) {
- mutex_unlock(&snap->lock);
- pe = alloc_pending_exception(snap);
- mutex_lock(&snap->lock);
-
- if (!snap->valid) {
- free_pending_exception(pe);
- goto next_snapshot;
- }
-
+ /*
+ * Check exception table to see if block is already
+ * remapped in this snapshot and trigger an exception
+ * if not.
+ */
e = dm_lookup_exception(&snap->complete, chunk);
- if (e) {
- free_pending_exception(pe);
+ if (e)
goto next_snapshot;
- }
- pe = __find_pending_exception(snap, pe, chunk);
- if (!pe) {
- __invalidate_snapshot(snap, -ENOMEM);
- goto next_snapshot;
+ dm_exception_table_unlock(&lock);
+ pe = alloc_pending_exception(snap);
+ dm_exception_table_lock(&lock);
+
+ pe2 = __lookup_pending_exception(snap, chunk);
+
+ if (!pe2) {
+ e = dm_lookup_exception(&snap->complete, chunk);
+ if (e) {
+ free_pending_exception(pe);
+ goto next_snapshot;
+ }
+
+ pe = __insert_pending_exception(snap, pe, chunk);
+ if (!pe) {
+ dm_exception_table_unlock(&lock);
+ up_read(&snap->lock);
+
+ invalidate_snapshot(snap, -ENOMEM);
+ continue;
+ }
+ } else {
+ free_pending_exception(pe);
+ pe = pe2;
}
}
@@ -2193,7 +2313,8 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
}
next_snapshot:
- mutex_unlock(&snap->lock);
+ dm_exception_table_unlock(&lock);
+ up_read(&snap->lock);
if (pe_to_start_now) {
start_copy(pe_to_start_now);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index cde3b49b2a91..ec8b27e20de3 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -561,7 +561,7 @@ static char **realloc_argv(unsigned *size, char **old_argv)
gfp = GFP_NOIO;
}
argv = kmalloc_array(new_size, sizeof(*argv), gfp);
- if (argv) {
+ if (argv && old_argv) {
memcpy(argv, old_argv, *size * sizeof(*argv));
*size = new_size;
}
@@ -880,13 +880,17 @@ void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type)
}
EXPORT_SYMBOL_GPL(dm_table_set_type);
+/* validate the dax capability of the target device span */
static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
+ sector_t start, sector_t len, void *data)
{
- return bdev_dax_supported(dev->bdev, PAGE_SIZE);
+ int blocksize = *(int *) data;
+
+ return generic_fsdax_supported(dev->dax_dev, dev->bdev, blocksize,
+ start, len);
}
-static bool dm_table_supports_dax(struct dm_table *t)
+bool dm_table_supports_dax(struct dm_table *t, int blocksize)
{
struct dm_target *ti;
unsigned i;
@@ -899,7 +903,8 @@ static bool dm_table_supports_dax(struct dm_table *t)
return false;
if (!ti->type->iterate_devices ||
- !ti->type->iterate_devices(ti, device_supports_dax, NULL))
+ !ti->type->iterate_devices(ti, device_supports_dax,
+ &blocksize))
return false;
}
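dm_table_supports_dax() now threads the filesystem block size through the generic iterate_devices mechanism: the opaque data pointer carries &blocksize, and each underlying device is checked with generic_fsdax_supported() over exactly the span the target maps. A condensed reconstruction of both halves, assuming the usual iterate_devices callback signature; the *_sketch and dax_check_cb names are hypothetical:

static int dax_check_cb(struct dm_target *ti, struct dm_dev *dev,
			sector_t start, sector_t len, void *data)
{
	int blocksize = *(int *)data;

	/* non-zero only if [start, start + len) of this device supports DAX */
	return generic_fsdax_supported(dev->dax_dev, dev->bdev, blocksize,
				       start, len);
}

static bool table_supports_dax_sketch(struct dm_table *t, int blocksize)
{
	struct dm_target *ti;
	unsigned i;

	for (i = 0; i < dm_table_get_num_targets(t); i++) {
		ti = dm_table_get_target(t, i);

		if (!ti->type->direct_access)
			return false;
		if (!ti->type->iterate_devices ||
		    !ti->type->iterate_devices(ti, dax_check_cb, &blocksize))
			return false;
	}

	return true;
}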
@@ -979,7 +984,7 @@ static int dm_table_determine_type(struct dm_table *t)
verify_bio_based:
/* We must use this table as bio-based */
t->type = DM_TYPE_BIO_BASED;
- if (dm_table_supports_dax(t) ||
+ if (dm_table_supports_dax(t, PAGE_SIZE) ||
(list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) {
t->type = DM_TYPE_DAX_BIO_BASED;
} else {
@@ -1905,7 +1910,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
}
blk_queue_write_cache(q, wc, fua);
- if (dm_table_supports_dax(t))
+ if (dm_table_supports_dax(t, PAGE_SIZE))
blk_queue_flag_set(QUEUE_FLAG_DAX, q);
else
blk_queue_flag_clear(QUEUE_FLAG_DAX, q);
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
index 314d17ca6466..64dd0b34fcf4 100644
--- a/drivers/md/dm-target.c
+++ b/drivers/md/dm-target.c
@@ -136,7 +136,8 @@ static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq,
return DM_MAPIO_KILL;
}
-static void io_err_release_clone_rq(struct request *clone)
+static void io_err_release_clone_rq(struct request *clone,
+ union map_info *map_context)
{
}
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index ed3caceaed07..7f0840601737 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -202,6 +202,13 @@ struct dm_pool_metadata {
bool fail_io:1;
/*
+ * Set once a thin-pool has been accessed through one of the interfaces
+ * that imply the pool is in-service (e.g. thin devices created/deleted,
+ * thin-pool message, metadata snapshots, etc).
+ */
+ bool in_service:1;
+
+ /*
* Reading the space map roots can fail, so we read it into these
* buffers before the superblock is locked and updated.
*/
@@ -367,6 +374,32 @@ static int subtree_equal(void *context, const void *value1_le, const void *value
/*----------------------------------------------------------------*/
+/*
+ * Variant that is used for in-core only changes or code that
+ * shouldn't put the pool in service on its own (e.g. commit).
+ */
+static inline void __pmd_write_lock(struct dm_pool_metadata *pmd)
+ __acquires(pmd->root_lock)
+{
+ down_write(&pmd->root_lock);
+}
+#define pmd_write_lock_in_core(pmd) __pmd_write_lock((pmd))
+
+static inline void pmd_write_lock(struct dm_pool_metadata *pmd)
+{
+ __pmd_write_lock(pmd);
+ if (unlikely(!pmd->in_service))
+ pmd->in_service = true;
+}
+
+static inline void pmd_write_unlock(struct dm_pool_metadata *pmd)
+ __releases(pmd->root_lock)
+{
+ up_write(&pmd->root_lock);
+}
+
+/*----------------------------------------------------------------*/
+
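The two flavours exist so that administrative paths which do not really use the pool (opening or closing a thin device, flipping read-only mode, registering a threshold callback) never mark an idle pool as in-service, while genuinely modifying operations do. A short usage sketch; the *_sketch function names are hypothetical, the locking calls and helpers are from this file:

static int create_thin_sketch(struct dm_pool_metadata *pmd, dm_thin_id dev)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);            /* sets pmd->in_service */
	if (!pmd->fail_io)
		r = __create_thin(pmd, dev);
	pmd_write_unlock(pmd);

	return r;
}

static void set_read_only_sketch(struct dm_pool_metadata *pmd)
{
	pmd_write_lock_in_core(pmd);    /* in-core only, in_service untouched */
	dm_bm_set_read_only(pmd->bm);
	pmd_write_unlock(pmd);
}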
static int superblock_lock_zero(struct dm_pool_metadata *pmd,
struct dm_block **sblock)
{
@@ -790,6 +823,9 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
*/
BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
+ if (unlikely(!pmd->in_service))
+ return 0;
+
r = __write_changed_details(pmd);
if (r < 0)
return r;
@@ -853,6 +889,7 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
pmd->time = 0;
INIT_LIST_HEAD(&pmd->thin_devices);
pmd->fail_io = false;
+ pmd->in_service = false;
pmd->bdev = bdev;
pmd->data_block_size = data_block_size;
@@ -903,7 +940,6 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
DMWARN("%s: __commit_transaction() failed, error = %d",
__func__, r);
}
-
if (!pmd->fail_io)
__destroy_persistent_data_objects(pmd);
@@ -1032,10 +1068,10 @@ int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
if (!pmd->fail_io)
r = __create_thin(pmd, dev);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1123,10 +1159,10 @@ int dm_pool_create_snap(struct dm_pool_metadata *pmd,
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
if (!pmd->fail_io)
r = __create_snap(pmd, dev, origin);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1166,10 +1202,10 @@ int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
if (!pmd->fail_io)
r = __delete_device(pmd, dev);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1180,7 +1216,7 @@ int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
if (pmd->fail_io)
goto out;
@@ -1194,7 +1230,7 @@ int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
r = 0;
out:
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1225,7 +1261,12 @@ static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
* We commit to ensure the btree roots which we increment in a
* moment are up to date.
*/
- __commit_transaction(pmd);
+ r = __commit_transaction(pmd);
+ if (r < 0) {
+ DMWARN("%s: __commit_transaction() failed, error = %d",
+ __func__, r);
+ return r;
+ }
/*
* Copy the superblock.
@@ -1283,10 +1324,10 @@ int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
if (!pmd->fail_io)
r = __reserve_metadata_snap(pmd);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1331,10 +1372,10 @@ int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
if (!pmd->fail_io)
r = __release_metadata_snap(pmd);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1377,19 +1418,19 @@ int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock_in_core(pmd);
if (!pmd->fail_io)
r = __open_device(pmd, dev, 0, td);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
int dm_pool_close_thin_device(struct dm_thin_device *td)
{
- down_write(&td->pmd->root_lock);
+ pmd_write_lock_in_core(td->pmd);
__close_device(td);
- up_write(&td->pmd->root_lock);
+ pmd_write_unlock(td->pmd);
return 0;
}
@@ -1570,10 +1611,10 @@ int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
{
int r = -EINVAL;
- down_write(&td->pmd->root_lock);
+ pmd_write_lock(td->pmd);
if (!td->pmd->fail_io)
r = __insert(td, block, data_block);
- up_write(&td->pmd->root_lock);
+ pmd_write_unlock(td->pmd);
return r;
}
@@ -1657,10 +1698,10 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
{
int r = -EINVAL;
- down_write(&td->pmd->root_lock);
+ pmd_write_lock(td->pmd);
if (!td->pmd->fail_io)
r = __remove(td, block);
- up_write(&td->pmd->root_lock);
+ pmd_write_unlock(td->pmd);
return r;
}
@@ -1670,10 +1711,10 @@ int dm_thin_remove_range(struct dm_thin_device *td,
{
int r = -EINVAL;
- down_write(&td->pmd->root_lock);
+ pmd_write_lock(td->pmd);
if (!td->pmd->fail_io)
r = __remove_range(td, begin, end);
- up_write(&td->pmd->root_lock);
+ pmd_write_unlock(td->pmd);
return r;
}
@@ -1696,13 +1737,13 @@ int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_
{
int r = 0;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
for (; b != e; b++) {
r = dm_sm_inc_block(pmd->data_sm, b);
if (r)
break;
}
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1711,13 +1752,13 @@ int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_
{
int r = 0;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
for (; b != e; b++) {
r = dm_sm_dec_block(pmd->data_sm, b);
if (r)
break;
}
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1765,10 +1806,10 @@ int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
if (!pmd->fail_io)
r = dm_sm_new_block(pmd->data_sm, result);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1777,12 +1818,16 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ /*
+ * Care is taken to ensure that a commit is never
+ * what puts the thin-pool in-service.
+ */
+ __pmd_write_lock(pmd);
if (pmd->fail_io)
goto out;
r = __commit_transaction(pmd);
- if (r <= 0)
+ if (r < 0)
goto out;
/*
@@ -1790,7 +1835,7 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
*/
r = __begin_transaction(pmd);
out:
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
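Taken together with the in_service flag, a pool that has only been loaded and periodically committed writes nothing to the metadata device: __commit_transaction() returns early while !pmd->in_service, and the commit path deliberately uses the in-core lock so the commit itself can never be what flips the flag. A condensed sketch of the resulting no-op behaviour (hypothetical function name, helpers from this file):

static int commit_sketch(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	__pmd_write_lock(pmd);          /* does not set pmd->in_service */
	if (pmd->fail_io)
		goto out;

	r = __commit_transaction(pmd);  /* returns 0 at once if !in_service */
	if (r < 0)
		goto out;
	r = __begin_transaction(pmd);
out:
	pmd_write_unlock(pmd);

	return r;
}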
@@ -1806,7 +1851,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
if (pmd->fail_io)
goto out;
@@ -1817,7 +1862,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
pmd->fail_io = true;
out:
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1948,10 +1993,10 @@ int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
if (!pmd->fail_io)
r = __resize_space_map(pmd->data_sm, new_count);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -1960,29 +2005,29 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou
{
int r = -EINVAL;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
if (!pmd->fail_io) {
r = __resize_space_map(pmd->metadata_sm, new_count);
if (!r)
__set_metadata_reserve(pmd);
}
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
{
- down_write(&pmd->root_lock);
+ pmd_write_lock_in_core(pmd);
dm_bm_set_read_only(pmd->bm);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
}
void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
{
- down_write(&pmd->root_lock);
+ pmd_write_lock_in_core(pmd);
dm_bm_set_read_write(pmd->bm);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
}
int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
@@ -1992,9 +2037,9 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
{
int r;
- down_write(&pmd->root_lock);
+ pmd_write_lock_in_core(pmd);
r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context);
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
@@ -2005,7 +2050,7 @@ int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
struct dm_block *sblock;
struct thin_disk_superblock *disk_super;
- down_write(&pmd->root_lock);
+ pmd_write_lock(pmd);
pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
r = superblock_lock(pmd, &sblock);
@@ -2019,7 +2064,7 @@ int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
dm_bm_unlock(sblock);
out:
- up_write(&pmd->root_lock);
+ pmd_write_unlock(pmd);
return r;
}
diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c
index 8efe033bab55..8671267200d8 100644
--- a/drivers/md/dm-uevent.c
+++ b/drivers/md/dm-uevent.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Device Mapper Uevent Support (dm-uevent)
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
* Copyright IBM Corporation, 2007
* Author: Mike Anderson <andmike@linux.vnet.ibm.com>
*/
diff --git a/drivers/md/dm-uevent.h b/drivers/md/dm-uevent.h
index 2eccc8bd671a..d30d226f2a18 100644
--- a/drivers/md/dm-uevent.h
+++ b/drivers/md/dm-uevent.h
@@ -1,20 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Device Mapper Uevent Support
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
* Copyright IBM Corporation, 2007
* Author: Mike Anderson <andmike@linux.vnet.ibm.com>
*/
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index b634fa23f4c4..3ceeb6b404ed 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2015 Google, Inc.
*
* Author: Sami Tolvanen <samitolvanen@google.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
*/
#include "dm-verity-fec.h"
diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h
index 6ad803b2b36c..42fbd3a7fc9f 100644
--- a/drivers/md/dm-verity-fec.h
+++ b/drivers/md/dm-verity-fec.h
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Copyright (C) 2015 Google, Inc.
*
* Author: Sami Tolvanen <samitolvanen@google.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
*/
#ifndef DM_VERITY_FEC_H
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index f4c31ffaa88e..ea24ff0612e3 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2012 Red Hat, Inc.
*
@@ -5,8 +6,6 @@
*
* Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors
*
- * This file is released under the GPLv2.
- *
* In the file "/sys/module/dm_verity/parameters/prefetch_cluster" you can set
* default prefetch value. Data are read in "prefetch_cluster" chunks from the
* hash device. Setting this greatly improves performance when data and hash
@@ -236,8 +235,8 @@ static int verity_handle_err(struct dm_verity *v, enum verity_block_type type,
BUG();
}
- DMERR("%s: %s block %llu is corrupted", v->data_dev->name, type_str,
- block);
+ DMERR_LIMIT("%s: %s block %llu is corrupted", v->data_dev->name,
+ type_str, block);
if (v->corrupted_errs == DM_VERITY_MAX_CORRUPTED_ERRS)
DMERR("%s: reached maximum errors", v->data_dev->name);
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index 3441c10b840c..eeaf940aef6d 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2012 Red Hat, Inc.
* Copyright (C) 2015 Google, Inc.
@@ -5,8 +6,6 @@
* Author: Mikulas Patocka <mpatocka@redhat.com>
*
* Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors
- *
- * This file is released under the GPLv2.
*/
#ifndef DM_VERITY_H
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index f7822875589e..1cb137f0ef9d 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -190,7 +190,6 @@ struct writeback_struct {
struct dm_writecache *wc;
struct wc_entry **wc_list;
unsigned wc_list_n;
- unsigned page_offset;
struct page *page;
struct wc_entry *wc_list_inline[WB_LIST_INLINE];
struct bio bio;
@@ -546,21 +545,20 @@ static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
e = container_of(node, struct wc_entry, rb_node);
if (read_original_sector(wc, e) == block)
break;
+
node = (read_original_sector(wc, e) >= block ?
e->rb_node.rb_left : e->rb_node.rb_right);
if (unlikely(!node)) {
- if (!(flags & WFE_RETURN_FOLLOWING)) {
+ if (!(flags & WFE_RETURN_FOLLOWING))
return NULL;
- }
if (read_original_sector(wc, e) >= block) {
- break;
+ return e;
} else {
node = rb_next(&e->rb_node);
- if (unlikely(!node)) {
+ if (unlikely(!node))
return NULL;
- }
e = container_of(node, struct wc_entry, rb_node);
- break;
+ return e;
}
}
}
@@ -571,7 +569,7 @@ static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
node = rb_prev(&e->rb_node);
else
node = rb_next(&e->rb_node);
- if (!node)
+ if (unlikely(!node))
return e;
e2 = container_of(node, struct wc_entry, rb_node);
if (read_original_sector(wc, e2) != block)
@@ -804,7 +802,7 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_
writecache_free_entry(wc, e);
}
- if (!node)
+ if (unlikely(!node))
break;
e = container_of(node, struct wc_entry, rb_node);
@@ -1478,10 +1476,9 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
wb = container_of(bio, struct writeback_struct, bio);
wb->wc = wc;
- wb->bio.bi_end_io = writecache_writeback_endio;
- bio_set_dev(&wb->bio, wc->dev->bdev);
- wb->bio.bi_iter.bi_sector = read_original_sector(wc, e);
- wb->page_offset = PAGE_SIZE;
+ bio->bi_end_io = writecache_writeback_endio;
+ bio_set_dev(bio, wc->dev->bdev);
+ bio->bi_iter.bi_sector = read_original_sector(wc, e);
if (max_pages <= WB_LIST_INLINE ||
unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
GFP_NOIO | __GFP_NORETRY |
@@ -1507,12 +1504,12 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
wb->wc_list[wb->wc_list_n++] = f;
e = f;
}
- bio_set_op_attrs(&wb->bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
+ bio_set_op_attrs(bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
if (writecache_has_error(wc)) {
bio->bi_status = BLK_STS_IOERR;
- bio_endio(&wb->bio);
+ bio_endio(bio);
} else {
- submit_bio(&wb->bio);
+ submit_bio(bio);
}
__writeback_throttle(wc, wbl);
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index fa68336560c3..d8334cd45d7c 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -1169,6 +1169,9 @@ static int dmz_init_zones(struct dmz_metadata *zmd)
goto out;
}
+ if (!nr_blkz)
+ break;
+
/* Process report */
for (i = 0; i < nr_blkz; i++) {
ret = dmz_init_zone(zmd, zone, &blkz[i]);
@@ -1204,6 +1207,8 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
/* Get zone information from disk */
ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone),
&blkz, &nr_blkz, GFP_NOIO);
+ if (!nr_blkz)
+ ret = -EIO;
if (ret) {
dmz_dev_err(zmd->dev, "Get zone %u report failed",
dmz_id(zmd, zone));
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 8865c1709e16..51d029bbb740 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -643,7 +643,8 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path)
q = bdev_get_queue(dev->bdev);
dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
- aligned_capacity = dev->capacity & ~(blk_queue_zone_sectors(q) - 1);
+ aligned_capacity = dev->capacity &
+ ~((sector_t)blk_queue_zone_sectors(q) - 1);
if (ti->begin ||
((ti->len != dev->capacity) && (ti->len != aligned_capacity))) {
ti->error = "Partial mapping not supported";
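The cast matters because blk_queue_zone_sectors() returns a 32-bit unsigned int, so the complemented mask was only 32 bits wide and silently cleared the upper half of any capacity at or above 2^32 sectors (2 TiB). A worked example with illustrative numbers:

/*
 *   dev->capacity  = 0x1_2000_0000 sectors (~2.25 TiB)
 *   zone size      = 0x80000 sectors (256 MiB zones)
 *
 * old: ~(blk_queue_zone_sectors(q) - 1)           = 0xFFF8_0000 (32-bit),
 *      0x1_2000_0000 & 0x0000_0000_FFF8_0000      = 0x2000_0000   (wrong)
 *
 * new: ~((sector_t)blk_queue_zone_sectors(q) - 1) = 0xFFFF_FFFF_FFF8_0000,
 *      0x1_2000_0000 & 0xFFFF_FFFF_FFF8_0000      = 0x1_2000_0000 (correct)
 */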
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 043f0761e4a0..5475081dcbd6 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -781,7 +781,8 @@ static void close_table_device(struct table_device *td, struct mapped_device *md
}
static struct table_device *find_table_device(struct list_head *l, dev_t dev,
- fmode_t mode) {
+ fmode_t mode)
+{
struct table_device *td;
list_for_each_entry(td, l, list)
@@ -792,7 +793,8 @@ static struct table_device *find_table_device(struct list_head *l, dev_t dev,
}
int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
- struct dm_dev **result) {
+ struct dm_dev **result)
+{
int r;
struct table_device *td;
@@ -1105,6 +1107,25 @@ static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
return ret;
}
+static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
+ int blocksize, sector_t start, sector_t len)
+{
+ struct mapped_device *md = dax_get_private(dax_dev);
+ struct dm_table *map;
+ int srcu_idx;
+ bool ret;
+
+ map = dm_get_live_table(md, &srcu_idx);
+ if (!map)
+ return false;
+
+ ret = dm_table_supports_dax(map, blocksize);
+
+ dm_put_live_table(md, srcu_idx);
+
+ return ret;
+}
+
static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
@@ -1467,7 +1488,7 @@ static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
unsigned num_bios)
{
- unsigned len = ci->sector_count;
+ unsigned len;
/*
* Even though the device advertised support for this type of
@@ -1478,6 +1499,8 @@ static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *
if (!num_bios)
return -EOPNOTSUPP;
+ len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
+
__send_duplicate_bios(ci, ti, num_bios, &len);
ci->sector += len;
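ci->sector_count still covers the remainder of the whole bio, which may span more than one target, so the length handed to __send_duplicate_bios() is clamped to the current target's boundary and the split loop picks up the rest afterwards. A worked example with illustrative numbers:

/*
 * A 500-sector discard arriving at sector 900, where the current target
 * maps sectors [0, 1000):
 *
 *   max_io_len_target_boundary(900, ti) = 1000 - 900 = 100
 *   len = min(500, 100)                 = 100
 *
 * so only 100 sectors are issued to this target now; the remaining 400
 * are processed on the following iterations of the split loop.
 */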
@@ -1906,7 +1929,6 @@ static void cleanup_mapped_device(struct mapped_device *md)
static struct mapped_device *alloc_dev(int minor)
{
int r, numa_node_id = dm_get_numa_node();
- struct dax_device *dax_dev = NULL;
struct mapped_device *md;
void *old_md;
@@ -1969,11 +1991,10 @@ static struct mapped_device *alloc_dev(int minor)
sprintf(md->disk->disk_name, "dm-%d", minor);
if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
- dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
- if (!dax_dev)
+ md->dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
+ if (!md->dax_dev)
goto bad;
}
- md->dax_dev = dax_dev;
add_disk_no_queue_reg(md->disk);
format_dev_t(md->name, MKDEV(_major, minor));
@@ -3192,6 +3213,7 @@ static const struct block_device_operations dm_blk_dops = {
static const struct dax_operations dm_dax_ops = {
.direct_access = dm_dax_direct_access,
+ .dax_supported = dm_dax_supported,
.copy_from_iter = dm_dax_copy_from_iter,
.copy_to_iter = dm_dax_copy_to_iter,
};
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 2d539b82ec08..17e3db54404c 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -72,6 +72,7 @@ bool dm_table_bio_based(struct dm_table *t);
bool dm_table_request_based(struct dm_table *t);
void dm_table_free_md_mempools(struct dm_table *t);
struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
+bool dm_table_supports_dax(struct dm_table *t, int blocksize);
void dm_lock_md_type(struct mapped_device *md);
void dm_unlock_md_type(struct mapped_device *md);
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 1cd4f991792c..c01d41198f5e 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* bitmap.c two-level bitmap (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
*
@@ -490,10 +491,10 @@ void md_bitmap_print_sb(struct bitmap *bitmap)
pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic));
pr_debug(" version: %d\n", le32_to_cpu(sb->version));
pr_debug(" uuid: %08x.%08x.%08x.%08x\n",
- le32_to_cpu(*(__u32 *)(sb->uuid+0)),
- le32_to_cpu(*(__u32 *)(sb->uuid+4)),
- le32_to_cpu(*(__u32 *)(sb->uuid+8)),
- le32_to_cpu(*(__u32 *)(sb->uuid+12)));
+ le32_to_cpu(*(__le32 *)(sb->uuid+0)),
+ le32_to_cpu(*(__le32 *)(sb->uuid+4)),
+ le32_to_cpu(*(__le32 *)(sb->uuid+8)),
+ le32_to_cpu(*(__le32 *)(sb->uuid+12)));
pr_debug(" events: %llu\n",
(unsigned long long) le64_to_cpu(sb->events));
pr_debug("events cleared: %llu\n",
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 8dff19d5502e..813a99ffa86f 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -1,11 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2015, SUSE
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
*/
diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c
index c2fdf899de14..50ad4ba86f0e 100644
--- a/drivers/md/md-faulty.c
+++ b/drivers/md/md-faulty.c
@@ -1,19 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* faulty.c : Multiple Devices driver for Linux
*
* Copyright (C) 2004 Neil Brown
*
* fautly-device-simulator personality for md
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 5998d78aa189..7354466ddc90 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
linear.c : Multiple Devices driver for Linux
Copyright (C) 1994-96 Marc ZYNGIER
@@ -6,14 +7,6 @@
Linear mode management functions.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2, or (at your option)
- any later version.
-
- You should have received a copy of the GNU General Public License
- (for example /usr/src/linux/COPYING); if not, write to the Free
- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/blkdev.h>
diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index 881487de1e25..6780938d2991 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* multipath.c : Multiple Devices driver for Linux
*
@@ -8,15 +9,6 @@
* MULTIPATH management functions.
*
* derived from raid1.c.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/blkdev.h>
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 05ffffb8b769..9801d540fea1 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
md.c : Multiple Devices driver for Linux
Copyright (C) 1998, 1999, 2000 Ingo Molnar
@@ -22,14 +23,6 @@
- persistent bitmap code
Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2, or (at your option)
- any later version.
-
- You should have received a copy of the GNU General Public License
- (for example /usr/src/linux/COPYING); if not, write to the Free
- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
Errors, Warnings, etc.
Please use:
@@ -88,8 +81,7 @@ static struct kobj_type md_ktype;
struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
-struct module *md_cluster_mod;
-EXPORT_SYMBOL(md_cluster_mod);
+static struct module *md_cluster_mod;
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
@@ -132,24 +124,6 @@ static inline int speed_max(struct mddev *mddev)
mddev->sync_speed_max : sysctl_speed_limit_max;
}
-static void * flush_info_alloc(gfp_t gfp_flags, void *data)
-{
- return kzalloc(sizeof(struct flush_info), gfp_flags);
-}
-static void flush_info_free(void *flush_info, void *data)
-{
- kfree(flush_info);
-}
-
-static void * flush_bio_alloc(gfp_t gfp_flags, void *data)
-{
- return kzalloc(sizeof(struct flush_bio), gfp_flags);
-}
-static void flush_bio_free(void *flush_bio, void *data)
-{
- kfree(flush_bio);
-}
-
static struct ctl_table_header *raid_table_header;
static struct ctl_table raid_table[] = {
@@ -423,54 +397,31 @@ static int md_congested(void *data, int bits)
/*
* Generic flush handling for md
*/
-static void submit_flushes(struct work_struct *ws)
-{
- struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
- struct mddev *mddev = fi->mddev;
- struct bio *bio = fi->bio;
- bio->bi_opf &= ~REQ_PREFLUSH;
- md_handle_request(mddev, bio);
-
- mempool_free(fi, mddev->flush_pool);
-}
-
-static void md_end_flush(struct bio *fbio)
+static void md_end_flush(struct bio *bio)
{
- struct flush_bio *fb = fbio->bi_private;
- struct md_rdev *rdev = fb->rdev;
- struct flush_info *fi = fb->fi;
- struct bio *bio = fi->bio;
- struct mddev *mddev = fi->mddev;
+ struct md_rdev *rdev = bio->bi_private;
+ struct mddev *mddev = rdev->mddev;
rdev_dec_pending(rdev, mddev);
- if (atomic_dec_and_test(&fi->flush_pending)) {
- if (bio->bi_iter.bi_size == 0) {
- /* an empty barrier - all done */
- bio_endio(bio);
- mempool_free(fi, mddev->flush_pool);
- } else {
- INIT_WORK(&fi->flush_work, submit_flushes);
- queue_work(md_wq, &fi->flush_work);
- }
+ if (atomic_dec_and_test(&mddev->flush_pending)) {
+ /* The pre-request flush has finished */
+ queue_work(md_wq, &mddev->flush_work);
}
-
- mempool_free(fb, mddev->flush_bio_pool);
- bio_put(fbio);
+ bio_put(bio);
}
-void md_flush_request(struct mddev *mddev, struct bio *bio)
+static void md_submit_flush_data(struct work_struct *ws);
+
+static void submit_flushes(struct work_struct *ws)
{
+ struct mddev *mddev = container_of(ws, struct mddev, flush_work);
struct md_rdev *rdev;
- struct flush_info *fi;
-
- fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);
-
- fi->bio = bio;
- fi->mddev = mddev;
- atomic_set(&fi->flush_pending, 1);
+ mddev->start_flush = ktime_get_boottime();
+ INIT_WORK(&mddev->flush_work, md_submit_flush_data);
+ atomic_set(&mddev->flush_pending, 1);
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
@@ -480,37 +431,74 @@ void md_flush_request(struct mddev *mddev, struct bio *bio)
* we reclaim rcu_read_lock
*/
struct bio *bi;
- struct flush_bio *fb;
atomic_inc(&rdev->nr_pending);
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
-
- fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
- fb->fi = fi;
- fb->rdev = rdev;
-
bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
- bio_set_dev(bi, rdev->bdev);
bi->bi_end_io = md_end_flush;
- bi->bi_private = fb;
+ bi->bi_private = rdev;
+ bio_set_dev(bi, rdev->bdev);
bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-
- atomic_inc(&fi->flush_pending);
+ atomic_inc(&mddev->flush_pending);
submit_bio(bi);
-
rcu_read_lock();
rdev_dec_pending(rdev, mddev);
}
rcu_read_unlock();
+ if (atomic_dec_and_test(&mddev->flush_pending))
+ queue_work(md_wq, &mddev->flush_work);
+}
- if (atomic_dec_and_test(&fi->flush_pending)) {
- if (bio->bi_iter.bi_size == 0) {
+static void md_submit_flush_data(struct work_struct *ws)
+{
+ struct mddev *mddev = container_of(ws, struct mddev, flush_work);
+ struct bio *bio = mddev->flush_bio;
+
+ /*
+ * flush_bio must be reset before calling into md_handle_request, to avoid a
+ * deadlock: other bios that have already passed md_handle_request's suspend
+ * check could be waiting for this flush to finish, while the
+ * md_handle_request call below could in turn be waiting for those bios
+ * because of that same suspend check.
+ */
+ mddev->last_flush = mddev->start_flush;
+ mddev->flush_bio = NULL;
+ wake_up(&mddev->sb_wait);
+
+ if (bio->bi_iter.bi_size == 0) {
+ /* an empty barrier - all done */
+ bio_endio(bio);
+ } else {
+ bio->bi_opf &= ~REQ_PREFLUSH;
+ md_handle_request(mddev, bio);
+ }
+}
+
+void md_flush_request(struct mddev *mddev, struct bio *bio)
+{
+ ktime_t start = ktime_get_boottime();
+ spin_lock_irq(&mddev->lock);
+ wait_event_lock_irq(mddev->sb_wait,
+ !mddev->flush_bio ||
+ ktime_after(mddev->last_flush, start),
+ mddev->lock);
+ if (!ktime_after(mddev->last_flush, start)) {
+ WARN_ON(mddev->flush_bio);
+ mddev->flush_bio = bio;
+ bio = NULL;
+ }
+ spin_unlock_irq(&mddev->lock);
+
+ if (!bio) {
+ INIT_WORK(&mddev->flush_work, submit_flushes);
+ queue_work(md_wq, &mddev->flush_work);
+ } else {
+ /* flush was performed for some other bio while we waited. */
+ if (bio->bi_iter.bi_size == 0)
/* an empty barrier - all done */
bio_endio(bio);
- mempool_free(fi, mddev->flush_pool);
- } else {
- INIT_WORK(&fi->flush_work, submit_flushes);
- queue_work(md_wq, &fi->flush_work);
+ else {
+ bio->bi_opf &= ~REQ_PREFLUSH;
+ mddev->pers->make_request(mddev, bio);
}
}
}
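The rewrite keeps at most one flush in flight per array and lets concurrent flush requests piggyback on it: each caller records its arrival time, and a waiter only issues its own preflush if no flush that started after its arrival has already completed (tracked with start_flush/last_flush). A worked timeline with illustrative timestamps:

/*
 *   t0  bio A arrives, flush_bio is free -> A takes the slot and
 *       submit_flushes() records start_flush = t1
 *   t2  bios B and C arrive while A's flush is in flight; each records
 *       its own 'start' and sleeps on sb_wait
 *   t3  A's flush completes -> last_flush = t1; t1 is before t2, so
 *       neither waiter may reuse it; B wins the now-empty slot and starts
 *       a flush with start_flush = t4
 *   t5  B's flush completes -> last_flush = t4; t4 is after t2, so C's
 *       wakeup condition ktime_after(mddev->last_flush, start) is true:
 *       everything written before C arrived is already stable, and C goes
 *       straight to submitting its payload (or ends an empty barrier).
 */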
@@ -560,6 +548,7 @@ void mddev_init(struct mddev *mddev)
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->active_io, 0);
spin_lock_init(&mddev->lock);
+ atomic_set(&mddev->flush_pending, 0);
init_waitqueue_head(&mddev->sb_wait);
init_waitqueue_head(&mddev->recovery_wait);
mddev->reshape_position = MaxSector;
@@ -1109,8 +1098,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
* (not needed for Linear and RAID0 as metadata doesn't
* record this size)
*/
- if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) &&
- sb->level >= 1)
+ if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
rdev->sectors = (sector_t)(2ULL << 32) - 2;
if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
@@ -1408,8 +1396,7 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
/* Limit to 4TB as metadata cannot record more than that.
* 4TB == 2^32 KB, or 2*2^32 sectors.
*/
- if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) &&
- rdev->mddev->level >= 1)
+ if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
num_sectors = (sector_t)(2ULL << 32) - 2;
do {
md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
@@ -1553,7 +1540,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
*/
s32 offset;
sector_t bb_sector;
- u64 *bbp;
+ __le64 *bbp;
int i;
int sectors = le16_to_cpu(sb->bblog_size);
if (sectors > (PAGE_SIZE / 512))
@@ -1565,7 +1552,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
if (!sync_page_io(rdev, bb_sector, sectors << 9,
rdev->bb_page, REQ_OP_READ, 0, true))
return -EIO;
- bbp = (u64 *)page_address(rdev->bb_page);
+ bbp = (__le64 *)page_address(rdev->bb_page);
rdev->badblocks.shift = sb->bblog_shift;
for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
u64 bb = le64_to_cpu(*bbp);
@@ -1877,7 +1864,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
md_error(mddev, rdev);
else {
struct badblocks *bb = &rdev->badblocks;
- u64 *bbp = (u64 *)page_address(rdev->bb_page);
+ __le64 *bbp = (__le64 *)page_address(rdev->bb_page);
u64 *p = bb->page;
sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
if (bb->changed) {
@@ -2855,8 +2842,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
err = 0;
}
} else if (cmd_match(buf, "re-add")) {
- if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
- rdev->saved_raid_disk >= 0) {
+ if (!rdev->mddev->pers)
+ err = -EINVAL;
+ else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
+ rdev->saved_raid_disk >= 0) {
/* clear_bit is performed _after_ all the devices
* have their local Faulty bit cleared. If any writes
* happen in the meantime in the local node, they
@@ -3384,10 +3373,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
return -EIO;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
- rv = mddev ? mddev_lock(mddev): -EBUSY;
+ rv = mddev ? mddev_lock(mddev) : -ENODEV;
if (!rv) {
if (rdev->mddev == NULL)
- rv = -EBUSY;
+ rv = -ENODEV;
else
rv = entry->store(rdev, page, length);
mddev_unlock(mddev);
@@ -5511,22 +5500,6 @@ int md_run(struct mddev *mddev)
if (err)
return err;
}
- if (mddev->flush_pool == NULL) {
- mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc,
- flush_info_free, mddev);
- if (!mddev->flush_pool) {
- err = -ENOMEM;
- goto abort;
- }
- }
- if (mddev->flush_bio_pool == NULL) {
- mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc,
- flush_bio_free, mddev);
- if (!mddev->flush_bio_pool) {
- err = -ENOMEM;
- goto abort;
- }
- }
spin_lock(&pers_lock);
pers = find_pers(mddev->level, mddev->clevel);
@@ -5686,11 +5659,8 @@ int md_run(struct mddev *mddev)
return 0;
abort:
- mempool_destroy(mddev->flush_bio_pool);
- mddev->flush_bio_pool = NULL;
- mempool_destroy(mddev->flush_pool);
- mddev->flush_pool = NULL;
-
+ bioset_exit(&mddev->bio_set);
+ bioset_exit(&mddev->sync_set);
return err;
}
EXPORT_SYMBOL_GPL(md_run);
@@ -5894,14 +5864,6 @@ static void __md_stop(struct mddev *mddev)
mddev->to_remove = &md_redundancy_group;
module_put(pers->owner);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- if (mddev->flush_bio_pool) {
- mempool_destroy(mddev->flush_bio_pool);
- mddev->flush_bio_pool = NULL;
- }
- if (mddev->flush_pool) {
- mempool_destroy(mddev->flush_pool);
- mddev->flush_pool = NULL;
- }
}
void md_stop(struct mddev *mddev)
@@ -7645,9 +7607,9 @@ static void status_unused(struct seq_file *seq)
static int status_resync(struct seq_file *seq, struct mddev *mddev)
{
sector_t max_sectors, resync, res;
- unsigned long dt, db;
- sector_t rt;
- int scale;
+ unsigned long dt, db = 0;
+ sector_t rt, curr_mark_cnt, resync_mark_cnt;
+ int scale, recovery_active;
unsigned int per_milli;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
@@ -7736,22 +7698,30 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
* db: blocks written from mark until now
* rt: remaining time
*
- * rt is a sector_t, so could be 32bit or 64bit.
- * So we divide before multiply in case it is 32bit and close
- * to the limit.
- * We scale the divisor (db) by 32 to avoid losing precision
- * near the end of resync when the number of remaining sectors
- * is close to 'db'.
- * We then divide rt by 32 after multiplying by db to compensate.
- * The '+1' avoids division by zero if db is very small.
+ * rt is a sector_t, which is always 64bit now. We are keeping
+ * the original algorithm, but it is not really necessary.
+ *
+ * Original algorithm:
+ * So we divide before multiply in case it is 32bit and close
+ * to the limit.
+ * We scale the divisor (db) by 32 to avoid losing precision
+ * near the end of resync when the number of remaining sectors
+ * is close to 'db'.
+ * We then divide rt by 32 after multiplying by db to compensate.
+ * The '+1' avoids division by zero if db is very small.
*/
dt = ((jiffies - mddev->resync_mark) / HZ);
if (!dt) dt++;
- db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
- - mddev->resync_mark_cnt;
+
+ curr_mark_cnt = mddev->curr_mark_cnt;
+ recovery_active = atomic_read(&mddev->recovery_active);
+ resync_mark_cnt = mddev->resync_mark_cnt;
+
+ if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
+ db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
rt = max_sectors - resync; /* number of remaining sectors */
- sector_div(rt, db/32+1);
+ rt = div64_u64(rt, db/32+1);
rt *= dt;
rt >>= 5;
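In other words the estimate is still remaining * dt / db: db is scaled down by 32 before the divide and the result shifted back by 5 afterwards purely to keep precision when the remaining count is within a few multiples of db, and the new guard keeps db at 0 (instead of wrapping to a huge unsigned value) when in-flight I/O momentarily exceeds what the last mark accounted for. A worked example with illustrative numbers:

/*
 *   dt = 10 s since the last mark, db = 20480 sectors written in that
 *   window, 2048000 sectors still to go:
 *
 *     exact:        2048000 * 10 / 20480          = 1000 s
 *     as computed:  2048000 / (20480/32 + 1)      = 3195
 *                   3195 * 10                     = 31950
 *                   31950 >> 5                    = 998 s
 */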
@@ -9257,7 +9227,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
* reshape is happening in the remote node, we need to
* update reshape_position and call start_reshape.
*/
- mddev->reshape_position = sb->reshape_position;
+ mddev->reshape_position = le64_to_cpu(sb->reshape_position);
if (mddev->pers->update_reshape_pos)
mddev->pers->update_reshape_pos(mddev);
if (mddev->pers->start_reshape)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index c52afb52c776..7c930c091193 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -1,15 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
md.h : kernel internal structure of the Linux MD driver
Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2, or (at your option)
- any later version.
-
- You should have received a copy of the GNU General Public License
- (for example /usr/src/linux/COPYING); if not, write to the Free
- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef _MD_MD_H
@@ -252,19 +245,6 @@ enum mddev_sb_flags {
MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
};
-#define NR_FLUSH_INFOS 8
-#define NR_FLUSH_BIOS 64
-struct flush_info {
- struct bio *bio;
- struct mddev *mddev;
- struct work_struct flush_work;
- atomic_t flush_pending;
-};
-struct flush_bio {
- struct flush_info *fi;
- struct md_rdev *rdev;
-};
-
struct mddev {
void *private;
struct md_personality *pers;
@@ -470,8 +450,16 @@ struct mddev {
* metadata and bitmap writes
*/
- mempool_t *flush_pool;
- mempool_t *flush_bio_pool;
+ /* Generic flush handling.
+ * The last to finish preflush schedules a worker to submit
+ * the rest of the request (without the REQ_PREFLUSH flag).
+ */
+ struct bio *flush_bio;
+ atomic_t flush_pending;
+ ktime_t start_flush, last_flush; /* last_flush is when the last completed
+ * flush was started.
+ */
+ struct work_struct flush_work;
struct work_struct event_work; /* used by dm to report failure event */
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
struct md_cluster_info *cluster_info;
diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig
index a53cbc928af1..baaec1ae29c1 100644
--- a/drivers/md/persistent-data/Kconfig
+++ b/drivers/md/persistent-data/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config DM_PERSISTENT_DATA
tristate
depends on BLK_DEV_DM
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 3972232b8037..749ec268d957 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -35,7 +35,10 @@
#define MAX_HOLDERS 4
#define MAX_STACK 10
-typedef unsigned long stack_entries[MAX_STACK];
+struct stack_store {
+ unsigned int nr_entries;
+ unsigned long entries[MAX_STACK];
+};
struct block_lock {
spinlock_t lock;
@@ -44,8 +47,7 @@ struct block_lock {
struct task_struct *holders[MAX_HOLDERS];
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
- struct stack_trace traces[MAX_HOLDERS];
- stack_entries entries[MAX_HOLDERS];
+ struct stack_store traces[MAX_HOLDERS];
#endif
};
@@ -73,7 +75,7 @@ static void __add_holder(struct block_lock *lock, struct task_struct *task)
{
unsigned h = __find_holder(lock, NULL);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
- struct stack_trace *t;
+ struct stack_store *t;
#endif
get_task_struct(task);
@@ -81,11 +83,7 @@ static void __add_holder(struct block_lock *lock, struct task_struct *task)
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
t = lock->traces + h;
- t->nr_entries = 0;
- t->max_entries = MAX_STACK;
- t->entries = lock->entries[h];
- t->skip = 2;
- save_stack_trace(t);
+ t->nr_entries = stack_trace_save(t->entries, MAX_STACK, 2);
#endif
}
@@ -106,7 +104,8 @@ static int __check_holder(struct block_lock *lock)
DMERR("recursive lock detected in metadata");
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
DMERR("previously held here:");
- print_stack_trace(lock->traces + i, 4);
+ stack_trace_print(lock->traces[i].entries,
+ lock->traces[i].nr_entries, 4);
DMERR("subsequent acquisition attempted here:");
dump_stack();
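The open-coded struct stack_trace setup is replaced with the newer stack-trace helpers, which work on a plain array of return addresses and hand back the number captured. A minimal self-contained sketch of the pattern; the struct mirrors this file, but the two helper functions are hypothetical:

#include <linux/stacktrace.h>

#define MAX_STACK 10

struct stack_store {
	unsigned int nr_entries;
	unsigned long entries[MAX_STACK];
};

static void record_holder(struct stack_store *t)
{
	/* capture up to MAX_STACK entries, skipping two frames of our own */
	t->nr_entries = stack_trace_save(t->entries, MAX_STACK, 2);
}

static void report_holder(const struct stack_store *t)
{
	/* print the saved trace indented by four spaces, as __check_holder does */
	stack_trace_print(t->entries, t->nr_entries, 4);
}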
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index 0a3b8ae4a29c..b8a62188f6be 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -190,6 +190,8 @@ static int sm_find_free(void *addr, unsigned begin, unsigned end,
static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm)
{
+ memset(ll, 0, sizeof(struct ll_disk));
+
ll->tm = tm;
ll->bitmap_info.tm = tm;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index f3fb5bb8c82a..bf5cf184a260 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
raid0.c : Multiple Devices driver for Linux
Copyright (C) 1994-96 Marc ZYNGIER
@@ -7,14 +8,6 @@
RAID-0 management functions.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2, or (at your option)
- any later version.
-
- You should have received a copy of the GNU General Public License
- (for example /usr/src/linux/COPYING); if not, write to the Free
- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/blkdev.h>
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index fdf451aac369..2aa36e570e04 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* raid1.c : Multiple Devices driver for Linux
*
@@ -20,15 +21,6 @@
*
* Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
* - persistent bitmap code
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/slab.h>
@@ -2110,7 +2102,7 @@ static void process_checks(struct r1bio *r1_bio)
}
r1_bio->read_disk = primary;
for (i = 0; i < conf->raid_disks * 2; i++) {
- int j;
+ int j = 0;
struct bio *pbio = r1_bio->bios[primary];
struct bio *sbio = r1_bio->bios[i];
blk_status_t status = sbio->bi_status;
@@ -2125,8 +2117,8 @@ static void process_checks(struct r1bio *r1_bio)
/* Now we can 'fixup' the error value */
sbio->bi_status = 0;
- bio_for_each_segment_all(bi, sbio, j, iter_all)
- page_len[j] = bi->bv_len;
+ bio_for_each_segment_all(bi, sbio, iter_all)
+ page_len[j++] = bi->bv_len;
if (!status) {
for (j = vcnt; j-- ; ) {
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3b6880dd648d..aea11476fee6 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* raid10.c : Multiple Devices driver for Linux
*
@@ -6,16 +7,6 @@
* RAID-10 support for md.
*
* Base on code in raid1.c. See raid1.c for further copyright information.
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/slab.h>
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index cbbe6b6535be..9b6da759dca2 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1,16 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2015 Shaohua Li <shli@fb.com>
* Copyright (C) 2016 Song Liu <songliubraving@fb.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
*/
#include <linux/kernel.h>
#include <linux/wait.h>
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 17e9e7d51097..18a4064a61a8 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Partial Parity Log for closing the RAID5 write hole
* Copyright (c) 2017, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
*/
#include <linux/kernel.h>
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c033bfcb209e..b83bce2beb66 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* raid5.c : Multiple Devices driver for Linux
* Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
@@ -7,15 +8,6 @@
* RAID-4/5/6 management functions.
* Thanks to Penguin Computing for making the RAID-6 development possible
* by donating a test server!
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
@@ -711,6 +703,8 @@ static bool is_full_stripe_write(struct stripe_head *sh)
}
static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
+ __acquires(&sh1->stripe_lock)
+ __acquires(&sh2->stripe_lock)
{
if (sh1 > sh2) {
spin_lock_irq(&sh2->stripe_lock);
@@ -722,6 +716,8 @@ static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
}
static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
+ __releases(&sh1->stripe_lock)
+ __releases(&sh2->stripe_lock)
{
spin_unlock(&sh1->stripe_lock);
spin_unlock_irq(&sh2->stripe_lock);
@@ -4187,7 +4183,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
/* now write out any block on a failed drive,
* or P or Q if they were recomputed
*/
- BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
+ dev = NULL;
if (s->failed == 2) {
dev = &sh->dev[s->failed_num[1]];
s->locked++;
@@ -4212,6 +4208,14 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantwrite, &dev->flags);
}
+ if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags),
+ "%s: disk%td not up to date\n",
+ mdname(conf->mddev),
+ dev - (struct r5dev *) &sh->dev)) {
+ clear_bit(R5_LOCKED, &dev->flags);
+ clear_bit(R5_Wantwrite, &dev->flags);
+ s->locked--;
+ }
clear_bit(STRIPE_DEGRADED, &sh->state);
set_bit(STRIPE_INSYNC, &sh->state);
@@ -6166,6 +6170,8 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
static int handle_active_stripes(struct r5conf *conf, int group,
struct r5worker *worker,
struct list_head *temp_inactive_list)
+ __releases(&conf->device_lock)
+ __acquires(&conf->device_lock)
{
struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
int i, batch_size = 0, hash;