summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig13
-rw-r--r--drivers/md/Makefile2
-rw-r--r--drivers/md/bcache/alloc.c64
-rw-r--r--drivers/md/bcache/bcache.h2
-rw-r--r--drivers/md/bcache/bset.c124
-rw-r--r--drivers/md/bcache/bset.h40
-rw-r--r--drivers/md/bcache/btree.c69
-rw-r--r--drivers/md/bcache/extents.c53
-rw-r--r--drivers/md/bcache/movinggc.c41
-rw-r--r--drivers/md/bcache/super.c10
-rw-r--r--drivers/md/bcache/sysfs.c4
-rw-r--r--drivers/md/bcache/util.h67
-rw-r--r--drivers/md/bcache/writeback.c13
-rw-r--r--drivers/md/dm-bufio.c15
-rw-r--r--drivers/md/dm-cache-target.c24
-rw-r--r--drivers/md/dm-core.h1
-rw-r--r--drivers/md/dm-crypt.c27
-rw-r--r--drivers/md/dm-ebs-target.c7
-rw-r--r--drivers/md/dm-flakey.c72
-rw-r--r--drivers/md/dm-integrity.c58
-rw-r--r--drivers/md/dm-ps-historical-service-time.c4
-rw-r--r--drivers/md/dm-ps-queue-length.c4
-rw-r--r--drivers/md/dm-ps-round-robin.c4
-rw-r--r--drivers/md/dm-ps-service-time.c4
-rw-r--r--drivers/md/dm-raid.c2
-rw-r--r--drivers/md/dm-raid1.c5
-rw-r--r--drivers/md/dm-stripe.c1
-rw-r--r--drivers/md/dm-table.c30
-rw-r--r--drivers/md/dm-vdo/dedupe.c1
-rw-r--r--drivers/md/dm-vdo/indexer/index-layout.c5
-rw-r--r--drivers/md/dm-vdo/indexer/volume.c24
-rw-r--r--drivers/md/dm-vdo/vdo.c11
-rw-r--r--drivers/md/dm-verity-fec.c4
-rw-r--r--drivers/md/dm-verity-target.c16
-rw-r--r--drivers/md/dm-verity-verify-sig.c17
-rw-r--r--drivers/md/dm-zone.c25
-rw-r--r--drivers/md/dm-zoned-target.c2
-rw-r--r--drivers/md/dm.c75
-rw-r--r--drivers/md/md-autodetect.c8
-rw-r--r--drivers/md/md-bitmap.c79
-rw-r--r--drivers/md/md-bitmap.h7
-rw-r--r--drivers/md/md-linear.c352
-rw-r--r--drivers/md/md.c67
-rw-r--r--drivers/md/md.h5
-rw-r--r--drivers/md/raid0.c4
-rw-r--r--drivers/md/raid1.c65
-rw-r--r--drivers/md/raid1.h1
-rw-r--r--drivers/md/raid10.c42
-rw-r--r--drivers/md/raid10.h1
-rw-r--r--drivers/md/raid5-cache.c4
-rw-r--r--drivers/md/raid5.c111
-rw-r--r--drivers/md/raid5.h4
52 files changed, 1077 insertions, 613 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 1e9db8e4acdf..0b1870a09e1f 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -61,6 +61,19 @@ config MD_BITMAP_FILE
various kernel APIs and can only work with files on a file system not
actually sitting on the MD device.
+config MD_LINEAR
+ tristate "Linear (append) mode"
+ depends on BLK_DEV_MD
+ help
+ If you say Y here, then your multiple devices driver will be able to
+ use the so-called linear mode, i.e. it will combine the hard disk
+ partitions by simply appending one to the other.
+
+ To compile this as a module, choose M here: the module
+ will be called linear.
+
+ If unsure, say Y.
+
config MD_RAID0
tristate "RAID-0 (striping) mode"
depends on BLK_DEV_MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 476a214e4bdc..87bdfc9fe14c 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -29,12 +29,14 @@ dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o
md-mod-y += md.o md-bitmap.o
raid456-y += raid5.o raid5-cache.o raid5-ppl.o
+linear-y += md-linear.o
# Note: link order is important. All raid personalities
# and must come before md.o, as they each initialise
# themselves, and md.o may use the personalities when it
# auto-initialised.
+obj-$(CONFIG_MD_LINEAR) += linear.o
obj-$(CONFIG_MD_RAID0) += raid0.o
obj-$(CONFIG_MD_RAID1) += raid1.o
obj-$(CONFIG_MD_RAID10) += raid10.o
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index da50f6661bae..48ce750bf70a 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -164,68 +164,40 @@ static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
* prio is worth 1/8th of what INITIAL_PRIO is worth.
*/
-static inline unsigned int new_bucket_prio(struct cache *ca, struct bucket *b)
-{
- unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8;
-
- return (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b);
-}
-
-static inline bool new_bucket_max_cmp(const void *l, const void *r, void *args)
-{
- struct bucket **lhs = (struct bucket **)l;
- struct bucket **rhs = (struct bucket **)r;
- struct cache *ca = args;
-
- return new_bucket_prio(ca, *lhs) > new_bucket_prio(ca, *rhs);
-}
-
-static inline bool new_bucket_min_cmp(const void *l, const void *r, void *args)
-{
- struct bucket **lhs = (struct bucket **)l;
- struct bucket **rhs = (struct bucket **)r;
- struct cache *ca = args;
-
- return new_bucket_prio(ca, *lhs) < new_bucket_prio(ca, *rhs);
-}
-
-static inline void new_bucket_swap(void *l, void *r, void __always_unused *args)
-{
- struct bucket **lhs = l, **rhs = r;
+#define bucket_prio(b) \
+({ \
+ unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \
+ \
+ (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \
+})
- swap(*lhs, *rhs);
-}
+#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r))
+#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r))
static void invalidate_buckets_lru(struct cache *ca)
{
struct bucket *b;
- const struct min_heap_callbacks bucket_max_cmp_callback = {
- .less = new_bucket_max_cmp,
- .swp = new_bucket_swap,
- };
- const struct min_heap_callbacks bucket_min_cmp_callback = {
- .less = new_bucket_min_cmp,
- .swp = new_bucket_swap,
- };
+ ssize_t i;
- ca->heap.nr = 0;
+ ca->heap.used = 0;
for_each_bucket(b, ca) {
if (!bch_can_invalidate_bucket(ca, b))
continue;
- if (!min_heap_full(&ca->heap))
- min_heap_push(&ca->heap, &b, &bucket_max_cmp_callback, ca);
- else if (!new_bucket_max_cmp(&b, min_heap_peek(&ca->heap), ca)) {
+ if (!heap_full(&ca->heap))
+ heap_add(&ca->heap, b, bucket_max_cmp);
+ else if (bucket_max_cmp(b, heap_peek(&ca->heap))) {
ca->heap.data[0] = b;
- min_heap_sift_down(&ca->heap, 0, &bucket_max_cmp_callback, ca);
+ heap_sift(&ca->heap, 0, bucket_max_cmp);
}
}
- min_heapify_all(&ca->heap, &bucket_min_cmp_callback, ca);
+ for (i = ca->heap.used / 2 - 1; i >= 0; --i)
+ heap_sift(&ca->heap, i, bucket_min_cmp);
while (!fifo_full(&ca->free_inc)) {
- if (!ca->heap.nr) {
+ if (!heap_pop(&ca->heap, b, bucket_min_cmp)) {
/*
* We don't want to be calling invalidate_buckets()
* multiple times when it can't do anything
@@ -234,8 +206,6 @@ static void invalidate_buckets_lru(struct cache *ca)
wake_up_gc(ca->set);
return;
}
- b = min_heap_peek(&ca->heap)[0];
- min_heap_pop(&ca->heap, &bucket_min_cmp_callback, ca);
bch_invalidate_one_bucket(ca, b);
}
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 785b0d9008fa..1d33e40d26ea 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -458,7 +458,7 @@ struct cache {
/* Allocation stuff: */
struct bucket *buckets;
- DEFINE_MIN_HEAP(struct bucket *, cache_heap) heap;
+ DECLARE_HEAP(struct bucket *, heap);
/*
* If nonzero, we know we aren't going to find any buckets to invalidate
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index bd97d8626887..463eb13bd0b2 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -54,11 +54,9 @@ void bch_dump_bucket(struct btree_keys *b)
int __bch_count_data(struct btree_keys *b)
{
unsigned int ret = 0;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct bkey *k;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
-
if (b->ops->is_extents)
for_each_key(b, k, &iter)
ret += KEY_SIZE(k);
@@ -69,11 +67,9 @@ void __bch_check_keys(struct btree_keys *b, const char *fmt, ...)
{
va_list args;
struct bkey *k, *p = NULL;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
const char *err;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
-
for_each_key(b, k, &iter) {
if (b->ops->is_extents) {
err = "Keys out of order";
@@ -114,9 +110,9 @@ bug:
static void bch_btree_iter_next_check(struct btree_iter *iter)
{
- struct bkey *k = iter->heap.data->k, *next = bkey_next(k);
+ struct bkey *k = iter->data->k, *next = bkey_next(k);
- if (next < iter->heap.data->end &&
+ if (next < iter->data->end &&
bkey_cmp(k, iter->b->ops->is_extents ?
&START_KEY(next) : next) > 0) {
bch_dump_bucket(iter->b);
@@ -883,14 +879,12 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
unsigned int status = BTREE_INSERT_STATUS_NO_INSERT;
struct bset *i = bset_tree_last(b)->data;
struct bkey *m, *prev = NULL;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct bkey preceding_key_on_stack = ZERO_KEY;
struct bkey *preceding_key_p = &preceding_key_on_stack;
BUG_ON(b->ops->is_extents && !KEY_SIZE(k));
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
-
/*
* If k has preceding key, preceding_key_p will be set to address
* of k's preceding key; otherwise preceding_key_p will be set
@@ -901,9 +895,9 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
else
preceding_key(k, &preceding_key_p);
- m = bch_btree_iter_init(b, &iter, preceding_key_p);
+ m = bch_btree_iter_stack_init(b, &iter, preceding_key_p);
- if (b->ops->insert_fixup(b, k, &iter, replace_key))
+ if (b->ops->insert_fixup(b, k, &iter.iter, replace_key))
return status;
status = BTREE_INSERT_STATUS_INSERT;
@@ -1083,102 +1077,79 @@ struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
/* Btree iterator */
-typedef bool (new_btree_iter_cmp_fn)(const void *, const void *, void *);
-
-static inline bool new_btree_iter_cmp(const void *l, const void *r, void __always_unused *args)
-{
- const struct btree_iter_set *_l = l;
- const struct btree_iter_set *_r = r;
-
- return bkey_cmp(_l->k, _r->k) <= 0;
-}
+typedef bool (btree_iter_cmp_fn)(struct btree_iter_set,
+ struct btree_iter_set);
-static inline void new_btree_iter_swap(void *iter1, void *iter2, void __always_unused *args)
+static inline bool btree_iter_cmp(struct btree_iter_set l,
+ struct btree_iter_set r)
{
- struct btree_iter_set *_iter1 = iter1;
- struct btree_iter_set *_iter2 = iter2;
-
- swap(*_iter1, *_iter2);
+ return bkey_cmp(l.k, r.k) > 0;
}
static inline bool btree_iter_end(struct btree_iter *iter)
{
- return !iter->heap.nr;
+ return !iter->used;
}
void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
struct bkey *end)
{
- const struct min_heap_callbacks callbacks = {
- .less = new_btree_iter_cmp,
- .swp = new_btree_iter_swap,
- };
-
if (k != end)
- BUG_ON(!min_heap_push(&iter->heap,
- &((struct btree_iter_set) { k, end }),
- &callbacks,
- NULL));
+ BUG_ON(!heap_add(iter,
+ ((struct btree_iter_set) { k, end }),
+ btree_iter_cmp));
}
-static struct bkey *__bch_btree_iter_init(struct btree_keys *b,
- struct btree_iter *iter,
- struct bkey *search,
- struct bset_tree *start)
+static struct bkey *__bch_btree_iter_stack_init(struct btree_keys *b,
+ struct btree_iter_stack *iter,
+ struct bkey *search,
+ struct bset_tree *start)
{
struct bkey *ret = NULL;
- iter->heap.size = ARRAY_SIZE(iter->heap.preallocated);
- iter->heap.nr = 0;
+ iter->iter.size = ARRAY_SIZE(iter->stack_data);
+ iter->iter.used = 0;
#ifdef CONFIG_BCACHE_DEBUG
- iter->b = b;
+ iter->iter.b = b;
#endif
for (; start <= bset_tree_last(b); start++) {
ret = bch_bset_search(b, start, search);
- bch_btree_iter_push(iter, ret, bset_bkey_last(start->data));
+ bch_btree_iter_push(&iter->iter, ret, bset_bkey_last(start->data));
}
return ret;
}
-struct bkey *bch_btree_iter_init(struct btree_keys *b,
- struct btree_iter *iter,
+struct bkey *bch_btree_iter_stack_init(struct btree_keys *b,
+ struct btree_iter_stack *iter,
struct bkey *search)
{
- return __bch_btree_iter_init(b, iter, search, b->set);
+ return __bch_btree_iter_stack_init(b, iter, search, b->set);
}
static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
- new_btree_iter_cmp_fn *cmp)
+ btree_iter_cmp_fn *cmp)
{
struct btree_iter_set b __maybe_unused;
struct bkey *ret = NULL;
- const struct min_heap_callbacks callbacks = {
- .less = cmp,
- .swp = new_btree_iter_swap,
- };
if (!btree_iter_end(iter)) {
bch_btree_iter_next_check(iter);
- ret = iter->heap.data->k;
- iter->heap.data->k = bkey_next(iter->heap.data->k);
+ ret = iter->data->k;
+ iter->data->k = bkey_next(iter->data->k);
- if (iter->heap.data->k > iter->heap.data->end) {
+ if (iter->data->k > iter->data->end) {
WARN_ONCE(1, "bset was corrupt!\n");
- iter->heap.data->k = iter->heap.data->end;
+ iter->data->k = iter->data->end;
}
- if (iter->heap.data->k == iter->heap.data->end) {
- if (iter->heap.nr) {
- b = min_heap_peek(&iter->heap)[0];
- min_heap_pop(&iter->heap, &callbacks, NULL);
- }
- }
+ if (iter->data->k == iter->data->end)
+ heap_pop(iter, b, cmp);
else
- min_heap_sift_down(&iter->heap, 0, &callbacks, NULL);
+ heap_sift(iter, 0, cmp);
}
return ret;
@@ -1186,7 +1157,7 @@ static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter,
struct bkey *bch_btree_iter_next(struct btree_iter *iter)
{
- return __bch_btree_iter_next(iter, new_btree_iter_cmp);
+ return __bch_btree_iter_next(iter, btree_iter_cmp);
}
@@ -1224,18 +1195,16 @@ static void btree_mergesort(struct btree_keys *b, struct bset *out,
struct btree_iter *iter,
bool fixup, bool remove_stale)
{
+ int i;
struct bkey *k, *last = NULL;
BKEY_PADDED(k) tmp;
bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale
? bch_ptr_bad
: bch_ptr_invalid;
- const struct min_heap_callbacks callbacks = {
- .less = b->ops->sort_cmp,
- .swp = new_btree_iter_swap,
- };
/* Heapify the iterator, using our comparison function */
- min_heapify_all(&iter->heap, &callbacks, NULL);
+ for (i = iter->used / 2 - 1; i >= 0; --i)
+ heap_sift(iter, i, b->ops->sort_cmp);
while (!btree_iter_end(iter)) {
if (b->ops->sort_fixup && fixup)
@@ -1324,11 +1293,10 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start,
struct bset_sort_state *state)
{
size_t order = b->page_order, keys = 0;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
int oldsize = bch_count_data(b);
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
- __bch_btree_iter_init(b, &iter, NULL, &b->set[start]);
+ __bch_btree_iter_stack_init(b, &iter, NULL, &b->set[start]);
if (start) {
unsigned int i;
@@ -1339,7 +1307,7 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start,
order = get_order(__set_bytes(b->set->data, keys));
}
- __btree_sort(b, &iter, start, order, false, state);
+ __btree_sort(b, &iter.iter, start, order, false, state);
EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize);
}
@@ -1355,13 +1323,11 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
struct bset_sort_state *state)
{
uint64_t start_time = local_clock();
- struct btree_iter iter;
-
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
+ struct btree_iter_stack iter;
- bch_btree_iter_init(b, &iter, NULL);
+ bch_btree_iter_stack_init(b, &iter, NULL);
- btree_mergesort(b, new->set->data, &iter, false, true);
+ btree_mergesort(b, new->set->data, &iter.iter, false, true);
bch_time_stats_update(&state->time, start_time);
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index f79441acd4c1..011f6062c4c0 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -187,9 +187,8 @@ struct bset_tree {
};
struct btree_keys_ops {
- bool (*sort_cmp)(const void *l,
- const void *r,
- void *args);
+ bool (*sort_cmp)(struct btree_iter_set l,
+ struct btree_iter_set r);
struct bkey *(*sort_fixup)(struct btree_iter *iter,
struct bkey *tmp);
bool (*insert_fixup)(struct btree_keys *b,
@@ -313,17 +312,23 @@ enum {
BTREE_INSERT_STATUS_FRONT_MERGE,
};
-struct btree_iter_set {
- struct bkey *k, *end;
-};
-
/* Btree key iteration */
struct btree_iter {
+ size_t size, used;
#ifdef CONFIG_BCACHE_DEBUG
struct btree_keys *b;
#endif
- MIN_HEAP_PREALLOCATED(struct btree_iter_set, btree_iter_heap, MAX_BSETS) heap;
+ struct btree_iter_set {
+ struct bkey *k, *end;
+ } data[];
+};
+
+/* Fixed-size btree_iter that can be allocated on the stack */
+
+struct btree_iter_stack {
+ struct btree_iter iter;
+ struct btree_iter_set stack_data[MAX_BSETS];
};
typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k);
@@ -335,9 +340,9 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
struct bkey *end);
-struct bkey *bch_btree_iter_init(struct btree_keys *b,
- struct btree_iter *iter,
- struct bkey *search);
+struct bkey *bch_btree_iter_stack_init(struct btree_keys *b,
+ struct btree_iter_stack *iter,
+ struct bkey *search);
struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
const struct bkey *search);
@@ -352,13 +357,14 @@ static inline struct bkey *bch_bset_search(struct btree_keys *b,
return search ? __bch_bset_search(b, t, search) : t->data->start;
}
-#define for_each_key_filter(b, k, iter, filter) \
- for (bch_btree_iter_init((b), (iter), NULL); \
- ((k) = bch_btree_iter_next_filter((iter), (b), filter));)
+#define for_each_key_filter(b, k, stack_iter, filter) \
+ for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \
+ ((k) = bch_btree_iter_next_filter(&((stack_iter)->iter), (b), \
+ filter));)
-#define for_each_key(b, k, iter) \
- for (bch_btree_iter_init((b), (iter), NULL); \
- ((k) = bch_btree_iter_next(iter));)
+#define for_each_key(b, k, stack_iter) \
+ for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \
+ ((k) = bch_btree_iter_next(&((stack_iter)->iter)));)
/* Sorting */
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index ed40d8600656..4e6ccf2c8a0b 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -149,19 +149,19 @@ void bch_btree_node_read_done(struct btree *b)
{
const char *err = "bad btree header";
struct bset *i = btree_bset_first(b);
- struct btree_iter iter;
+ struct btree_iter *iter;
/*
* c->fill_iter can allocate an iterator with more memory space
* than static MAX_BSETS.
* See the comment arount cache_set->fill_iter.
*/
- iter.heap.data = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
- iter.heap.size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size;
- iter.heap.nr = 0;
+ iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
+ iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size;
+ iter->used = 0;
#ifdef CONFIG_BCACHE_DEBUG
- iter.b = &b->keys;
+ iter->b = &b->keys;
#endif
if (!i->seq)
@@ -199,7 +199,7 @@ void bch_btree_node_read_done(struct btree *b)
if (i != b->keys.set[0].data && !i->keys)
goto err;
- bch_btree_iter_push(&iter, i->start, bset_bkey_last(i));
+ bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
b->written += set_blocks(i, block_bytes(b->c->cache));
}
@@ -211,7 +211,7 @@ void bch_btree_node_read_done(struct btree *b)
if (i->seq == b->keys.set[0].data->seq)
goto err;
- bch_btree_sort_and_fix_extents(&b->keys, &iter, &b->c->sort);
+ bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);
i = b->keys.set[0].data;
err = "short btree key";
@@ -223,7 +223,7 @@ void bch_btree_node_read_done(struct btree *b)
bch_bset_init_next(&b->keys, write_block(b),
bset_magic(&b->c->cache->sb));
out:
- mempool_free(iter.heap.data, &b->c->fill_iter);
+ mempool_free(iter, &b->c->fill_iter);
return;
err:
set_btree_node_io_error(b);
@@ -1309,11 +1309,9 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
uint8_t stale = 0;
unsigned int keys = 0, good_keys = 0;
struct bkey *k;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct bset_tree *t;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
-
gc->nodes++;
for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
@@ -1572,11 +1570,9 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
static unsigned int btree_gc_count_keys(struct btree *b)
{
struct bkey *k;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
unsigned int ret = 0;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
-
for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
ret += bkey_u64s(k);
@@ -1615,18 +1611,18 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
int ret = 0;
bool should_rewrite;
struct bkey *k;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct gc_merge_info r[GC_MERGE_NODES];
struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
- bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
+ bch_btree_iter_stack_init(&b->keys, &iter, &b->c->gc_done);
for (i = r; i < r + ARRAY_SIZE(r); i++)
i->b = ERR_PTR(-EINTR);
while (1) {
- k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
+ k = bch_btree_iter_next_filter(&iter.iter, &b->keys,
+ bch_ptr_bad);
if (k) {
r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
true, b);
@@ -1921,9 +1917,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
{
int ret = 0;
struct bkey *k, *p = NULL;
- struct btree_iter iter;
-
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
+ struct btree_iter_stack iter;
for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
bch_initial_mark_key(b->c, b->level, k);
@@ -1931,10 +1925,10 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
bch_initial_mark_key(b->c, b->level + 1, &b->key);
if (b->level) {
- bch_btree_iter_init(&b->keys, &iter, NULL);
+ bch_btree_iter_stack_init(&b->keys, &iter, NULL);
do {
- k = bch_btree_iter_next_filter(&iter, &b->keys,
+ k = bch_btree_iter_next_filter(&iter.iter, &b->keys,
bch_ptr_bad);
if (k) {
btree_node_prefetch(b, k);
@@ -1962,7 +1956,7 @@ static int bch_btree_check_thread(void *arg)
struct btree_check_info *info = arg;
struct btree_check_state *check_state = info->state;
struct cache_set *c = check_state->c;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct bkey *k, *p;
int cur_idx, prev_idx, skip_nr;
@@ -1970,11 +1964,9 @@ static int bch_btree_check_thread(void *arg)
cur_idx = prev_idx = 0;
ret = 0;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
-
/* root node keys are checked before thread created */
- bch_btree_iter_init(&c->root->keys, &iter, NULL);
- k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
+ bch_btree_iter_stack_init(&c->root->keys, &iter, NULL);
+ k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad);
BUG_ON(!k);
p = k;
@@ -1992,7 +1984,7 @@ static int bch_btree_check_thread(void *arg)
skip_nr = cur_idx - prev_idx;
while (skip_nr) {
- k = bch_btree_iter_next_filter(&iter,
+ k = bch_btree_iter_next_filter(&iter.iter,
&c->root->keys,
bch_ptr_bad);
if (k)
@@ -2065,11 +2057,9 @@ int bch_btree_check(struct cache_set *c)
int ret = 0;
int i;
struct bkey *k = NULL;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct btree_check_state check_state;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
-
/* check and mark root node keys */
for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid)
bch_initial_mark_key(c, c->root->level, k);
@@ -2563,12 +2553,11 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
if (b->level) {
struct bkey *k;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
- bch_btree_iter_init(&b->keys, &iter, from);
+ bch_btree_iter_stack_init(&b->keys, &iter, from);
- while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
+ while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys,
bch_ptr_bad))) {
ret = bcache_btree(map_nodes_recurse, k, b,
op, from, fn, flags);
@@ -2597,12 +2586,12 @@ int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
{
int ret = MAP_CONTINUE;
struct bkey *k;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
- bch_btree_iter_init(&b->keys, &iter, from);
+ bch_btree_iter_stack_init(&b->keys, &iter, from);
- while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
+ while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys,
+ bch_ptr_bad))) {
ret = !b->level
? fn(op, b, k)
: bcache_btree(map_keys_recurse, k,
diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c
index a7221e5dbe81..d626ffcbecb9 100644
--- a/drivers/md/bcache/extents.c
+++ b/drivers/md/bcache/extents.c
@@ -33,16 +33,15 @@ static void sort_key_next(struct btree_iter *iter,
i->k = bkey_next(i->k);
if (i->k == i->end)
- *i = iter->heap.data[--iter->heap.nr];
+ *i = iter->data[--iter->used];
}
-static bool new_bch_key_sort_cmp(const void *l, const void *r, void *args)
+static bool bch_key_sort_cmp(struct btree_iter_set l,
+ struct btree_iter_set r)
{
- struct btree_iter_set *_l = (struct btree_iter_set *)l;
- struct btree_iter_set *_r = (struct btree_iter_set *)r;
- int64_t c = bkey_cmp(_l->k, _r->k);
+ int64_t c = bkey_cmp(l.k, r.k);
- return !(c ? c > 0 : _l->k < _r->k);
+ return c ? c > 0 : l.k < r.k;
}
static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
@@ -239,7 +238,7 @@ static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk,
}
const struct btree_keys_ops bch_btree_keys_ops = {
- .sort_cmp = new_bch_key_sort_cmp,
+ .sort_cmp = bch_key_sort_cmp,
.insert_fixup = bch_btree_ptr_insert_fixup,
.key_invalid = bch_btree_ptr_invalid,
.key_bad = bch_btree_ptr_bad,
@@ -256,36 +255,22 @@ const struct btree_keys_ops bch_btree_keys_ops = {
* Necessary for btree_sort_fixup() - if there are multiple keys that compare
* equal in different sets, we have to process them newest to oldest.
*/
-
-static bool new_bch_extent_sort_cmp(const void *l, const void *r, void __always_unused *args)
-{
- struct btree_iter_set *_l = (struct btree_iter_set *)l;
- struct btree_iter_set *_r = (struct btree_iter_set *)r;
- int64_t c = bkey_cmp(&START_KEY(_l->k), &START_KEY(_r->k));
-
- return !(c ? c > 0 : _l->k < _r->k);
-}
-
-static inline void new_btree_iter_swap(void *iter1, void *iter2, void __always_unused *args)
+static bool bch_extent_sort_cmp(struct btree_iter_set l,
+ struct btree_iter_set r)
{
- struct btree_iter_set *_iter1 = iter1;
- struct btree_iter_set *_iter2 = iter2;
+ int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
- swap(*_iter1, *_iter2);
+ return c ? c > 0 : l.k < r.k;
}
static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
struct bkey *tmp)
{
- const struct min_heap_callbacks callbacks = {
- .less = new_bch_extent_sort_cmp,
- .swp = new_btree_iter_swap,
- };
- while (iter->heap.nr > 1) {
- struct btree_iter_set *top = iter->heap.data, *i = top + 1;
-
- if (iter->heap.nr > 2 &&
- !new_bch_extent_sort_cmp(&i[0], &i[1], NULL))
+ while (iter->used > 1) {
+ struct btree_iter_set *top = iter->data, *i = top + 1;
+
+ if (iter->used > 2 &&
+ bch_extent_sort_cmp(i[0], i[1]))
i++;
if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
@@ -293,7 +278,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
if (!KEY_SIZE(i->k)) {
sort_key_next(iter, i);
- min_heap_sift_down(&iter->heap, i - top, &callbacks, NULL);
+ heap_sift(iter, i - top, bch_extent_sort_cmp);
continue;
}
@@ -303,7 +288,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
else
bch_cut_front(top->k, i->k);
- min_heap_sift_down(&iter->heap, i - top, &callbacks, NULL);
+ heap_sift(iter, i - top, bch_extent_sort_cmp);
} else {
/* can't happen because of comparison func */
BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
@@ -313,7 +298,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
bch_cut_back(&START_KEY(i->k), tmp);
bch_cut_front(i->k, top->k);
- min_heap_sift_down(&iter->heap, 0, &callbacks, NULL);
+ heap_sift(iter, 0, bch_extent_sort_cmp);
return tmp;
} else {
@@ -633,7 +618,7 @@ static bool bch_extent_merge(struct btree_keys *bk,
}
const struct btree_keys_ops bch_extent_keys_ops = {
- .sort_cmp = new_bch_extent_sort_cmp,
+ .sort_cmp = bch_extent_sort_cmp,
.sort_fixup = bch_extent_sort_fixup,
.insert_fixup = bch_extent_insert_fixup,
.key_invalid = bch_extent_invalid,
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 7f482729c56d..ebd500bdf0b2 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -182,27 +182,16 @@ err: if (!IS_ERR_OR_NULL(w->private))
closure_sync(&cl);
}
-static bool new_bucket_cmp(const void *l, const void *r, void __always_unused *args)
+static bool bucket_cmp(struct bucket *l, struct bucket *r)
{
- struct bucket **_l = (struct bucket **)l;
- struct bucket **_r = (struct bucket **)r;
-
- return GC_SECTORS_USED(*_l) >= GC_SECTORS_USED(*_r);
-}
-
-static void new_bucket_swap(void *l, void *r, void __always_unused *args)
-{
- struct bucket **_l = l;
- struct bucket **_r = r;
-
- swap(*_l, *_r);
+ return GC_SECTORS_USED(l) < GC_SECTORS_USED(r);
}
static unsigned int bucket_heap_top(struct cache *ca)
{
struct bucket *b;
- return (b = min_heap_peek(&ca->heap)[0]) ? GC_SECTORS_USED(b) : 0;
+ return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0;
}
void bch_moving_gc(struct cache_set *c)
@@ -210,10 +199,6 @@ void bch_moving_gc(struct cache_set *c)
struct cache *ca = c->cache;
struct bucket *b;
unsigned long sectors_to_move, reserve_sectors;
- const struct min_heap_callbacks callbacks = {
- .less = new_bucket_cmp,
- .swp = new_bucket_swap,
- };
if (!c->copy_gc_enabled)
return;
@@ -224,7 +209,7 @@ void bch_moving_gc(struct cache_set *c)
reserve_sectors = ca->sb.bucket_size *
fifo_used(&ca->free[RESERVE_MOVINGGC]);
- ca->heap.nr = 0;
+ ca->heap.used = 0;
for_each_bucket(b, ca) {
if (GC_MARK(b) == GC_MARK_METADATA ||
@@ -233,31 +218,25 @@ void bch_moving_gc(struct cache_set *c)
atomic_read(&b->pin))
continue;
- if (!min_heap_full(&ca->heap)) {
+ if (!heap_full(&ca->heap)) {
sectors_to_move += GC_SECTORS_USED(b);
- min_heap_push(&ca->heap, &b, &callbacks, NULL);
- } else if (!new_bucket_cmp(&b, min_heap_peek(&ca->heap), ca)) {
+ heap_add(&ca->heap, b, bucket_cmp);
+ } else if (bucket_cmp(b, heap_peek(&ca->heap))) {
sectors_to_move -= bucket_heap_top(ca);
sectors_to_move += GC_SECTORS_USED(b);
ca->heap.data[0] = b;
- min_heap_sift_down(&ca->heap, 0, &callbacks, NULL);
+ heap_sift(&ca->heap, 0, bucket_cmp);
}
}
while (sectors_to_move > reserve_sectors) {
- if (ca->heap.nr) {
- b = min_heap_peek(&ca->heap)[0];
- min_heap_pop(&ca->heap, &callbacks, NULL);
- }
+ heap_pop(&ca->heap, b, bucket_cmp);
sectors_to_move -= GC_SECTORS_USED(b);
}
- while (ca->heap.nr) {
- b = min_heap_peek(&ca->heap)[0];
- min_heap_pop(&ca->heap, &callbacks, NULL);
+ while (heap_pop(&ca->heap, b, bucket_cmp))
SET_GC_MOVE(b, 1);
- }
mutex_unlock(&c->bucket_lock);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index e42f1400cea9..1084b3f0dfe7 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1733,7 +1733,12 @@ static CLOSURE_CALLBACK(cache_set_flush)
mutex_unlock(&b->write_lock);
}
- if (ca->alloc_thread)
+ /*
+ * If the register_cache_set() call to bch_cache_set_alloc() failed,
+ * ca has not been assigned a value and return error.
+ * So we need check ca is not NULL during bch_cache_set_unregister().
+ */
+ if (ca && ca->alloc_thread)
kthread_stop(ca->alloc_thread);
if (c->journal.cur) {
@@ -1907,7 +1912,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
INIT_LIST_HEAD(&c->btree_cache_freed);
INIT_LIST_HEAD(&c->data_buckets);
- iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) *
+ iter_size = sizeof(struct btree_iter) +
+ ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) *
sizeof(struct btree_iter_set);
c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL);
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index e8f696cb58c0..826b14cae4e5 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -660,9 +660,7 @@ static unsigned int bch_root_usage(struct cache_set *c)
unsigned int bytes = 0;
struct bkey *k;
struct btree *b;
- struct btree_iter iter;
-
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
+ struct btree_iter_stack iter;
goto lock_root;
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 539454d8e2d0..f61ab1bada6c 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -9,7 +9,6 @@
#include <linux/kernel.h>
#include <linux/sched/clock.h>
#include <linux/llist.h>
-#include <linux/min_heap.h>
#include <linux/ratelimit.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
@@ -31,10 +30,16 @@ struct closure;
#endif
+#define DECLARE_HEAP(type, name) \
+ struct { \
+ size_t size, used; \
+ type *data; \
+ } name
+
#define init_heap(heap, _size, gfp) \
({ \
size_t _bytes; \
- (heap)->nr = 0; \
+ (heap)->used = 0; \
(heap)->size = (_size); \
_bytes = (heap)->size * sizeof(*(heap)->data); \
(heap)->data = kvmalloc(_bytes, (gfp) & GFP_KERNEL); \
@@ -47,6 +52,64 @@ do { \
(heap)->data = NULL; \
} while (0)
+#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j])
+
+#define heap_sift(h, i, cmp) \
+do { \
+ size_t _r, _j = i; \
+ \
+ for (; _j * 2 + 1 < (h)->used; _j = _r) { \
+ _r = _j * 2 + 1; \
+ if (_r + 1 < (h)->used && \
+ cmp((h)->data[_r], (h)->data[_r + 1])) \
+ _r++; \
+ \
+ if (cmp((h)->data[_r], (h)->data[_j])) \
+ break; \
+ heap_swap(h, _r, _j); \
+ } \
+} while (0)
+
+#define heap_sift_down(h, i, cmp) \
+do { \
+ while (i) { \
+ size_t p = (i - 1) / 2; \
+ if (cmp((h)->data[i], (h)->data[p])) \
+ break; \
+ heap_swap(h, i, p); \
+ i = p; \
+ } \
+} while (0)
+
+#define heap_add(h, d, cmp) \
+({ \
+ bool _r = !heap_full(h); \
+ if (_r) { \
+ size_t _i = (h)->used++; \
+ (h)->data[_i] = d; \
+ \
+ heap_sift_down(h, _i, cmp); \
+ heap_sift(h, _i, cmp); \
+ } \
+ _r; \
+})
+
+#define heap_pop(h, d, cmp) \
+({ \
+ bool _r = (h)->used; \
+ if (_r) { \
+ (d) = (h)->data[0]; \
+ (h)->used--; \
+ heap_swap(h, 0, (h)->used); \
+ heap_sift(h, 0, cmp); \
+ } \
+ _r; \
+})
+
+#define heap_peek(h) ((h)->used ? (h)->data[0] : NULL)
+
+#define heap_full(h) ((h)->used == (h)->size)
+
#define DECLARE_FIFO(type, name) \
struct { \
size_t front, back, size, mask; \
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index c1d28e365910..792e070ccf38 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -908,16 +908,15 @@ static int bch_dirty_init_thread(void *arg)
struct dirty_init_thrd_info *info = arg;
struct bch_dirty_init_state *state = info->state;
struct cache_set *c = state->c;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct bkey *k, *p;
int cur_idx, prev_idx, skip_nr;
k = p = NULL;
prev_idx = 0;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
- bch_btree_iter_init(&c->root->keys, &iter, NULL);
- k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
+ bch_btree_iter_stack_init(&c->root->keys, &iter, NULL);
+ k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad);
BUG_ON(!k);
p = k;
@@ -931,7 +930,7 @@ static int bch_dirty_init_thread(void *arg)
skip_nr = cur_idx - prev_idx;
while (skip_nr) {
- k = bch_btree_iter_next_filter(&iter,
+ k = bch_btree_iter_next_filter(&iter.iter,
&c->root->keys,
bch_ptr_bad);
if (k)
@@ -980,13 +979,11 @@ void bch_sectors_dirty_init(struct bcache_device *d)
int i;
struct btree *b = NULL;
struct bkey *k = NULL;
- struct btree_iter iter;
+ struct btree_iter_stack iter;
struct sectors_dirty_init op;
struct cache_set *c = d->c;
struct bch_dirty_init_state state;
- min_heap_init(&iter.heap, NULL, MAX_BSETS);
-
retry_lock:
b = c->root;
rw_lock(0, b, b->level);
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 23e0b71b991e..aaa21fe295f2 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -68,6 +68,8 @@
#define LIST_DIRTY 1
#define LIST_SIZE 2
+#define SCAN_RESCHED_CYCLE 16
+
/*--------------------------------------------------------------*/
/*
@@ -2414,7 +2416,12 @@ static void __scan(struct dm_bufio_client *c)
atomic_long_dec(&c->need_shrink);
freed++;
- cond_resched();
+
+ if (unlikely(freed % SCAN_RESCHED_CYCLE == 0)) {
+ dm_bufio_unlock(c);
+ cond_resched();
+ dm_bufio_lock(c);
+ }
}
}
}
@@ -2734,7 +2741,11 @@ static unsigned long __evict_many(struct dm_bufio_client *c,
__make_buffer_clean(b);
__free_buffer_wake(b);
- cond_resched();
+ if (need_resched()) {
+ dm_bufio_unlock(c);
+ cond_resched();
+ dm_bufio_lock(c);
+ }
}
return count;
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 849eb6333e98..6aa4095dc587 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2899,6 +2899,27 @@ static dm_cblock_t get_cache_dev_size(struct cache *cache)
return to_cblock(size);
}
+static bool can_resume(struct cache *cache)
+{
+ /*
+ * Disallow retrying the resume operation for devices that failed the
+ * first resume attempt, as the failure leaves the policy object partially
+ * initialized. Retrying could trigger BUG_ON when loading cache mappings
+ * into the incomplete policy object.
+ */
+ if (cache->sized && !cache->loaded_mappings) {
+ if (get_cache_mode(cache) != CM_WRITE)
+ DMERR("%s: unable to resume a failed-loaded cache, please check metadata.",
+ cache_device_name(cache));
+ else
+ DMERR("%s: unable to resume cache due to missing proper cache table reload",
+ cache_device_name(cache));
+ return false;
+ }
+
+ return true;
+}
+
static bool can_resize(struct cache *cache, dm_cblock_t new_size)
{
if (from_cblock(new_size) > from_cblock(cache->cache_size)) {
@@ -2947,6 +2968,9 @@ static int cache_preresume(struct dm_target *ti)
struct cache *cache = ti->private;
dm_cblock_t csize = get_cache_dev_size(cache);
+ if (!can_resume(cache))
+ return -EINVAL;
+
/*
* Check to see if the cache has resized.
*/
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 3637761f3585..f3a3f2ef6322 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -141,6 +141,7 @@ struct mapped_device {
#ifdef CONFIG_BLK_DEV_ZONED
unsigned int nr_zones;
void *zone_revalidate_map;
+ struct task_struct *revalidate_map_task;
#endif
#ifdef CONFIG_IMA
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 1ae2c71bb383..78c975d7cd5f 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -59,6 +59,7 @@ struct convert_context {
struct bio *bio_out;
struct bvec_iter iter_out;
atomic_t cc_pending;
+ unsigned int tag_offset;
u64 cc_sector;
union {
struct skcipher_request *req;
@@ -1256,6 +1257,7 @@ static void crypt_convert_init(struct crypt_config *cc,
if (bio_out)
ctx->iter_out = bio_out->bi_iter;
ctx->cc_sector = sector + cc->iv_offset;
+ ctx->tag_offset = 0;
init_completion(&ctx->restart);
}
@@ -1588,7 +1590,6 @@ static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_
static blk_status_t crypt_convert(struct crypt_config *cc,
struct convert_context *ctx, bool atomic, bool reset_pending)
{
- unsigned int tag_offset = 0;
unsigned int sector_step = cc->sector_size >> SECTOR_SHIFT;
int r;
@@ -1611,9 +1612,9 @@ static blk_status_t crypt_convert(struct crypt_config *cc,
atomic_inc(&ctx->cc_pending);
if (crypt_integrity_aead(cc))
- r = crypt_convert_block_aead(cc, ctx, ctx->r.req_aead, tag_offset);
+ r = crypt_convert_block_aead(cc, ctx, ctx->r.req_aead, ctx->tag_offset);
else
- r = crypt_convert_block_skcipher(cc, ctx, ctx->r.req, tag_offset);
+ r = crypt_convert_block_skcipher(cc, ctx, ctx->r.req, ctx->tag_offset);
switch (r) {
/*
@@ -1633,8 +1634,8 @@ static blk_status_t crypt_convert(struct crypt_config *cc,
* exit and continue processing in a workqueue
*/
ctx->r.req = NULL;
+ ctx->tag_offset++;
ctx->cc_sector += sector_step;
- tag_offset++;
return BLK_STS_DEV_RESOURCE;
}
} else {
@@ -1648,8 +1649,8 @@ static blk_status_t crypt_convert(struct crypt_config *cc,
*/
case -EINPROGRESS:
ctx->r.req = NULL;
+ ctx->tag_offset++;
ctx->cc_sector += sector_step;
- tag_offset++;
continue;
/*
* The request was already processed (synchronously).
@@ -1657,7 +1658,7 @@ static blk_status_t crypt_convert(struct crypt_config *cc,
case 0:
atomic_dec(&ctx->cc_pending);
ctx->cc_sector += sector_step;
- tag_offset++;
+ ctx->tag_offset++;
if (!atomic)
cond_resched();
continue;
@@ -2092,7 +2093,6 @@ static void kcryptd_crypt_write_continue(struct work_struct *work)
struct crypt_config *cc = io->cc;
struct convert_context *ctx = &io->ctx;
int crypt_finished;
- sector_t sector = io->sector;
blk_status_t r;
wait_for_completion(&ctx->restart);
@@ -2109,10 +2109,8 @@ static void kcryptd_crypt_write_continue(struct work_struct *work)
}
/* Encryption was already finished, submit io now */
- if (crypt_finished) {
+ if (crypt_finished)
kcryptd_crypt_write_io_submit(io, 0);
- io->sector = sector;
- }
crypt_dec_pending(io);
}
@@ -2123,14 +2121,13 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
struct convert_context *ctx = &io->ctx;
struct bio *clone;
int crypt_finished;
- sector_t sector = io->sector;
blk_status_t r;
/*
* Prevent io from disappearing until this function completes.
*/
crypt_inc_pending(io);
- crypt_convert_init(cc, ctx, NULL, io->base_bio, sector);
+ crypt_convert_init(cc, ctx, NULL, io->base_bio, io->sector);
clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size);
if (unlikely(!clone)) {
@@ -2147,8 +2144,6 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
io->ctx.iter_in = clone->bi_iter;
}
- sector += bio_sectors(clone);
-
crypt_inc_pending(io);
r = crypt_convert(cc, ctx,
test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags), true);
@@ -2172,10 +2167,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
}
/* Encryption was already finished, submit io now */
- if (crypt_finished) {
+ if (crypt_finished)
kcryptd_crypt_write_io_submit(io, 0);
- io->sector = sector;
- }
dec:
crypt_dec_pending(io);
diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c
index 18ae45dcbfb2..b19b0142a690 100644
--- a/drivers/md/dm-ebs-target.c
+++ b/drivers/md/dm-ebs-target.c
@@ -390,6 +390,12 @@ static int ebs_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
}
+static void ebs_postsuspend(struct dm_target *ti)
+{
+ struct ebs_c *ec = ti->private;
+ dm_bufio_client_reset(ec->bufio);
+}
+
static void ebs_status(struct dm_target *ti, status_type_t type,
unsigned int status_flags, char *result, unsigned int maxlen)
{
@@ -447,6 +453,7 @@ static struct target_type ebs_target = {
.ctr = ebs_ctr,
.dtr = ebs_dtr,
.map = ebs_map,
+ .postsuspend = ebs_postsuspend,
.status = ebs_status,
.io_hints = ebs_io_hints,
.prepare_ioctl = ebs_prepare_ioctl,
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 731467d4ed10..347881f323d5 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -47,14 +47,15 @@ enum feature_flag_bits {
};
struct per_bio_data {
- bool bio_submitted;
+ bool bio_can_corrupt;
+ struct bvec_iter saved_iter;
};
static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
struct dm_target *ti)
{
- int r;
- unsigned int argc;
+ int r = 0;
+ unsigned int argc = 0;
const char *arg_name;
static const struct dm_arg _args[] = {
@@ -65,14 +66,13 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
{0, PROBABILITY_BASE, "Invalid random corrupt argument"},
};
- /* No feature arguments supplied. */
- if (!as->argc)
- return 0;
-
- r = dm_read_arg_group(_args, as, &argc, &ti->error);
- if (r)
+ if (as->argc && (r = dm_read_arg_group(_args, as, &argc, &ti->error)))
return r;
+ /* No feature arguments supplied. */
+ if (!argc)
+ goto error_all_io;
+
while (argc) {
arg_name = dm_shift_arg(as);
argc--;
@@ -217,6 +217,7 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
if (!fc->corrupt_bio_byte && !test_bit(ERROR_READS, &fc->flags) &&
!test_bit(DROP_WRITES, &fc->flags) && !test_bit(ERROR_WRITES, &fc->flags) &&
!fc->random_read_corrupt && !fc->random_write_corrupt) {
+error_all_io:
set_bit(ERROR_WRITES, &fc->flags);
set_bit(ERROR_READS, &fc->flags);
}
@@ -339,7 +340,8 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
}
static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte,
- unsigned char corrupt_bio_value)
+ unsigned char corrupt_bio_value,
+ struct bvec_iter start)
{
struct bvec_iter iter;
struct bio_vec bvec;
@@ -348,7 +350,7 @@ static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte,
* Overwrite the Nth byte of the bio's data, on whichever page
* it falls.
*/
- bio_for_each_segment(bvec, bio, iter) {
+ __bio_for_each_segment(bvec, bio, iter, start) {
if (bio_iter_len(bio, iter) > corrupt_bio_byte) {
unsigned char *segment = bvec_kmap_local(&bvec);
segment[corrupt_bio_byte] = corrupt_bio_value;
@@ -357,36 +359,31 @@ static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte,
"(rw=%c bi_opf=%u bi_sector=%llu size=%u)\n",
bio, corrupt_bio_value, corrupt_bio_byte,
(bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_opf,
- (unsigned long long)bio->bi_iter.bi_sector,
- bio->bi_iter.bi_size);
+ (unsigned long long)start.bi_sector,
+ start.bi_size);
break;
}
corrupt_bio_byte -= bio_iter_len(bio, iter);
}
}
-static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
+static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc,
+ struct bvec_iter start)
{
unsigned int corrupt_bio_byte = fc->corrupt_bio_byte - 1;
- if (!bio_has_data(bio))
- return;
-
- corrupt_bio_common(bio, corrupt_bio_byte, fc->corrupt_bio_value);
+ corrupt_bio_common(bio, corrupt_bio_byte, fc->corrupt_bio_value, start);
}
-static void corrupt_bio_random(struct bio *bio)
+static void corrupt_bio_random(struct bio *bio, struct bvec_iter start)
{
unsigned int corrupt_byte;
unsigned char corrupt_value;
- if (!bio_has_data(bio))
- return;
-
- corrupt_byte = get_random_u32() % bio->bi_iter.bi_size;
+ corrupt_byte = get_random_u32() % start.bi_size;
corrupt_value = get_random_u8();
- corrupt_bio_common(bio, corrupt_byte, corrupt_value);
+ corrupt_bio_common(bio, corrupt_byte, corrupt_value, start);
}
static void clone_free(struct bio *clone)
@@ -426,7 +423,7 @@ static struct bio *clone_bio(struct dm_target *ti, struct flakey_c *fc, struct b
if (!clone)
return NULL;
- bio_init(clone, fc->dev->bdev, bio->bi_inline_vecs, nr_iovecs, bio->bi_opf);
+ bio_init(clone, fc->dev->bdev, clone->bi_inline_vecs, nr_iovecs, bio->bi_opf);
clone->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector);
clone->bi_private = bio;
@@ -481,7 +478,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
unsigned int elapsed;
struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
- pb->bio_submitted = false;
+ pb->bio_can_corrupt = false;
if (op_is_zone_mgmt(bio_op(bio)))
goto map_bio;
@@ -490,10 +487,11 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
elapsed = (jiffies - fc->start_time) / HZ;
if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) {
bool corrupt_fixed, corrupt_random;
- /*
- * Flag this bio as submitted while down.
- */
- pb->bio_submitted = true;
+
+ if (bio_has_data(bio)) {
+ pb->bio_can_corrupt = true;
+ pb->saved_iter = bio->bi_iter;
+ }
/*
* Error reads if neither corrupt_bio_byte or drop_writes or error_writes are set.
@@ -516,6 +514,8 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_SUBMITTED;
}
+ if (!pb->bio_can_corrupt)
+ goto map_bio;
/*
* Corrupt matching writes.
*/
@@ -535,9 +535,11 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
struct bio *clone = clone_bio(ti, fc, bio);
if (clone) {
if (corrupt_fixed)
- corrupt_bio_data(clone, fc);
+ corrupt_bio_data(clone, fc,
+ clone->bi_iter);
if (corrupt_random)
- corrupt_bio_random(clone);
+ corrupt_bio_random(clone,
+ clone->bi_iter);
submit_bio(clone);
return DM_MAPIO_SUBMITTED;
}
@@ -559,21 +561,21 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio,
if (op_is_zone_mgmt(bio_op(bio)))
return DM_ENDIO_DONE;
- if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) {
+ if (!*error && pb->bio_can_corrupt && (bio_data_dir(bio) == READ)) {
if (fc->corrupt_bio_byte) {
if ((fc->corrupt_bio_rw == READ) &&
all_corrupt_bio_flags_match(bio, fc)) {
/*
* Corrupt successful matching READs while in down state.
*/
- corrupt_bio_data(bio, fc);
+ corrupt_bio_data(bio, fc, pb->saved_iter);
}
}
if (fc->random_read_corrupt) {
u64 rnd = get_random_u64();
u32 rem = do_div(rnd, PROBABILITY_BASE);
if (rem < fc->random_read_corrupt)
- corrupt_bio_random(bio);
+ corrupt_bio_random(bio, pb->saved_iter);
}
if (test_bit(ERROR_READS, &fc->flags)) {
/*
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index ee9f7cecd78e..450e1a7e7bac 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -21,6 +21,7 @@
#include <linux/reboot.h>
#include <crypto/hash.h>
#include <crypto/skcipher.h>
+#include <crypto/utils.h>
#include <linux/async_tx.h>
#include <linux/dm-bufio.h>
@@ -516,7 +517,7 @@ static int sb_mac(struct dm_integrity_c *ic, bool wr)
dm_integrity_io_error(ic, "crypto_shash_digest", r);
return r;
}
- if (memcmp(mac, actual_mac, mac_size)) {
+ if (crypto_memneq(mac, actual_mac, mac_size)) {
dm_integrity_io_error(ic, "superblock mac", -EILSEQ);
dm_audit_log_target(DM_MSG_PREFIX, "mac-superblock", ic->ti, 0);
return -EILSEQ;
@@ -859,7 +860,7 @@ static void rw_section_mac(struct dm_integrity_c *ic, unsigned int section, bool
if (likely(wr))
memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR);
else {
- if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) {
+ if (crypto_memneq(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) {
dm_integrity_io_error(ic, "journal mac", -EILSEQ);
dm_audit_log_target(DM_MSG_PREFIX, "mac-journal", ic->ti, 0);
}
@@ -1401,10 +1402,9 @@ static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_
static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block,
unsigned int *metadata_offset, unsigned int total_size, int op)
{
-#define MAY_BE_FILLER 1
-#define MAY_BE_HASH 2
unsigned int hash_offset = 0;
- unsigned int may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);
+ unsigned char mismatch_hash = 0;
+ unsigned char mismatch_filler = !ic->discard;
do {
unsigned char *data, *dp;
@@ -1425,7 +1425,7 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
if (op == TAG_READ) {
memcpy(tag, dp, to_copy);
} else if (op == TAG_WRITE) {
- if (memcmp(dp, tag, to_copy)) {
+ if (crypto_memneq(dp, tag, to_copy)) {
memcpy(dp, tag, to_copy);
dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy);
}
@@ -1433,29 +1433,30 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se
/* e.g.: op == TAG_CMP */
if (likely(is_power_of_2(ic->tag_size))) {
- if (unlikely(memcmp(dp, tag, to_copy)))
- if (unlikely(!ic->discard) ||
- unlikely(memchr_inv(dp, DISCARD_FILLER, to_copy) != NULL)) {
- goto thorough_test;
- }
+ if (unlikely(crypto_memneq(dp, tag, to_copy)))
+ goto thorough_test;
} else {
unsigned int i, ts;
thorough_test:
ts = total_size;
for (i = 0; i < to_copy; i++, ts--) {
- if (unlikely(dp[i] != tag[i]))
- may_be &= ~MAY_BE_HASH;
- if (likely(dp[i] != DISCARD_FILLER))
- may_be &= ~MAY_BE_FILLER;
+ /*
+ * Warning: the control flow must not be
+ * dependent on match/mismatch of
+ * individual bytes.
+ */
+ mismatch_hash |= dp[i] ^ tag[i];
+ mismatch_filler |= dp[i] ^ DISCARD_FILLER;
hash_offset++;
if (unlikely(hash_offset == ic->tag_size)) {
- if (unlikely(!may_be)) {
+ if (unlikely(mismatch_hash) && unlikely(mismatch_filler)) {
dm_bufio_release(b);
return ts;
}
hash_offset = 0;
- may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0);
+ mismatch_hash = 0;
+ mismatch_filler = !ic->discard;
}
}
}
@@ -1476,8 +1477,6 @@ thorough_test:
} while (unlikely(total_size));
return 0;
-#undef MAY_BE_FILLER
-#undef MAY_BE_HASH
}
struct flush_request {
@@ -2076,7 +2075,7 @@ retry_kmap:
char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
- if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
+ if (unlikely(crypto_memneq(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx",
logical_sector);
dm_audit_log_bio(DM_MSG_PREFIX, "journal-checksum",
@@ -2595,7 +2594,7 @@ static void dm_integrity_inline_recheck(struct work_struct *w)
bio_put(outgoing_bio);
integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, outgoing_data, digest);
- if (unlikely(memcmp(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) {
+ if (unlikely(crypto_memneq(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) {
DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx",
ic->dev->bdev, dio->bio_details.bi_iter.bi_sector);
atomic64_inc(&ic->number_of_mismatches);
@@ -2634,7 +2633,7 @@ static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status
char *mem = bvec_kmap_local(&bv);
//memset(mem, 0xff, ic->sectors_per_block << SECTOR_SHIFT);
integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, digest);
- if (unlikely(memcmp(digest, dio->integrity_payload + pos,
+ if (unlikely(crypto_memneq(digest, dio->integrity_payload + pos,
min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) {
kunmap_local(mem);
dm_integrity_free_payload(dio);
@@ -2911,7 +2910,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned int write_start
integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block),
(char *)access_journal_data(ic, i, l), test_tag);
- if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) {
+ if (unlikely(crypto_memneq(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) {
dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ);
dm_audit_log_target(DM_MSG_PREFIX, "integrity-replay-journal", ic->ti, 0);
}
@@ -3790,10 +3789,6 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
break;
case STATUSTYPE_TABLE: {
- __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100;
-
- watermark_percentage += ic->journal_entries / 2;
- do_div(watermark_percentage, ic->journal_entries);
arg_count = 3;
arg_count += !!ic->meta_dev;
arg_count += ic->sectors_per_block != 1;
@@ -3826,6 +3821,10 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type,
DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors);
DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors);
if (ic->mode == 'J') {
+ __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100;
+
+ watermark_percentage += ic->journal_entries / 2;
+ do_div(watermark_percentage, ic->journal_entries);
DMEMIT(" journal_watermark:%u", (unsigned int)watermark_percentage);
DMEMIT(" commit_time:%u", ic->autocommit_msec);
}
@@ -5081,16 +5080,19 @@ try_smaller_buffer:
ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
if (!ic->recalc_bitmap) {
+ ti->error = "Could not allocate memory for bitmap";
r = -ENOMEM;
goto bad;
}
ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages);
if (!ic->may_write_bitmap) {
+ ti->error = "Could not allocate memory for bitmap";
r = -ENOMEM;
goto bad;
}
ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL);
if (!ic->bbs) {
+ ti->error = "Could not allocate memory for bitmap";
r = -ENOMEM;
goto bad;
}
@@ -5171,7 +5173,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
BUG_ON(!list_empty(&ic->wait_list));
- if (ic->mode == 'B')
+ if (ic->mode == 'B' && ic->bitmap_flush_work.work.func)
cancel_delayed_work_sync(&ic->bitmap_flush_work);
if (ic->metadata_wq)
destroy_workqueue(ic->metadata_wq);
diff --git a/drivers/md/dm-ps-historical-service-time.c b/drivers/md/dm-ps-historical-service-time.c
index b49e10d76d03..2c8626a83de4 100644
--- a/drivers/md/dm-ps-historical-service-time.c
+++ b/drivers/md/dm-ps-historical-service-time.c
@@ -541,8 +541,10 @@ static int __init dm_hst_init(void)
{
int r = dm_register_path_selector(&hst_ps);
- if (r < 0)
+ if (r < 0) {
DMERR("register failed %d", r);
+ return r;
+ }
DMINFO("version " HST_VERSION " loaded");
diff --git a/drivers/md/dm-ps-queue-length.c b/drivers/md/dm-ps-queue-length.c
index e305f05ad1e5..eb543e6431e0 100644
--- a/drivers/md/dm-ps-queue-length.c
+++ b/drivers/md/dm-ps-queue-length.c
@@ -260,8 +260,10 @@ static int __init dm_ql_init(void)
{
int r = dm_register_path_selector(&ql_ps);
- if (r < 0)
+ if (r < 0) {
DMERR("register failed %d", r);
+ return r;
+ }
DMINFO("version " QL_VERSION " loaded");
diff --git a/drivers/md/dm-ps-round-robin.c b/drivers/md/dm-ps-round-robin.c
index d1745b123dc1..66a15ac0c22c 100644
--- a/drivers/md/dm-ps-round-robin.c
+++ b/drivers/md/dm-ps-round-robin.c
@@ -220,8 +220,10 @@ static int __init dm_rr_init(void)
{
int r = dm_register_path_selector(&rr_ps);
- if (r < 0)
+ if (r < 0) {
DMERR("register failed %d", r);
+ return r;
+ }
DMINFO("version " RR_VERSION " loaded");
diff --git a/drivers/md/dm-ps-service-time.c b/drivers/md/dm-ps-service-time.c
index 969d31c40272..f8c43aecdb27 100644
--- a/drivers/md/dm-ps-service-time.c
+++ b/drivers/md/dm-ps-service-time.c
@@ -341,8 +341,10 @@ static int __init dm_st_init(void)
{
int r = dm_register_path_selector(&st_ps);
- if (r < 0)
+ if (r < 0) {
DMERR("register failed %d", r);
+ return r;
+ }
DMINFO("version " ST_VERSION " loaded");
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 1e0d3b9b75d6..163a5bbd485f 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2410,7 +2410,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
*/
sb_retrieve_failed_devices(sb, failed_devices);
rdev_for_each(r, mddev) {
- if (test_bit(Journal, &rdev->flags) ||
+ if (test_bit(Journal, &r->flags) ||
!r->sb_page)
continue;
sb2 = page_address(r->sb_page);
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9511dae5b556..94b6c43dfa5c 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -133,10 +133,9 @@ static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
spin_lock_irqsave(&ms->lock, flags);
should_wake = !(bl->head);
bio_list_add(bl, bio);
- spin_unlock_irqrestore(&ms->lock, flags);
-
if (should_wake)
wakeup_mirrord(ms);
+ spin_unlock_irqrestore(&ms->lock, flags);
}
static void dispatch_bios(void *context, struct bio_list *bio_list)
@@ -646,9 +645,9 @@ static void write_callback(unsigned long error, void *context)
if (!ms->failures.head)
should_wake = 1;
bio_list_add(&ms->failures, bio);
- spin_unlock_irqrestore(&ms->lock, flags);
if (should_wake)
wakeup_mirrord(ms);
+ spin_unlock_irqrestore(&ms->lock, flags);
}
static void do_write(struct mirror_set *ms, struct bio *bio)
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 4112071de0be..c68dc1653cfd 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -459,6 +459,7 @@ static void stripe_io_hints(struct dm_target *ti,
struct stripe_c *sc = ti->private;
unsigned int chunk_size = sc->chunk_size << SECTOR_SHIFT;
+ limits->chunk_sectors = sc->chunk_size;
limits->io_min = chunk_size;
limits->io_opt = chunk_size * sc->stripes;
}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index dbd39b9722b9..20b8f560a2da 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -431,6 +431,7 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
return 0;
}
+ mutex_lock(&q->limits_lock);
if (blk_stack_limits(limits, &q->limits,
get_start_sect(bdev) + start) < 0)
DMWARN("%s: adding target device %pg caused an alignment inconsistency: "
@@ -448,6 +449,7 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
*/
if (!dm_target_has_integrity(ti->type))
queue_limits_stack_integrity_bdev(limits, bdev);
+ mutex_unlock(&q->limits_lock);
return 0;
}
@@ -523,8 +525,9 @@ static char **realloc_argv(unsigned int *size, char **old_argv)
gfp = GFP_NOIO;
}
argv = kmalloc_array(new_size, sizeof(*argv), gfp);
- if (argv && old_argv) {
- memcpy(argv, old_argv, *size * sizeof(*argv));
+ if (argv) {
+ if (old_argv)
+ memcpy(argv, old_argv, *size * sizeof(*argv));
*size = new_size;
}
@@ -697,6 +700,10 @@ int dm_table_add_target(struct dm_table *t, const char *type,
DMERR("%s: zero-length target", dm_device_name(t->md));
return -EINVAL;
}
+ if (start + len < start || start + len > LLONG_MAX >> SECTOR_SHIFT) {
+ DMERR("%s: too large device", dm_device_name(t->md));
+ return -EINVAL;
+ }
ti->type = dm_get_target_type(type);
if (!ti->type) {
@@ -887,17 +894,17 @@ static bool dm_table_supports_dax(struct dm_table *t,
return true;
}
-static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
+static int device_is_not_rq_stackable(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
{
struct block_device *bdev = dev->bdev;
struct request_queue *q = bdev_get_queue(bdev);
/* request-based cannot stack on partitions! */
if (bdev_is_partition(bdev))
- return false;
+ return true;
- return queue_is_mq(q);
+ return !queue_is_mq(q);
}
static int dm_table_determine_type(struct dm_table *t)
@@ -993,7 +1000,7 @@ verify_rq_based:
/* Non-request-stackable devices can't be used for request-based dm */
if (!ti->type->iterate_devices ||
- !ti->type->iterate_devices(ti, device_is_rq_stackable, NULL)) {
+ ti->type->iterate_devices(ti, device_is_not_rq_stackable, NULL)) {
DMERR("table load rejected: including non-request-stackable devices");
return -EINVAL;
}
@@ -1182,7 +1189,7 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile,
t = dm_get_live_table(md, &srcu_idx);
if (!t)
- return 0;
+ goto put_live_table;
for (unsigned int i = 0; i < t->num_targets; i++) {
struct dm_target *ti = dm_table_get_target(t, i);
@@ -1193,6 +1200,7 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile,
(void *)key);
}
+put_live_table:
dm_put_live_table(md, srcu_idx);
return 0;
}
@@ -1728,8 +1736,12 @@ static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *
sector_t start, sector_t len, void *data)
{
struct request_queue *q = bdev_get_queue(dev->bdev);
+ int b;
- return !q->limits.max_write_zeroes_sectors;
+ mutex_lock(&q->limits_lock);
+ b = !q->limits.max_write_zeroes_sectors;
+ mutex_unlock(&q->limits_lock);
+ return b;
}
static bool dm_table_supports_write_zeroes(struct dm_table *t)
diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c
index 80628ae93fba..5a74b3a85ec4 100644
--- a/drivers/md/dm-vdo/dedupe.c
+++ b/drivers/md/dm-vdo/dedupe.c
@@ -2178,6 +2178,7 @@ static int initialize_index(struct vdo *vdo, struct hash_zones *zones)
vdo_set_dedupe_index_timeout_interval(vdo_dedupe_index_timeout_interval);
vdo_set_dedupe_index_min_timer_interval(vdo_dedupe_index_min_timer_interval);
+ spin_lock_init(&zones->lock);
/*
* Since we will save up the timeouts that would have been reported but were ratelimited,
diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c
index 627adc24af3b..053b7845d1f3 100644
--- a/drivers/md/dm-vdo/indexer/index-layout.c
+++ b/drivers/md/dm-vdo/indexer/index-layout.c
@@ -54,7 +54,6 @@
* Each save also has a unique nonce.
*/
-#define MAGIC_SIZE 32
#define NONCE_INFO_SIZE 32
#define MAX_SAVES 2
@@ -98,9 +97,11 @@ enum region_type {
#define SUPER_VERSION_CURRENT 3
#define SUPER_VERSION_MAXIMUM 7
-static const u8 LAYOUT_MAGIC[MAGIC_SIZE] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*";
+static const u8 LAYOUT_MAGIC[] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*";
static const u64 REGION_MAGIC = 0x416c6252676e3031; /* 'AlbRgn01' */
+#define MAGIC_SIZE (sizeof(LAYOUT_MAGIC) - 1)
+
struct region_header {
u64 magic;
u64 region_blocks;
diff --git a/drivers/md/dm-vdo/indexer/volume.c b/drivers/md/dm-vdo/indexer/volume.c
index 655453bb276b..425b3a74f4db 100644
--- a/drivers/md/dm-vdo/indexer/volume.c
+++ b/drivers/md/dm-vdo/indexer/volume.c
@@ -754,10 +754,11 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request *
u32 physical_page, struct cached_page **page_ptr)
{
struct cached_page *page;
+ unsigned int zone_number = request->zone_number;
get_page_from_cache(&volume->page_cache, physical_page, &page);
if (page != NULL) {
- if (request->zone_number == 0) {
+ if (zone_number == 0) {
/* Only one zone is allowed to update the LRU. */
make_page_most_recent(&volume->page_cache, page);
}
@@ -767,7 +768,7 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request *
}
/* Prepare to enqueue a read for the page. */
- end_pending_search(&volume->page_cache, request->zone_number);
+ end_pending_search(&volume->page_cache, zone_number);
mutex_lock(&volume->read_threads_mutex);
/*
@@ -787,8 +788,7 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request *
* the order does not matter for correctness as it does below.
*/
mutex_unlock(&volume->read_threads_mutex);
- begin_pending_search(&volume->page_cache, physical_page,
- request->zone_number);
+ begin_pending_search(&volume->page_cache, physical_page, zone_number);
return UDS_QUEUED;
}
@@ -797,7 +797,7 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request *
* "search pending" state in careful order so no other thread can mess with the data before
* the caller gets to look at it.
*/
- begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
+ begin_pending_search(&volume->page_cache, physical_page, zone_number);
mutex_unlock(&volume->read_threads_mutex);
*page_ptr = page;
return UDS_SUCCESS;
@@ -849,6 +849,7 @@ static int search_cached_index_page(struct volume *volume, struct uds_request *r
{
int result;
struct cached_page *page = NULL;
+ unsigned int zone_number = request->zone_number;
u32 physical_page = map_to_physical_page(volume->geometry, chapter,
index_page_number);
@@ -858,18 +859,18 @@ static int search_cached_index_page(struct volume *volume, struct uds_request *r
* invalidation by the reader thread, before the reader thread has noticed that the
* invalidate_counter has been incremented.
*/
- begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
+ begin_pending_search(&volume->page_cache, physical_page, zone_number);
result = get_volume_page_protected(volume, request, physical_page, &page);
if (result != UDS_SUCCESS) {
- end_pending_search(&volume->page_cache, request->zone_number);
+ end_pending_search(&volume->page_cache, zone_number);
return result;
}
result = uds_search_chapter_index_page(&page->index_page, volume->geometry,
&request->record_name,
record_page_number);
- end_pending_search(&volume->page_cache, request->zone_number);
+ end_pending_search(&volume->page_cache, zone_number);
return result;
}
@@ -882,6 +883,7 @@ int uds_search_cached_record_page(struct volume *volume, struct uds_request *req
{
struct cached_page *record_page;
struct index_geometry *geometry = volume->geometry;
+ unsigned int zone_number = request->zone_number;
int result;
u32 physical_page, page_number;
@@ -905,11 +907,11 @@ int uds_search_cached_record_page(struct volume *volume, struct uds_request *req
* invalidation by the reader thread, before the reader thread has noticed that the
* invalidate_counter has been incremented.
*/
- begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
+ begin_pending_search(&volume->page_cache, physical_page, zone_number);
result = get_volume_page_protected(volume, request, physical_page, &record_page);
if (result != UDS_SUCCESS) {
- end_pending_search(&volume->page_cache, request->zone_number);
+ end_pending_search(&volume->page_cache, zone_number);
return result;
}
@@ -917,7 +919,7 @@ int uds_search_cached_record_page(struct volume *volume, struct uds_request *req
&request->record_name, geometry, &request->old_metadata))
*found = true;
- end_pending_search(&volume->page_cache, request->zone_number);
+ end_pending_search(&volume->page_cache, zone_number);
return UDS_SUCCESS;
}
diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c
index fff847767755..b897f88250d2 100644
--- a/drivers/md/dm-vdo/vdo.c
+++ b/drivers/md/dm-vdo/vdo.c
@@ -31,9 +31,7 @@
#include <linux/completion.h>
#include <linux/device-mapper.h>
-#include <linux/kernel.h>
#include <linux/lz4.h>
-#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/types.h>
@@ -142,12 +140,6 @@ static void finish_vdo_request_queue(void *ptr)
vdo_unregister_allocating_thread();
}
-#ifdef MODULE
-#define MODULE_NAME THIS_MODULE->name
-#else
-#define MODULE_NAME "dm-vdo"
-#endif /* MODULE */
-
static const struct vdo_work_queue_type default_queue_type = {
.start = start_vdo_request_queue,
.finish = finish_vdo_request_queue,
@@ -559,8 +551,7 @@ int vdo_make(unsigned int instance, struct device_config *config, char **reason,
*vdo_ptr = vdo;
snprintf(vdo->thread_name_prefix, sizeof(vdo->thread_name_prefix),
- "%s%u", MODULE_NAME, instance);
- BUG_ON(vdo->thread_name_prefix[0] == '\0');
+ "vdo%u", instance);
result = vdo_allocate(vdo->thread_config.thread_count,
struct vdo_thread, __func__, &vdo->threads);
if (result != VDO_SUCCESS) {
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 6bd9848518d4..559b8179ac50 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -604,6 +604,10 @@ int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
(*argc)--;
if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV)) {
+ if (v->fec->dev) {
+ ti->error = "FEC device already specified";
+ return -EINVAL;
+ }
r = dm_get_device(ti, arg_value, BLK_OPEN_READ, &v->fec->dev);
if (r) {
ti->error = "FEC device lookup failed";
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index c142ec5458b7..ce0462e751a6 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -796,6 +796,13 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_SUBMITTED;
}
+static void verity_postsuspend(struct dm_target *ti)
+{
+ struct dm_verity *v = ti->private;
+ flush_workqueue(v->verify_wq);
+ dm_bufio_client_reset(v->bufio);
+}
+
/*
* Status: V (valid) or C (corruption found)
*/
@@ -1073,6 +1080,9 @@ static int verity_alloc_most_once(struct dm_verity *v)
{
struct dm_target *ti = v->ti;
+ if (v->validated_blocks)
+ return 0;
+
/* the bitset can only handle INT_MAX blocks */
if (v->data_blocks > INT_MAX) {
ti->error = "device too large to use check_at_most_once";
@@ -1096,6 +1106,9 @@ static int verity_alloc_zero_digest(struct dm_verity *v)
struct dm_verity_io *io;
u8 *zero_data;
+ if (v->zero_digest)
+ return 0;
+
v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL);
if (!v->zero_digest)
@@ -1530,7 +1543,7 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
- /* Root hash signature is a optional parameter*/
+ /* Root hash signature is an optional parameter */
r = verity_verify_root_hash(root_hash_digest_to_validate,
strlen(root_hash_digest_to_validate),
verify_args.sig,
@@ -1766,6 +1779,7 @@ static struct target_type verity_target = {
.ctr = verity_ctr,
.dtr = verity_dtr,
.map = verity_map,
+ .postsuspend = verity_postsuspend,
.status = verity_status,
.prepare_ioctl = verity_prepare_ioctl,
.iterate_devices = verity_iterate_devices,
diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c
index a9e2c6c0a33c..d5261a0e4232 100644
--- a/drivers/md/dm-verity-verify-sig.c
+++ b/drivers/md/dm-verity-verify-sig.c
@@ -71,9 +71,14 @@ int verity_verify_sig_parse_opt_args(struct dm_arg_set *as,
const char *arg_name)
{
struct dm_target *ti = v->ti;
- int ret = 0;
+ int ret;
const char *sig_key = NULL;
+ if (v->signature_key_desc) {
+ ti->error = DM_VERITY_VERIFY_ERR("root_hash_sig_key_desc already specified");
+ return -EINVAL;
+ }
+
if (!*argc) {
ti->error = DM_VERITY_VERIFY_ERR("Signature key not specified");
return -EINVAL;
@@ -83,14 +88,18 @@ int verity_verify_sig_parse_opt_args(struct dm_arg_set *as,
(*argc)--;
ret = verity_verify_get_sig_from_key(sig_key, sig_opts);
- if (ret < 0)
+ if (ret < 0) {
ti->error = DM_VERITY_VERIFY_ERR("Invalid key specified");
+ return ret;
+ }
v->signature_key_desc = kstrdup(sig_key, GFP_KERNEL);
- if (!v->signature_key_desc)
+ if (!v->signature_key_desc) {
+ ti->error = DM_VERITY_VERIFY_ERR("Could not allocate memory for signature key");
return -ENOMEM;
+ }
- return ret;
+ return 0;
}
/*
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index c0d41c36e06e..04cc36a9d5ca 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -56,24 +56,31 @@ int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
{
struct mapped_device *md = disk->private_data;
struct dm_table *map;
- int srcu_idx, ret;
+ struct dm_table *zone_revalidate_map = md->zone_revalidate_map;
+ int srcu_idx, ret = -EIO;
+ bool put_table = false;
- if (!md->zone_revalidate_map) {
- /* Regular user context */
+ if (!zone_revalidate_map || md->revalidate_map_task != current) {
+ /*
+ * Regular user context or
+ * Zone revalidation during __bind() is in progress, but this
+ * call is from a different process
+ */
if (dm_suspended_md(md))
return -EAGAIN;
map = dm_get_live_table(md, &srcu_idx);
- if (!map)
- return -EIO;
+ put_table = true;
} else {
/* Zone revalidation during __bind() */
- map = md->zone_revalidate_map;
+ map = zone_revalidate_map;
}
- ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data);
+ if (map)
+ ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb,
+ data);
- if (!md->zone_revalidate_map)
+ if (put_table)
dm_put_live_table(md, srcu_idx);
return ret;
@@ -175,7 +182,9 @@ int dm_revalidate_zones(struct dm_table *t, struct request_queue *q)
* our table for dm_blk_report_zones() to use directly.
*/
md->zone_revalidate_map = t;
+ md->revalidate_map_task = current;
ret = blk_revalidate_disk_zones(disk);
+ md->revalidate_map_task = NULL;
md->zone_revalidate_map = NULL;
if (ret) {
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 6141fc25d842..c38bd6e4c273 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -1061,7 +1061,7 @@ static int dmz_iterate_devices(struct dm_target *ti,
struct dmz_target *dmz = ti->private;
unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata);
sector_t capacity;
- int i, r;
+ int i, r = 0;
for (i = 0; i < dmz->nr_ddevs; i++) {
capacity = dmz->dev[i].capacity & ~(zone_nr_sectors - 1);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 19230404d8c2..c5dcd632404c 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1541,14 +1541,18 @@ static void __send_empty_flush(struct clone_info *ci)
{
struct dm_table *t = ci->map;
struct bio flush_bio;
+ blk_opf_t opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+
+ if ((ci->io->orig_bio->bi_opf & (REQ_IDLE | REQ_SYNC)) ==
+ (REQ_IDLE | REQ_SYNC))
+ opf |= REQ_IDLE;
/*
* Use an on-stack bio for this, it's safe since we don't
* need to reference it after submit. It's just used as
* the basis for the clone(s).
*/
- bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0,
- REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC);
+ bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, opf);
ci->bio = &flush_bio;
ci->sector_count = 0;
@@ -1784,19 +1788,35 @@ static void init_clone_info(struct clone_info *ci, struct dm_io *io,
}
#ifdef CONFIG_BLK_DEV_ZONED
-static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
- struct bio *bio)
+static inline bool dm_zone_bio_needs_split(struct bio *bio)
{
/*
- * For mapped device that need zone append emulation, we must
- * split any large BIO that straddles zone boundaries.
+ * Special case the zone operations that cannot or should not be split.
*/
- return dm_emulate_zone_append(md) && bio_straddles_zones(bio) &&
- !bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
+ switch (bio_op(bio)) {
+ case REQ_OP_ZONE_APPEND:
+ case REQ_OP_ZONE_FINISH:
+ case REQ_OP_ZONE_RESET:
+ case REQ_OP_ZONE_RESET_ALL:
+ return false;
+ default:
+ break;
+ }
+
+ /*
+ * When mapped devices use the block layer zone write plugging, we must
+ * split any large BIO to the mapped device limits to not submit BIOs
+ * that span zone boundaries and to avoid potential deadlocks with
+ * queue freeze operations.
+ */
+ return bio_needs_zone_write_plugging(bio) || bio_straddles_zones(bio);
}
+
static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio)
{
- return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0);
+ if (!bio_needs_zone_write_plugging(bio))
+ return false;
+ return blk_zone_plug_bio(bio, 0);
}
static blk_status_t __send_zone_reset_all_emulated(struct clone_info *ci,
@@ -1912,8 +1932,7 @@ static blk_status_t __send_zone_reset_all(struct clone_info *ci)
}
#else
-static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
- struct bio *bio)
+static inline bool dm_zone_bio_needs_split(struct bio *bio)
{
return false;
}
@@ -1940,9 +1959,7 @@ static void dm_split_and_process_bio(struct mapped_device *md,
is_abnormal = is_abnormal_io(bio);
if (static_branch_unlikely(&zoned_enabled)) {
- /* Special case REQ_OP_ZONE_RESET_ALL as it cannot be split. */
- need_split = (bio_op(bio) != REQ_OP_ZONE_RESET_ALL) &&
- (is_abnormal || dm_zone_bio_needs_split(md, bio));
+ need_split = is_abnormal || dm_zone_bio_needs_split(bio);
} else {
need_split = is_abnormal;
}
@@ -2406,21 +2423,29 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
struct queue_limits *limits)
{
struct dm_table *old_map;
- sector_t size;
+ sector_t size, old_size;
int ret;
lockdep_assert_held(&md->suspend_lock);
size = dm_table_get_size(t);
+ old_size = dm_get_size(md);
+ set_capacity(md->disk, size);
+
+ ret = dm_table_set_restrictions(t, md->queue, limits);
+ if (ret) {
+ set_capacity(md->disk, old_size);
+ old_map = ERR_PTR(ret);
+ goto out;
+ }
+
/*
* Wipe any geometry if the size of the table changed.
*/
- if (size != dm_get_size(md))
+ if (size != old_size)
memset(&md->geometry, 0, sizeof(md->geometry));
- set_capacity(md->disk, size);
-
dm_table_event_callback(t, event_callback, md);
if (dm_table_request_based(t)) {
@@ -2438,10 +2463,10 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
* requests in the queue may refer to bio from the old bioset,
* so you must walk through the queue to unprep.
*/
- if (!md->mempools) {
+ if (!md->mempools)
md->mempools = t->mempools;
- t->mempools = NULL;
- }
+ else
+ dm_free_md_mempools(t->mempools);
} else {
/*
* The md may already have mempools that need changing.
@@ -2450,14 +2475,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
*/
dm_free_md_mempools(md->mempools);
md->mempools = t->mempools;
- t->mempools = NULL;
- }
-
- ret = dm_table_set_restrictions(t, md->queue, limits);
- if (ret) {
- old_map = ERR_PTR(ret);
- goto out;
}
+ t->mempools = NULL;
old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
rcu_assign_pointer(md->map, (void *)t);
diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c
index b2a00f213c2c..4b80165afd23 100644
--- a/drivers/md/md-autodetect.c
+++ b/drivers/md/md-autodetect.c
@@ -49,6 +49,7 @@ static int md_setup_ents __initdata;
* instead of just one. -- KTK
* 18May2000: Added support for persistent-superblock arrays:
* md=n,0,factor,fault,device-list uses RAID0 for device n
+ * md=n,-1,factor,fault,device-list uses LINEAR for device n
* md=n,device-list reads a RAID superblock from the devices
* elements in device-list are read by name_to_kdev_t so can be
* a hex number or something like /dev/hda1 /dev/sdb
@@ -87,7 +88,7 @@ static int __init md_setup(char *str)
md_setup_ents++;
switch (get_option(&str, &level)) { /* RAID level */
case 2: /* could be 0 or -1.. */
- if (level == 0) {
+ if (level == 0 || level == LEVEL_LINEAR) {
if (get_option(&str, &factor) != 2 || /* Chunk Size */
get_option(&str, &fault) != 2) {
printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
@@ -95,7 +96,10 @@ static int __init md_setup(char *str)
}
md_setup_args[ent].level = level;
md_setup_args[ent].chunk = 1 << (factor+12);
- pername = "raid0";
+ if (level == LEVEL_LINEAR)
+ pername = "linear";
+ else
+ pername = "raid0";
break;
}
fallthrough;
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index c3a42dd66ce5..0da1d0723f88 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -787,7 +787,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
* is a good choice? We choose COUNTER_MAX / 2 arbitrarily.
*/
write_behind = bitmap->mddev->bitmap_info.max_write_behind;
- if (write_behind > COUNTER_MAX)
+ if (write_behind > COUNTER_MAX / 2)
write_behind = COUNTER_MAX / 2;
sb->write_behind = cpu_to_le32(write_behind);
bitmap->mddev->bitmap_info.max_write_behind = write_behind;
@@ -1671,24 +1671,13 @@ __acquires(bitmap->lock)
}
static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
- unsigned long sectors, bool behind)
+ unsigned long sectors)
{
struct bitmap *bitmap = mddev->bitmap;
if (!bitmap)
return 0;
- if (behind) {
- int bw;
- atomic_inc(&bitmap->behind_writes);
- bw = atomic_read(&bitmap->behind_writes);
- if (bw > bitmap->behind_writes_used)
- bitmap->behind_writes_used = bw;
-
- pr_debug("inc write-behind count %d/%lu\n",
- bw, bitmap->mddev->bitmap_info.max_write_behind);
- }
-
while (sectors) {
sector_t blocks;
bitmap_counter_t *bmc;
@@ -1737,21 +1726,13 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
}
static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
- unsigned long sectors, bool success, bool behind)
+ unsigned long sectors)
{
struct bitmap *bitmap = mddev->bitmap;
if (!bitmap)
return;
- if (behind) {
- if (atomic_dec_and_test(&bitmap->behind_writes))
- wake_up(&bitmap->behind_wait);
- pr_debug("dec write-behind count %d/%lu\n",
- atomic_read(&bitmap->behind_writes),
- bitmap->mddev->bitmap_info.max_write_behind);
- }
-
while (sectors) {
sector_t blocks;
unsigned long flags;
@@ -1764,15 +1745,16 @@ static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
return;
}
- if (success && !bitmap->mddev->degraded &&
- bitmap->events_cleared < bitmap->mddev->events) {
- bitmap->events_cleared = bitmap->mddev->events;
- bitmap->need_sync = 1;
- sysfs_notify_dirent_safe(bitmap->sysfs_can_clear);
- }
-
- if (!success && !NEEDED(*bmc))
+ if (!bitmap->mddev->degraded) {
+ if (bitmap->events_cleared < bitmap->mddev->events) {
+ bitmap->events_cleared = bitmap->mddev->events;
+ bitmap->need_sync = 1;
+ sysfs_notify_dirent_safe(
+ bitmap->sysfs_can_clear);
+ }
+ } else if (!NEEDED(*bmc)) {
*bmc |= NEEDED_MASK;
+ }
if (COUNTER(*bmc) == COUNTER_MAX)
wake_up(&bitmap->overflow_wait);
@@ -2062,6 +2044,37 @@ static void md_bitmap_free(void *data)
kfree(bitmap);
}
+static void bitmap_start_behind_write(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+ int bw;
+
+ if (!bitmap)
+ return;
+
+ atomic_inc(&bitmap->behind_writes);
+ bw = atomic_read(&bitmap->behind_writes);
+ if (bw > bitmap->behind_writes_used)
+ bitmap->behind_writes_used = bw;
+
+ pr_debug("inc write-behind count %d/%lu\n",
+ bw, bitmap->mddev->bitmap_info.max_write_behind);
+}
+
+static void bitmap_end_behind_write(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+
+ if (!bitmap)
+ return;
+
+ if (atomic_dec_and_test(&bitmap->behind_writes))
+ wake_up(&bitmap->behind_wait);
+ pr_debug("dec write-behind count %d/%lu\n",
+ atomic_read(&bitmap->behind_writes),
+ bitmap->mddev->bitmap_info.max_write_behind);
+}
+
static void bitmap_wait_behind_writes(struct mddev *mddev)
{
struct bitmap *bitmap = mddev->bitmap;
@@ -2342,7 +2355,8 @@ static int bitmap_get_stats(void *data, struct md_bitmap_stats *stats)
if (!bitmap)
return -ENOENT;
-
+ if (!bitmap->storage.sb_page)
+ return -EINVAL;
sb = kmap_local_page(bitmap->storage.sb_page);
stats->sync_size = le64_to_cpu(sb->sync_size);
kunmap_local(sb);
@@ -2981,6 +2995,9 @@ static struct bitmap_operations bitmap_ops = {
.dirty_bits = bitmap_dirty_bits,
.unplug = bitmap_unplug,
.daemon_work = bitmap_daemon_work,
+
+ .start_behind_write = bitmap_start_behind_write,
+ .end_behind_write = bitmap_end_behind_write,
.wait_behind_writes = bitmap_wait_behind_writes,
.startwrite = bitmap_startwrite,
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index 662e6fc141a7..31c93019c76b 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -84,12 +84,15 @@ struct bitmap_operations {
unsigned long e);
void (*unplug)(struct mddev *mddev, bool sync);
void (*daemon_work)(struct mddev *mddev);
+
+ void (*start_behind_write)(struct mddev *mddev);
+ void (*end_behind_write)(struct mddev *mddev);
void (*wait_behind_writes)(struct mddev *mddev);
int (*startwrite)(struct mddev *mddev, sector_t offset,
- unsigned long sectors, bool behind);
+ unsigned long sectors);
void (*endwrite)(struct mddev *mddev, sector_t offset,
- unsigned long sectors, bool success, bool behind);
+ unsigned long sectors);
bool (*start_sync)(struct mddev *mddev, sector_t offset,
sector_t *blocks, bool degraded);
void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks);
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
new file mode 100644
index 000000000000..369aed044b40
--- /dev/null
+++ b/drivers/md/md-linear.c
@@ -0,0 +1,352 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * linear.c : Multiple Devices driver for Linux Copyright (C) 1994-96 Marc
+ * ZYNGIER <zyngier@ufr-info-p7.ibp.fr> or <maz@gloups.fdn.fr>
+ */
+
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <trace/events/block.h>
+#include "md.h"
+
+struct dev_info {
+ struct md_rdev *rdev;
+ sector_t end_sector;
+};
+
+struct linear_conf {
+ struct rcu_head rcu;
+ sector_t array_sectors;
+ /* a copy of mddev->raid_disks */
+ int raid_disks;
+ struct dev_info disks[] __counted_by(raid_disks);
+};
+
+/*
+ * find which device holds a particular offset
+ */
+static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
+{
+ int lo, mid, hi;
+ struct linear_conf *conf;
+
+ lo = 0;
+ hi = mddev->raid_disks - 1;
+ conf = mddev->private;
+
+ /*
+ * Binary Search
+ */
+
+ while (hi > lo) {
+
+ mid = (hi + lo) / 2;
+ if (sector < conf->disks[mid].end_sector)
+ hi = mid;
+ else
+ lo = mid + 1;
+ }
+
+ return conf->disks + lo;
+}
+
+static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks)
+{
+ struct linear_conf *conf;
+ sector_t array_sectors;
+
+ conf = mddev->private;
+ WARN_ONCE(sectors || raid_disks,
+ "%s does not support generic reshape\n", __func__);
+ array_sectors = conf->array_sectors;
+
+ return array_sectors;
+}
+
+static int linear_set_limits(struct mddev *mddev)
+{
+ struct queue_limits lim;
+ int err;
+
+ md_init_stacking_limits(&lim);
+ lim.max_hw_sectors = mddev->chunk_sectors;
+ lim.max_write_zeroes_sectors = mddev->chunk_sectors;
+ lim.io_min = mddev->chunk_sectors << 9;
+ err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
+ if (err)
+ return err;
+
+ return queue_limits_set(mddev->gendisk->queue, &lim);
+}
+
+static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
+{
+ struct linear_conf *conf;
+ struct md_rdev *rdev;
+ int ret = -EINVAL;
+ int cnt;
+ int i;
+
+ conf = kzalloc(struct_size(conf, disks, raid_disks), GFP_KERNEL);
+ if (!conf)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * conf->raid_disks is copy of mddev->raid_disks. The reason to
+ * keep a copy of mddev->raid_disks in struct linear_conf is,
+ * mddev->raid_disks may not be consistent with pointers number of
+ * conf->disks[] when it is updated in linear_add() and used to
+ * iterate old conf->disks[] earray in linear_congested().
+ * Here conf->raid_disks is always consitent with number of
+ * pointers in conf->disks[] array, and mddev->private is updated
+ * with rcu_assign_pointer() in linear_addr(), such race can be
+ * avoided.
+ */
+ conf->raid_disks = raid_disks;
+
+ cnt = 0;
+ conf->array_sectors = 0;
+
+ rdev_for_each(rdev, mddev) {
+ int j = rdev->raid_disk;
+ struct dev_info *disk = conf->disks + j;
+ sector_t sectors;
+
+ if (j < 0 || j >= raid_disks || disk->rdev) {
+ pr_warn("md/linear:%s: disk numbering problem. Aborting!\n",
+ mdname(mddev));
+ goto out;
+ }
+
+ disk->rdev = rdev;
+ if (mddev->chunk_sectors) {
+ sectors = rdev->sectors;
+ sector_div(sectors, mddev->chunk_sectors);
+ rdev->sectors = sectors * mddev->chunk_sectors;
+ }
+
+ conf->array_sectors += rdev->sectors;
+ cnt++;
+ }
+ if (cnt != raid_disks) {
+ pr_warn("md/linear:%s: not enough drives present. Aborting!\n",
+ mdname(mddev));
+ goto out;
+ }
+
+ /*
+ * Here we calculate the device offsets.
+ */
+ conf->disks[0].end_sector = conf->disks[0].rdev->sectors;
+
+ for (i = 1; i < raid_disks; i++)
+ conf->disks[i].end_sector =
+ conf->disks[i-1].end_sector +
+ conf->disks[i].rdev->sectors;
+
+ if (!mddev_is_dm(mddev)) {
+ ret = linear_set_limits(mddev);
+ if (ret)
+ goto out;
+ }
+
+ return conf;
+
+out:
+ kfree(conf);
+ return ERR_PTR(ret);
+}
+
+static int linear_run(struct mddev *mddev)
+{
+ struct linear_conf *conf;
+ int ret;
+
+ if (md_check_no_bitmap(mddev))
+ return -EINVAL;
+
+ conf = linear_conf(mddev, mddev->raid_disks);
+ if (IS_ERR(conf))
+ return PTR_ERR(conf);
+
+ mddev->private = conf;
+ md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
+
+ ret = md_integrity_register(mddev);
+ if (ret) {
+ kfree(conf);
+ mddev->private = NULL;
+ }
+ return ret;
+}
+
+static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
+{
+ /* Adding a drive to a linear array allows the array to grow.
+ * It is permitted if the new drive has a matching superblock
+ * already on it, with raid_disk equal to raid_disks.
+ * It is achieved by creating a new linear_private_data structure
+ * and swapping it in in-place of the current one.
+ * The current one is never freed until the array is stopped.
+ * This avoids races.
+ */
+ struct linear_conf *newconf, *oldconf;
+
+ if (rdev->saved_raid_disk != mddev->raid_disks)
+ return -EINVAL;
+
+ rdev->raid_disk = rdev->saved_raid_disk;
+ rdev->saved_raid_disk = -1;
+
+ newconf = linear_conf(mddev, mddev->raid_disks + 1);
+ if (IS_ERR(newconf))
+ return PTR_ERR(newconf);
+
+ /* newconf->raid_disks already keeps a copy of * the increased
+ * value of mddev->raid_disks, WARN_ONCE() is just used to make
+ * sure of this. It is possible that oldconf is still referenced
+ * in linear_congested(), therefore kfree_rcu() is used to free
+ * oldconf until no one uses it anymore.
+ */
+ oldconf = rcu_dereference_protected(mddev->private,
+ lockdep_is_held(&mddev->reconfig_mutex));
+ mddev->raid_disks++;
+ WARN_ONCE(mddev->raid_disks != newconf->raid_disks,
+ "copied raid_disks doesn't match mddev->raid_disks");
+ rcu_assign_pointer(mddev->private, newconf);
+ md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
+ set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
+ kfree_rcu(oldconf, rcu);
+ return 0;
+}
+
+static void linear_free(struct mddev *mddev, void *priv)
+{
+ struct linear_conf *conf = priv;
+
+ kfree(conf);
+}
+
+static bool linear_make_request(struct mddev *mddev, struct bio *bio)
+{
+ struct dev_info *tmp_dev;
+ sector_t start_sector, end_sector, data_offset;
+ sector_t bio_sector = bio->bi_iter.bi_sector;
+
+ if (unlikely(bio->bi_opf & REQ_PREFLUSH)
+ && md_flush_request(mddev, bio))
+ return true;
+
+ tmp_dev = which_dev(mddev, bio_sector);
+ start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
+ end_sector = tmp_dev->end_sector;
+ data_offset = tmp_dev->rdev->data_offset;
+
+ if (unlikely(bio_sector >= end_sector ||
+ bio_sector < start_sector))
+ goto out_of_bounds;
+
+ if (unlikely(is_rdev_broken(tmp_dev->rdev))) {
+ md_error(mddev, tmp_dev->rdev);
+ bio_io_error(bio);
+ return true;
+ }
+
+ if (unlikely(bio_end_sector(bio) > end_sector)) {
+ /* This bio crosses a device boundary, so we have to split it */
+ struct bio *split = bio_split(bio, end_sector - bio_sector,
+ GFP_NOIO, &mddev->bio_set);
+
+ if (IS_ERR(split)) {
+ bio->bi_status = errno_to_blk_status(PTR_ERR(split));
+ bio_endio(bio);
+ return true;
+ }
+
+ bio_chain(split, bio);
+ submit_bio_noacct(bio);
+ bio = split;
+ }
+
+ md_account_bio(mddev, &bio);
+ bio_set_dev(bio, tmp_dev->rdev->bdev);
+ bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
+ start_sector + data_offset;
+
+ if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
+ !bdev_max_discard_sectors(bio->bi_bdev))) {
+ /* Just ignore it */
+ bio_endio(bio);
+ } else {
+ if (mddev->gendisk)
+ trace_block_bio_remap(bio, disk_devt(mddev->gendisk),
+ bio_sector);
+ mddev_check_write_zeroes(mddev, bio);
+ submit_bio_noacct(bio);
+ }
+ return true;
+
+out_of_bounds:
+ pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %pg: %llu sectors, offset %llu\n",
+ mdname(mddev),
+ (unsigned long long)bio->bi_iter.bi_sector,
+ tmp_dev->rdev->bdev,
+ (unsigned long long)tmp_dev->rdev->sectors,
+ (unsigned long long)start_sector);
+ bio_io_error(bio);
+ return true;
+}
+
+static void linear_status(struct seq_file *seq, struct mddev *mddev)
+{
+ seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
+}
+
+static void linear_error(struct mddev *mddev, struct md_rdev *rdev)
+{
+ if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) {
+ char *md_name = mdname(mddev);
+
+ pr_crit("md/linear%s: Disk failure on %pg detected, failing array.\n",
+ md_name, rdev->bdev);
+ }
+}
+
+static void linear_quiesce(struct mddev *mddev, int state)
+{
+}
+
+static struct md_personality linear_personality = {
+ .name = "linear",
+ .level = LEVEL_LINEAR,
+ .owner = THIS_MODULE,
+ .make_request = linear_make_request,
+ .run = linear_run,
+ .free = linear_free,
+ .status = linear_status,
+ .hot_add_disk = linear_add,
+ .size = linear_size,
+ .quiesce = linear_quiesce,
+ .error_handler = linear_error,
+};
+
+static int __init linear_init(void)
+{
+ return register_md_personality(&linear_personality);
+}
+
+static void linear_exit(void)
+{
+ unregister_md_personality(&linear_personality);
+}
+
+module_init(linear_init);
+module_exit(linear_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Linear device concatenation personality for MD (deprecated)");
+MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
+MODULE_ALIAS("md-linear");
+MODULE_ALIAS("md-level--1");
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 67108c397c5a..4b3291723670 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -629,6 +629,12 @@ static void __mddev_put(struct mddev *mddev)
queue_work(md_misc_wq, &mddev->del_work);
}
+static void mddev_put_locked(struct mddev *mddev)
+{
+ if (atomic_dec_and_test(&mddev->active))
+ __mddev_put(mddev);
+}
+
void mddev_put(struct mddev *mddev)
{
if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
@@ -8124,7 +8130,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
return;
mddev->pers->error_handler(mddev, rdev);
- if (mddev->pers->level == 0)
+ if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR)
return;
if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
@@ -8376,6 +8382,10 @@ static int md_seq_show(struct seq_file *seq, void *v)
return 0;
spin_unlock(&all_mddevs_lock);
+
+ /* prevent bitmap to be freed after checking */
+ mutex_lock(&mddev->bitmap_info.mutex);
+
spin_lock(&mddev->lock);
if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
seq_printf(seq, "%s : ", mdname(mddev));
@@ -8451,14 +8461,13 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
}
spin_unlock(&mddev->lock);
+ mutex_unlock(&mddev->bitmap_info.mutex);
spin_lock(&all_mddevs_lock);
if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs))
status_unused(seq);
- if (atomic_dec_and_test(&mddev->active))
- __mddev_put(mddev);
-
+ mddev_put_locked(mddev);
return 0;
}
@@ -8745,12 +8754,32 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
}
EXPORT_SYMBOL_GPL(md_submit_discard_bio);
+static void md_bitmap_start(struct mddev *mddev,
+ struct md_io_clone *md_io_clone)
+{
+ if (mddev->pers->bitmap_sector)
+ mddev->pers->bitmap_sector(mddev, &md_io_clone->offset,
+ &md_io_clone->sectors);
+
+ mddev->bitmap_ops->startwrite(mddev, md_io_clone->offset,
+ md_io_clone->sectors);
+}
+
+static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone)
+{
+ mddev->bitmap_ops->endwrite(mddev, md_io_clone->offset,
+ md_io_clone->sectors);
+}
+
static void md_end_clone_io(struct bio *bio)
{
struct md_io_clone *md_io_clone = bio->bi_private;
struct bio *orig_bio = md_io_clone->orig_bio;
struct mddev *mddev = md_io_clone->mddev;
+ if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
+ md_bitmap_end(mddev, md_io_clone);
+
if (bio->bi_status && !orig_bio->bi_status)
orig_bio->bi_status = bio->bi_status;
@@ -8775,6 +8804,12 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)
if (blk_queue_io_stat(bdev->bd_disk->queue))
md_io_clone->start_time = bio_start_io_acct(*bio);
+ if (bio_data_dir(*bio) == WRITE && mddev->bitmap) {
+ md_io_clone->offset = (*bio)->bi_iter.bi_sector;
+ md_io_clone->sectors = bio_sectors(*bio);
+ md_bitmap_start(mddev, md_io_clone);
+ }
+
clone->bi_end_io = md_end_clone_io;
clone->bi_private = md_io_clone;
*bio = clone;
@@ -8793,6 +8828,9 @@ void md_free_cloned_bio(struct bio *bio)
struct bio *orig_bio = md_io_clone->orig_bio;
struct mddev *mddev = md_io_clone->mddev;
+ if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
+ md_bitmap_end(mddev, md_io_clone);
+
if (bio->bi_status && !orig_bio->bi_status)
orig_bio->bi_status = bio->bi_status;
@@ -9664,8 +9702,8 @@ void md_check_recovery(struct mddev *mddev)
* remove disk.
*/
rdev_for_each_safe(rdev, tmp, mddev) {
- if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
- rdev->raid_disk < 0)
+ if (rdev->raid_disk < 0 &&
+ test_and_clear_bit(ClusterRemove, &rdev->flags))
md_kick_rdev_from_array(rdev);
}
}
@@ -9852,11 +9890,11 @@ EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
static int md_notify_reboot(struct notifier_block *this,
unsigned long code, void *x)
{
- struct mddev *mddev, *n;
+ struct mddev *mddev;
int need_delay = 0;
spin_lock(&all_mddevs_lock);
- list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
+ list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
if (!mddev_get(mddev))
continue;
spin_unlock(&all_mddevs_lock);
@@ -9868,8 +9906,8 @@ static int md_notify_reboot(struct notifier_block *this,
mddev_unlock(mddev);
}
need_delay = 1;
- mddev_put(mddev);
spin_lock(&all_mddevs_lock);
+ mddev_put_locked(mddev);
}
spin_unlock(&all_mddevs_lock);
@@ -9962,8 +10000,11 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
/* Check for change of roles in the active devices */
rdev_for_each_safe(rdev2, tmp, mddev) {
- if (test_bit(Faulty, &rdev2->flags))
+ if (test_bit(Faulty, &rdev2->flags)) {
+ if (test_bit(ClusterRemove, &rdev2->flags))
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
continue;
+ }
/* Check if the roles changed */
role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
@@ -10202,7 +10243,7 @@ void md_autostart_arrays(int part)
static __exit void md_exit(void)
{
- struct mddev *mddev, *n;
+ struct mddev *mddev;
int delay = 1;
unregister_blkdev(MD_MAJOR,"md");
@@ -10223,7 +10264,7 @@ static __exit void md_exit(void)
remove_proc_entry("mdstat", NULL);
spin_lock(&all_mddevs_lock);
- list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) {
+ list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
if (!mddev_get(mddev))
continue;
spin_unlock(&all_mddevs_lock);
@@ -10235,8 +10276,8 @@ static __exit void md_exit(void)
* the mddev for destruction by a workqueue, and the
* destroy_workqueue() below will wait for that to complete.
*/
- mddev_put(mddev);
spin_lock(&all_mddevs_lock);
+ mddev_put_locked(mddev);
}
spin_unlock(&all_mddevs_lock);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 5d2e6bd58e4d..8826dce9717d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -746,6 +746,9 @@ struct md_personality
void *(*takeover) (struct mddev *mddev);
/* Changes the consistency policy of an active array. */
int (*change_consistency_policy)(struct mddev *mddev, const char *buf);
+ /* convert io ranges from array to bitmap */
+ void (*bitmap_sector)(struct mddev *mddev, sector_t *offset,
+ unsigned long *sectors);
};
struct md_sysfs_entry {
@@ -828,6 +831,8 @@ struct md_io_clone {
struct mddev *mddev;
struct bio *orig_bio;
unsigned long start_time;
+ sector_t offset;
+ unsigned long sectors;
struct bio bio_clone;
};
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 32d587524778..31bea72bcb01 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -385,10 +385,8 @@ static int raid0_set_limits(struct mddev *mddev)
lim.io_min = mddev->chunk_sectors << 9;
lim.io_opt = lim.io_min * mddev->raid_disks;
err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
- if (err) {
- queue_limits_cancel_update(mddev->gendisk->queue);
+ if (err)
return err;
- }
return queue_limits_set(mddev->gendisk->queue, &lim);
}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 6c9d24203f39..fe1599db69c8 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -420,10 +420,8 @@ static void close_write(struct r1bio *r1_bio)
r1_bio->behind_master_bio = NULL;
}
- /* clear the bitmap if all writes complete successfully */
- mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors,
- !test_bit(R1BIO_Degraded, &r1_bio->state),
- test_bit(R1BIO_BehindIO, &r1_bio->state));
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state))
+ mddev->bitmap_ops->end_behind_write(mddev);
md_write_end(mddev);
}
@@ -480,8 +478,6 @@ static void raid1_end_write_request(struct bio *bio)
if (!test_bit(Faulty, &rdev->flags))
set_bit(R1BIO_WriteError, &r1_bio->state);
else {
- /* Fail the request */
- set_bit(R1BIO_Degraded, &r1_bio->state);
/* Finished with this branch */
r1_bio->bios[mirror] = NULL;
to_put = bio;
@@ -1492,11 +1488,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
break;
}
r1_bio->bios[i] = NULL;
- if (!rdev || test_bit(Faulty, &rdev->flags)) {
- if (i < conf->raid_disks)
- set_bit(R1BIO_Degraded, &r1_bio->state);
+ if (!rdev || test_bit(Faulty, &rdev->flags))
continue;
- }
atomic_inc(&rdev->nr_pending);
if (test_bit(WriteErrorSeen, &rdev->flags)) {
@@ -1522,16 +1515,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
*/
max_sectors = bad_sectors;
rdev_dec_pending(rdev, mddev);
- /* We don't set R1BIO_Degraded as that
- * only applies if the disk is
- * missing, so it might be re-added,
- * and we want to know to recover this
- * chunk.
- * In this case the device is here,
- * and the fact that this chunk is not
- * in-sync is recorded in the bad
- * block log
- */
continue;
}
if (is_bad) {
@@ -1611,9 +1594,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
stats.behind_writes < max_write_behind)
alloc_behind_master_bio(r1_bio, bio);
- mddev->bitmap_ops->startwrite(
- mddev, r1_bio->sector, r1_bio->sectors,
- test_bit(R1BIO_BehindIO, &r1_bio->state));
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state))
+ mddev->bitmap_ops->start_behind_write(mddev);
first_clone = 0;
}
@@ -2174,14 +2156,9 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
if (!rdev_set_badblocks(rdev, sect, s, 0))
abort = 1;
}
- if (abort) {
- conf->recovery_disabled =
- mddev->recovery_disabled;
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_done_sync(mddev, r1_bio->sectors, 0);
- put_buf(r1_bio);
+ if (abort)
return 0;
- }
+
/* Try next page */
sectors -= s;
sect += s;
@@ -2320,10 +2297,21 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
int disks = conf->raid_disks * 2;
struct bio *wbio;
- if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
- /* ouch - failed to read all of that. */
- if (!fix_sync_read_error(r1_bio))
+ if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+ /*
+ * ouch - failed to read all of that.
+ * No need to fix read error for check/repair
+ * because all member disks are read.
+ */
+ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) ||
+ !fix_sync_read_error(r1_bio)) {
+ conf->recovery_disabled = mddev->recovery_disabled;
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_done_sync(mddev, r1_bio->sectors, 0);
+ put_buf(r1_bio);
return;
+ }
+ }
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
process_checks(r1_bio);
@@ -2567,12 +2555,10 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
* errors.
*/
fail = true;
- if (!narrow_write_error(r1_bio, m)) {
+ if (!narrow_write_error(r1_bio, m))
md_error(conf->mddev,
conf->mirrors[m].rdev);
/* an I/O failed, we can't clear the bitmap */
- set_bit(R1BIO_Degraded, &r1_bio->state);
- }
rdev_dec_pending(conf->mirrors[m].rdev,
conf->mddev);
}
@@ -2663,8 +2649,6 @@ static void raid1d(struct md_thread *thread)
list_del(&r1_bio->retry_list);
idx = sector_to_idx(r1_bio->sector);
atomic_dec(&conf->nr_queued[idx]);
- if (mddev->degraded)
- set_bit(R1BIO_Degraded, &r1_bio->state);
if (test_bit(R1BIO_WriteError, &r1_bio->state))
close_write(r1_bio);
raid_end_bio_io(r1_bio);
@@ -3193,10 +3177,8 @@ static int raid1_set_limits(struct mddev *mddev)
md_init_stacking_limits(&lim);
lim.max_write_zeroes_sectors = 0;
err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
- if (err) {
- queue_limits_cancel_update(mddev->gendisk->queue);
+ if (err)
return err;
- }
return queue_limits_set(mddev->gendisk->queue, &lim);
}
@@ -3398,6 +3380,7 @@ static int raid1_reshape(struct mddev *mddev)
/* ok, everything is stopped */
oldpool = conf->r1bio_pool;
conf->r1bio_pool = newpool;
+ init_waitqueue_head(&conf->r1bio_pool.wait);
for (d = d2 = 0; d < conf->raid_disks; d++) {
struct md_rdev *rdev = conf->mirrors[d].rdev;
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 5300cbaa58a4..33f318fcc268 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -188,7 +188,6 @@ struct r1bio {
enum r1bio_state {
R1BIO_Uptodate,
R1BIO_IsSync,
- R1BIO_Degraded,
R1BIO_BehindIO,
/* Set ReadError on bios that experience a readerror so that
* raid1d knows what to do with them.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 862b1fb71d86..7515a98001ca 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -428,10 +428,6 @@ static void close_write(struct r10bio *r10_bio)
{
struct mddev *mddev = r10_bio->mddev;
- /* clear the bitmap if all writes complete successfully */
- mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors,
- !test_bit(R10BIO_Degraded, &r10_bio->state),
- false);
md_write_end(mddev);
}
@@ -501,7 +497,6 @@ static void raid10_end_write_request(struct bio *bio)
set_bit(R10BIO_WriteError, &r10_bio->state);
else {
/* Fail the request */
- set_bit(R10BIO_Degraded, &r10_bio->state);
r10_bio->devs[slot].bio = NULL;
to_put = bio;
dec_rdev = 1;
@@ -1186,8 +1181,11 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
}
}
- if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors))
+ if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) {
+ raid_end_bio_io(r10_bio);
return;
+ }
+
rdev = read_balance(conf, r10_bio, &max_sectors);
if (!rdev) {
if (err_rdev) {
@@ -1373,8 +1371,11 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
}
sectors = r10_bio->sectors;
- if (!regular_request_wait(mddev, conf, bio, sectors))
+ if (!regular_request_wait(mddev, conf, bio, sectors)) {
+ raid_end_bio_io(r10_bio);
return;
+ }
+
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
(mddev->reshape_backwards
? (bio->bi_iter.bi_sector < conf->reshape_safe &&
@@ -1430,10 +1431,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
r10_bio->devs[i].bio = NULL;
r10_bio->devs[i].repl_bio = NULL;
- if (!rdev && !rrdev) {
- set_bit(R10BIO_Degraded, &r10_bio->state);
+ if (!rdev && !rrdev)
continue;
- }
if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad;
sector_t dev_sector = r10_bio->devs[i].addr;
@@ -1450,14 +1449,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
* to other devices yet
*/
max_sectors = bad_sectors;
- /* We don't set R10BIO_Degraded as that
- * only applies if the disk is missing,
- * so it might be re-added, and we want to
- * know to recover this chunk.
- * In this case the device is here, and the
- * fact that this chunk is not in-sync is
- * recorded in the bad block log.
- */
continue;
}
if (is_bad) {
@@ -1493,8 +1484,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
md_account_bio(mddev, &bio);
r10_bio->master_bio = bio;
atomic_set(&r10_bio->remaining, 1);
- mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors,
- false);
for (i = 0; i < conf->copies; i++) {
if (r10_bio->devs[i].bio)
@@ -1704,6 +1693,7 @@ retry_discard:
* The discard bio returns only first r10bio finishes
*/
if (first_copy) {
+ md_account_bio(mddev, &bio);
r10_bio->master_bio = bio;
set_bit(R10BIO_Discard, &r10_bio->state);
first_copy = false;
@@ -2910,11 +2900,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
rdev_dec_pending(rdev, conf->mddev);
} else if (bio != NULL && bio->bi_status) {
fail = true;
- if (!narrow_write_error(r10_bio, m)) {
+ if (!narrow_write_error(r10_bio, m))
md_error(conf->mddev, rdev);
- set_bit(R10BIO_Degraded,
- &r10_bio->state);
- }
rdev_dec_pending(rdev, conf->mddev);
}
bio = r10_bio->devs[m].repl_bio;
@@ -2973,8 +2960,6 @@ static void raid10d(struct md_thread *thread)
r10_bio = list_first_entry(&tmp, struct r10bio,
retry_list);
list_del(&r10_bio->retry_list);
- if (mddev->degraded)
- set_bit(R10BIO_Degraded, &r10_bio->state);
if (test_bit(R10BIO_WriteError,
&r10_bio->state))
@@ -3983,12 +3968,11 @@ static int raid10_set_queue_limits(struct mddev *mddev)
md_init_stacking_limits(&lim);
lim.max_write_zeroes_sectors = 0;
lim.io_min = mddev->chunk_sectors << 9;
+ lim.chunk_sectors = mddev->chunk_sectors;
lim.io_opt = lim.io_min * raid10_nr_stripes(conf);
err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
- if (err) {
- queue_limits_cancel_update(mddev->gendisk->queue);
+ if (err)
return err;
- }
return queue_limits_set(mddev->gendisk->queue, &lim);
}
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 2e75e88d0802..3f16ad6904a9 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -161,7 +161,6 @@ enum r10bio_state {
R10BIO_IsSync,
R10BIO_IsRecover,
R10BIO_IsReshape,
- R10BIO_Degraded,
/* Set ReadError on bios that experience a read error
* so that raid10d knows what to do with them.
*/
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index b4f7b79fd187..011246e16a99 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -313,10 +313,6 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
if (sh->dev[i].written) {
set_bit(R5_UPTODATE, &sh->dev[i].flags);
r5c_return_dev_pending_writes(conf, &sh->dev[i]);
- conf->mddev->bitmap_ops->endwrite(conf->mddev,
- sh->sector, RAID5_STRIPE_SECTORS(conf),
- !test_bit(STRIPE_DEGRADED, &sh->state),
- false);
}
}
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2fa1f270fb1d..39e7596e78c0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -906,8 +906,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
if (raid5_has_log(conf) || raid5_has_ppl(conf))
return false;
return test_bit(STRIPE_BATCH_READY, &sh->state) &&
- !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
- is_full_stripe_write(sh);
+ is_full_stripe_write(sh);
}
/* we only do back search */
@@ -1345,8 +1344,6 @@ again:
submit_bio_noacct(rbi);
}
if (!rdev && !rrdev) {
- if (op_is_write(op))
- set_bit(STRIPE_DEGRADED, &sh->state);
pr_debug("skip op %d on disc %d for sector %llu\n",
bi->bi_opf, i, (unsigned long long)sh->sector);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -2884,7 +2881,6 @@ static void raid5_end_write_request(struct bio *bi)
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
if (bi->bi_status) {
- set_bit(STRIPE_DEGRADED, &sh->state);
set_bit(WriteErrorSeen, &rdev->flags);
set_bit(R5_WriteError, &sh->dev[i].flags);
if (!test_and_set_bit(WantReplacement, &rdev->flags))
@@ -3548,29 +3544,9 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
(*bip)->bi_iter.bi_sector, sh->sector, dd_idx,
sh->dev[dd_idx].sector);
- if (conf->mddev->bitmap && firstwrite) {
- /* Cannot hold spinlock over bitmap_startwrite,
- * but must ensure this isn't added to a batch until
- * we have added to the bitmap and set bm_seq.
- * So set STRIPE_BITMAP_PENDING to prevent
- * batching.
- * If multiple __add_stripe_bio() calls race here they
- * much all set STRIPE_BITMAP_PENDING. So only the first one
- * to complete "bitmap_startwrite" gets to set
- * STRIPE_BIT_DELAY. This is important as once a stripe
- * is added to a batch, STRIPE_BIT_DELAY cannot be changed
- * any more.
- */
- set_bit(STRIPE_BITMAP_PENDING, &sh->state);
- spin_unlock_irq(&sh->stripe_lock);
- conf->mddev->bitmap_ops->startwrite(conf->mddev, sh->sector,
- RAID5_STRIPE_SECTORS(conf), false);
- spin_lock_irq(&sh->stripe_lock);
- clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
- if (!sh->batch_head) {
- sh->bm_seq = conf->seq_flush+1;
- set_bit(STRIPE_BIT_DELAY, &sh->state);
- }
+ if (conf->mddev->bitmap && firstwrite && !sh->batch_head) {
+ sh->bm_seq = conf->seq_flush+1;
+ set_bit(STRIPE_BIT_DELAY, &sh->state);
}
}
@@ -3621,7 +3597,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
BUG_ON(sh->batch_head);
for (i = disks; i--; ) {
struct bio *bi;
- int bitmap_end = 0;
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
struct md_rdev *rdev = conf->disks[i].rdev;
@@ -3646,8 +3621,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].towrite = NULL;
sh->overwrite_disks = 0;
spin_unlock_irq(&sh->stripe_lock);
- if (bi)
- bitmap_end = 1;
log_stripe_write_finished(sh);
@@ -3662,11 +3635,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bio_io_error(bi);
bi = nextbi;
}
- if (bitmap_end)
- conf->mddev->bitmap_ops->endwrite(conf->mddev,
- sh->sector, RAID5_STRIPE_SECTORS(conf),
- false, false);
- bitmap_end = 0;
/* and fail all 'written' */
bi = sh->dev[i].written;
sh->dev[i].written = NULL;
@@ -3675,7 +3643,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].page = sh->dev[i].orig_page;
}
- if (bi) bitmap_end = 1;
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
@@ -3709,10 +3676,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bi = nextbi;
}
}
- if (bitmap_end)
- conf->mddev->bitmap_ops->endwrite(conf->mddev,
- sh->sector, RAID5_STRIPE_SECTORS(conf),
- false, false);
/* If we were in the middle of a write the parity block might
* still be locked - so just clear all R5_LOCKED flags
*/
@@ -4061,10 +4024,7 @@ returnbi:
bio_endio(wbi);
wbi = wbi2;
}
- conf->mddev->bitmap_ops->endwrite(conf->mddev,
- sh->sector, RAID5_STRIPE_SECTORS(conf),
- !test_bit(STRIPE_DEGRADED, &sh->state),
- false);
+
if (head_sh->batch_head) {
sh = list_first_entry(&sh->batch_list,
struct stripe_head,
@@ -4341,7 +4301,6 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
s->locked++;
set_bit(R5_Wantwrite, &dev->flags);
- clear_bit(STRIPE_DEGRADED, &sh->state);
set_bit(STRIPE_INSYNC, &sh->state);
break;
case check_state_run:
@@ -4498,7 +4457,6 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
clear_bit(R5_Wantwrite, &dev->flags);
s->locked--;
}
- clear_bit(STRIPE_DEGRADED, &sh->state);
set_bit(STRIPE_INSYNC, &sh->state);
break;
@@ -4892,8 +4850,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
(1 << STRIPE_COMPUTE_RUN) |
(1 << STRIPE_DISCARD) |
(1 << STRIPE_BATCH_READY) |
- (1 << STRIPE_BATCH_ERR) |
- (1 << STRIPE_BITMAP_PENDING)),
+ (1 << STRIPE_BATCH_ERR)),
"stripe state: %lx\n", sh->state);
WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
(1 << STRIPE_REPLACED)),
@@ -4901,7 +4858,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
(1 << STRIPE_PREREAD_ACTIVE) |
- (1 << STRIPE_DEGRADED) |
(1 << STRIPE_ON_UNPLUG_LIST)),
head_sh->state & (1 << STRIPE_INSYNC));
@@ -5785,10 +5741,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
}
spin_unlock_irq(&sh->stripe_lock);
if (conf->mddev->bitmap) {
- for (d = 0; d < conf->raid_disks - conf->max_degraded;
- d++)
- mddev->bitmap_ops->startwrite(mddev, sh->sector,
- RAID5_STRIPE_SECTORS(conf), false);
sh->bm_seq = conf->seq_flush + 1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
@@ -5929,6 +5881,54 @@ static enum reshape_loc get_reshape_loc(struct mddev *mddev,
return LOC_BEHIND_RESHAPE;
}
+static void raid5_bitmap_sector(struct mddev *mddev, sector_t *offset,
+ unsigned long *sectors)
+{
+ struct r5conf *conf = mddev->private;
+ sector_t start = *offset;
+ sector_t end = start + *sectors;
+ sector_t prev_start = start;
+ sector_t prev_end = end;
+ int sectors_per_chunk;
+ enum reshape_loc loc;
+ int dd_idx;
+
+ sectors_per_chunk = conf->chunk_sectors *
+ (conf->raid_disks - conf->max_degraded);
+ start = round_down(start, sectors_per_chunk);
+ end = round_up(end, sectors_per_chunk);
+
+ start = raid5_compute_sector(conf, start, 0, &dd_idx, NULL);
+ end = raid5_compute_sector(conf, end, 0, &dd_idx, NULL);
+
+ /*
+ * For LOC_INSIDE_RESHAPE, this IO will wait for reshape to make
+ * progress, hence it's the same as LOC_BEHIND_RESHAPE.
+ */
+ loc = get_reshape_loc(mddev, conf, prev_start);
+ if (likely(loc != LOC_AHEAD_OF_RESHAPE)) {
+ *offset = start;
+ *sectors = end - start;
+ return;
+ }
+
+ sectors_per_chunk = conf->prev_chunk_sectors *
+ (conf->previous_raid_disks - conf->max_degraded);
+ prev_start = round_down(prev_start, sectors_per_chunk);
+ prev_end = round_down(prev_end, sectors_per_chunk);
+
+ prev_start = raid5_compute_sector(conf, prev_start, 1, &dd_idx, NULL);
+ prev_end = raid5_compute_sector(conf, prev_end, 1, &dd_idx, NULL);
+
+ /*
+ * for LOC_AHEAD_OF_RESHAPE, reshape can make progress before this IO
+ * is handled in make_stripe_request(), we can't know this here hence
+ * we set bits for both.
+ */
+ *offset = min(start, prev_start);
+ *sectors = max(end, prev_end) - *offset;
+}
+
static enum stripe_result make_stripe_request(struct mddev *mddev,
struct r5conf *conf, struct stripe_request_ctx *ctx,
sector_t logical_sector, struct bio *bi)
@@ -8977,6 +8977,7 @@ static struct md_personality raid6_personality =
.takeover = raid6_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
+ .bitmap_sector = raid5_bitmap_sector,
};
static struct md_personality raid5_personality =
{
@@ -9002,6 +9003,7 @@ static struct md_personality raid5_personality =
.takeover = raid5_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
+ .bitmap_sector = raid5_bitmap_sector,
};
static struct md_personality raid4_personality =
@@ -9028,6 +9030,7 @@ static struct md_personality raid4_personality =
.takeover = raid4_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
+ .bitmap_sector = raid5_bitmap_sector,
};
static int __init raid5_init(void)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 896ecfc4afa6..2e42c1641049 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -358,7 +358,6 @@ enum {
STRIPE_REPLACED,
STRIPE_PREREAD_ACTIVE,
STRIPE_DELAYED,
- STRIPE_DEGRADED,
STRIPE_BIT_DELAY,
STRIPE_EXPANDING,
STRIPE_EXPAND_SOURCE,
@@ -372,9 +371,6 @@ enum {
STRIPE_ON_RELEASE_LIST,
STRIPE_BATCH_READY,
STRIPE_BATCH_ERR,
- STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add
- * to batch yet.
- */
STRIPE_LOG_TRAPPED, /* trapped into log (see raid5-cache.c)
* this bit is used in two scenarios:
*