diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-11-25 22:15:41 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-11-25 22:15:41 +0300 |
commit | 2d53943090c336c9d298638bad292be349e1b9c4 (patch) | |
tree | 18fc9bbc1ff13fe031a922cf2efc22a903257caf /drivers/md | |
parent | ff6814b078e33a4d26fee9ea80779c81a6744cd8 (diff) | |
parent | 00b89892c869f34528deca957b10d1468c4e8b38 (diff) | |
download | linux-2d53943090c336c9d298638bad292be349e1b9c4.tar.xz |
Merge tag 'for-5.5/drivers-20191121' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:
"Here are the main block driver updates for 5.5. Nothing major in here,
mostly just fixes. This contains:
- a set of bcache changes via Coly
- MD changes from Song
- loop unmap write-zeroes fix (Darrick)
- spelling fixes (Geert)
- zoned additions cleanups to null_blk/dm (Ajay)
- allow null_blk online submit queue changes (Bart)
- NVMe changes via Keith, nothing major here either"
* tag 'for-5.5/drivers-20191121' of git://git.kernel.dk/linux-block: (56 commits)
Revert "bcache: fix fifo index swapping condition in journal_pin_cmp()"
drivers/md/raid5-ppl.c: use the new spelling of RWH_WRITE_LIFE_NOT_SET
drivers/md/raid5.c: use the new spelling of RWH_WRITE_LIFE_NOT_SET
bcache: don't export symbols
bcache: remove the extra cflags for request.o
bcache: at least try to shrink 1 node in bch_mca_scan()
bcache: add idle_max_writeback_rate sysfs interface
bcache: add code comments in bch_btree_leaf_dirty()
bcache: fix deadlock in bcache_allocator
bcache: add code comment bch_keylist_pop() and bch_keylist_pop_front()
bcache: deleted code comments for dead code in bch_data_insert_keys()
bcache: add more accurate error messages in read_super()
bcache: fix static checker warning in bcache_device_free()
bcache: fix a lost wake-up problem caused by mca_cannibalize_lock
bcache: fix fifo index swapping condition in journal_pin_cmp()
md/raid10: prevent access of uninitialized resync_pages offset
md: avoid invalid memory access for array sb->dev_roles
md/raid1: avoid soft lockup under high load
null_blk: add zone open, close, and finish support
dm: add zone open, close and finish support
...
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bcache/Makefile | 2 | ||||
-rw-r--r-- | drivers/md/bcache/alloc.c | 5 | ||||
-rw-r--r-- | drivers/md/bcache/bcache.h | 4 | ||||
-rw-r--r-- | drivers/md/bcache/bset.c | 17 | ||||
-rw-r--r-- | drivers/md/bcache/btree.c | 19 | ||||
-rw-r--r-- | drivers/md/bcache/closure.c | 7 | ||||
-rw-r--r-- | drivers/md/bcache/request.c | 12 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 56 | ||||
-rw-r--r-- | drivers/md/bcache/sysfs.c | 7 | ||||
-rw-r--r-- | drivers/md/bcache/writeback.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-flakey.c | 7 | ||||
-rw-r--r-- | drivers/md/dm-linear.c | 2 | ||||
-rw-r--r-- | drivers/md/dm.c | 5 | ||||
-rw-r--r-- | drivers/md/md-bitmap.c | 2 | ||||
-rw-r--r-- | drivers/md/md-linear.c | 5 | ||||
-rw-r--r-- | drivers/md/md-multipath.c | 5 | ||||
-rw-r--r-- | drivers/md/md.c | 57 | ||||
-rw-r--r-- | drivers/md/md.h | 4 | ||||
-rw-r--r-- | drivers/md/raid0.c | 7 | ||||
-rw-r--r-- | drivers/md/raid1.c | 6 | ||||
-rw-r--r-- | drivers/md/raid10.c | 7 | ||||
-rw-r--r-- | drivers/md/raid5-ppl.c | 2 | ||||
-rw-r--r-- | drivers/md/raid5.c | 8 |
23 files changed, 153 insertions, 97 deletions
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index d26b35195825..fd714628da6a 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -5,5 +5,3 @@ obj-$(CONFIG_BCACHE) += bcache.o bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ util.o writeback.o - -CFLAGS_request.o += -Iblock diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 6f776823b9ba..a1df0d95151c 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -377,7 +377,10 @@ retry_invalidate: if (!fifo_full(&ca->free_inc)) goto retry_invalidate; - bch_prio_write(ca); + if (bch_prio_write(ca, false) < 0) { + ca->invalidate_needs_gc = 1; + wake_up_gc(ca->set); + } } } out: diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 013e35a9e317..9198c1b480d9 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -582,6 +582,7 @@ struct cache_set { */ wait_queue_head_t btree_cache_wait; struct task_struct *btree_cache_alloc_lock; + spinlock_t btree_cannibalize_lock; /* * When we free a btree node, we increment the gen of the bucket the @@ -723,6 +724,7 @@ struct cache_set { unsigned int gc_always_rewrite:1; unsigned int shrinker_disabled:1; unsigned int copy_gc_enabled:1; + unsigned int idle_max_writeback_rate_enabled:1; #define BUCKET_HASH_BITS 12 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; @@ -977,7 +979,7 @@ bool bch_cached_dev_error(struct cached_dev *dc); __printf(2, 3) bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...); -void bch_prio_write(struct cache *ca); +int bch_prio_write(struct cache *ca, bool wait); void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent); extern struct workqueue_struct *bcache_wq; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 08768796b543..cffcdc9feefb 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -155,6 +155,7 @@ int __bch_keylist_realloc(struct keylist *l, unsigned int u64s) return 0; } +/* Pop the top key of keylist by pointing l->top to its previous key */ struct bkey *bch_keylist_pop(struct keylist *l) { struct bkey *k = l->keys; @@ -168,6 +169,7 @@ struct bkey *bch_keylist_pop(struct keylist *l) return l->top = k; } +/* Pop the bottom key of keylist and update l->top_p */ void bch_keylist_pop_front(struct keylist *l) { l->top_p -= bkey_u64s(l->keys); @@ -309,7 +311,6 @@ void bch_btree_keys_free(struct btree_keys *b) t->tree = NULL; t->data = NULL; } -EXPORT_SYMBOL(bch_btree_keys_free); int bch_btree_keys_alloc(struct btree_keys *b, unsigned int page_order, @@ -342,7 +343,6 @@ err: bch_btree_keys_free(b); return -ENOMEM; } -EXPORT_SYMBOL(bch_btree_keys_alloc); void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, bool *expensive_debug_checks) @@ -361,7 +361,6 @@ void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, * any more. */ } -EXPORT_SYMBOL(bch_btree_keys_init); /* Binary tree stuff for auxiliary search trees */ @@ -678,7 +677,6 @@ void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic) bch_bset_build_unwritten_tree(b); } -EXPORT_SYMBOL(bch_bset_init_next); /* * Build auxiliary binary tree 'struct bset_tree *t', this tree is used to @@ -732,7 +730,6 @@ void bch_bset_build_written_tree(struct btree_keys *b) j = inorder_next(j, t->size)) make_bfloat(t, j); } -EXPORT_SYMBOL(bch_bset_build_written_tree); /* Insert */ @@ -780,7 +777,6 @@ fix_right: do { j = j * 2 + 1; } while (j < t->size); } -EXPORT_SYMBOL(bch_bset_fix_invalidated_key); static void bch_bset_fix_lookup_table(struct btree_keys *b, struct bset_tree *t, @@ -855,7 +851,6 @@ bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r) return b->ops->key_merge(b, l, r); } -EXPORT_SYMBOL(bch_bkey_try_merge); void bch_bset_insert(struct btree_keys *b, struct bkey *where, struct bkey *insert) @@ -875,7 +870,6 @@ void bch_bset_insert(struct btree_keys *b, struct bkey *where, bkey_copy(where, insert); bch_bset_fix_lookup_table(b, t, where); } -EXPORT_SYMBOL(bch_bset_insert); unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, struct bkey *replace_key) @@ -931,7 +925,6 @@ copy: bkey_copy(m, k); merged: return status; } -EXPORT_SYMBOL(bch_btree_insert_key); /* Lookup */ @@ -1077,7 +1070,6 @@ struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, return i.l; } -EXPORT_SYMBOL(__bch_bset_search); /* Btree iterator */ @@ -1132,7 +1124,6 @@ struct bkey *bch_btree_iter_init(struct btree_keys *b, { return __bch_btree_iter_init(b, iter, search, b->set); } -EXPORT_SYMBOL(bch_btree_iter_init); static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, btree_iter_cmp_fn *cmp) @@ -1165,7 +1156,6 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter) return __bch_btree_iter_next(iter, btree_iter_cmp); } -EXPORT_SYMBOL(bch_btree_iter_next); struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, struct btree_keys *b, ptr_filter_fn fn) @@ -1196,7 +1186,6 @@ int bch_bset_sort_state_init(struct bset_sort_state *state, return mempool_init_page_pool(&state->pool, 1, page_order); } -EXPORT_SYMBOL(bch_bset_sort_state_init); static void btree_mergesort(struct btree_keys *b, struct bset *out, struct btree_iter *iter, @@ -1313,7 +1302,6 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); } -EXPORT_SYMBOL(bch_btree_sort_partial); void bch_btree_sort_and_fix_extents(struct btree_keys *b, struct btree_iter *iter, @@ -1366,7 +1354,6 @@ void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state) out: bch_bset_build_written_tree(b); } -EXPORT_SYMBOL(bch_btree_sort_lazy); void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats) { diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index ba434d9ac720..14d6c33b0957 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -543,6 +543,11 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) set_btree_node_dirty(b); + /* + * w->journal is always the oldest journal pin of all bkeys + * in the leaf node, to make sure the oldest jset seq won't + * be increased before this btree node is flushed. + */ if (journal_ref) { if (w->journal && journal_pin_cmp(b->c, w->journal, journal_ref)) { @@ -723,6 +728,8 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, * IO can always make forward progress: */ nr /= c->btree_pages; + if (nr == 0) + nr = 1; nr = min_t(unsigned long, nr, mca_can_free(c)); i = 0; @@ -884,15 +891,17 @@ out: static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op) { - struct task_struct *old; - - old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current); - if (old && old != current) { + spin_lock(&c->btree_cannibalize_lock); + if (likely(c->btree_cache_alloc_lock == NULL)) { + c->btree_cache_alloc_lock = current; + } else if (c->btree_cache_alloc_lock != current) { if (op) prepare_to_wait(&c->btree_cache_wait, &op->wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&c->btree_cannibalize_lock); return -EINTR; } + spin_unlock(&c->btree_cannibalize_lock); return 0; } @@ -927,10 +936,12 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, */ static void bch_cannibalize_unlock(struct cache_set *c) { + spin_lock(&c->btree_cannibalize_lock); if (c->btree_cache_alloc_lock == current) { c->btree_cache_alloc_lock = NULL; wake_up(&c->btree_cache_wait); } + spin_unlock(&c->btree_cannibalize_lock); } static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index c12cd809ab19..0164a1fe94a9 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -45,7 +45,6 @@ void closure_sub(struct closure *cl, int v) { closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); } -EXPORT_SYMBOL(closure_sub); /* * closure_put - decrement a closure's refcount @@ -54,7 +53,6 @@ void closure_put(struct closure *cl) { closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); } -EXPORT_SYMBOL(closure_put); /* * closure_wake_up - wake up all closures on a wait list, without memory barrier @@ -76,7 +74,6 @@ void __closure_wake_up(struct closure_waitlist *wait_list) closure_sub(cl, CLOSURE_WAITING + 1); } } -EXPORT_SYMBOL(__closure_wake_up); /** * closure_wait - add a closure to a waitlist @@ -96,7 +93,6 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) return true; } -EXPORT_SYMBOL(closure_wait); struct closure_syncer { struct task_struct *task; @@ -131,7 +127,6 @@ void __sched __closure_sync(struct closure *cl) __set_current_state(TASK_RUNNING); } -EXPORT_SYMBOL(__closure_sync); #ifdef CONFIG_BCACHE_CLOSURES_DEBUG @@ -149,7 +144,6 @@ void closure_debug_create(struct closure *cl) list_add(&cl->all, &closure_list); spin_unlock_irqrestore(&closure_list_lock, flags); } -EXPORT_SYMBOL(closure_debug_create); void closure_debug_destroy(struct closure *cl) { @@ -162,7 +156,6 @@ void closure_debug_destroy(struct closure *cl) list_del(&cl->all); spin_unlock_irqrestore(&closure_list_lock, flags); } -EXPORT_SYMBOL(closure_debug_destroy); static struct dentry *closure_debug; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 41adcd1546f1..73478a91a342 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -62,18 +62,6 @@ static void bch_data_insert_keys(struct closure *cl) struct bkey *replace_key = op->replace ? &op->replace_key : NULL; int ret; - /* - * If we're looping, might already be waiting on - * another journal write - can't wait on more than one journal write at - * a time - * - * XXX: this looks wrong - */ -#if 0 - while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING) - closure_sync(&s->cl); -#endif - if (!op->replace) journal_ref = bch_journal(op->c, &op->insert_keys, op->flush_journal ? cl : NULL); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 20ed838e9413..77e9869345e7 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -92,10 +92,11 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u", sb->version, sb->flags, sb->seq, sb->keys); - err = "Not a bcache superblock"; + err = "Not a bcache superblock (bad offset)"; if (sb->offset != SB_SECTOR) goto err; + err = "Not a bcache superblock (bad magic)"; if (memcmp(sb->magic, bcache_magic, 16)) goto err; @@ -529,12 +530,29 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, closure_sync(cl); } -void bch_prio_write(struct cache *ca) +int bch_prio_write(struct cache *ca, bool wait) { int i; struct bucket *b; struct closure cl; + pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu", + fifo_used(&ca->free[RESERVE_PRIO]), + fifo_used(&ca->free[RESERVE_NONE]), + fifo_used(&ca->free_inc)); + + /* + * Pre-check if there are enough free buckets. In the non-blocking + * scenario it's better to fail early rather than starting to allocate + * buckets and do a cleanup later in case of failure. + */ + if (!wait) { + size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) + + fifo_used(&ca->free[RESERVE_NONE]); + if (prio_buckets(ca) > avail) + return -ENOMEM; + } + closure_init_stack(&cl); lockdep_assert_held(&ca->set->bucket_lock); @@ -544,9 +562,6 @@ void bch_prio_write(struct cache *ca) atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), &ca->meta_sectors_written); - //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), - // fifo_used(&ca->free_inc), fifo_used(&ca->unused)); - for (i = prio_buckets(ca) - 1; i >= 0; --i) { long bucket; struct prio_set *p = ca->disk_buckets; @@ -564,7 +579,7 @@ void bch_prio_write(struct cache *ca) p->magic = pset_magic(&ca->sb); p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); - bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true); + bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait); BUG_ON(bucket == -1); mutex_unlock(&ca->set->bucket_lock); @@ -593,6 +608,7 @@ void bch_prio_write(struct cache *ca) ca->prio_last_buckets[i] = ca->prio_buckets[i]; } + return 0; } static void prio_read(struct cache *ca, uint64_t bucket) @@ -761,20 +777,28 @@ static inline int idx_to_first_minor(int idx) static void bcache_device_free(struct bcache_device *d) { + struct gendisk *disk = d->disk; + lockdep_assert_held(&bch_register_lock); - pr_info("%s stopped", d->disk->disk_name); + if (disk) + pr_info("%s stopped", disk->disk_name); + else + pr_err("bcache device (NULL gendisk) stopped"); if (d->c) bcache_device_detach(d); - if (d->disk && d->disk->flags & GENHD_FL_UP) - del_gendisk(d->disk); - if (d->disk && d->disk->queue) - blk_cleanup_queue(d->disk->queue); - if (d->disk) { + + if (disk) { + if (disk->flags & GENHD_FL_UP) + del_gendisk(disk); + + if (disk->queue) + blk_cleanup_queue(disk->queue); + ida_simple_remove(&bcache_device_idx, - first_minor_to_idx(d->disk->first_minor)); - put_disk(d->disk); + first_minor_to_idx(disk->first_minor)); + put_disk(disk); } bioset_exit(&d->bio_split); @@ -1769,6 +1793,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) sema_init(&c->sb_write_mutex, 1); mutex_init(&c->bucket_lock); init_waitqueue_head(&c->btree_cache_wait); + spin_lock_init(&c->btree_cannibalize_lock); init_waitqueue_head(&c->bucket_wait); init_waitqueue_head(&c->gc_wait); sema_init(&c->uuid_write_mutex, 1); @@ -1809,6 +1834,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->congested_read_threshold_us = 2000; c->congested_write_threshold_us = 20000; c->error_limit = DEFAULT_IO_ERROR_LIMIT; + c->idle_max_writeback_rate_enabled = 1; WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags)); return c; @@ -1954,7 +1980,7 @@ static int run_cache_set(struct cache_set *c) mutex_lock(&c->bucket_lock); for_each_cache(ca, c, i) - bch_prio_write(ca); + bch_prio_write(ca, true); mutex_unlock(&c->bucket_lock); err = "cannot allocate new UUID bucket"; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 627dcea0f5b6..733e2ddf3c78 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -134,6 +134,7 @@ rw_attribute(expensive_debug_checks); rw_attribute(cache_replacement_policy); rw_attribute(btree_shrinker_disabled); rw_attribute(copy_gc_enabled); +rw_attribute(idle_max_writeback_rate); rw_attribute(gc_after_writeback); rw_attribute(size); @@ -747,6 +748,8 @@ SHOW(__bch_cache_set) sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + sysfs_printf(idle_max_writeback_rate, "%i", + c->idle_max_writeback_rate_enabled); sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback); sysfs_printf(io_disable, "%i", test_bit(CACHE_SET_IO_DISABLE, &c->flags)); @@ -864,6 +867,9 @@ STORE(__bch_cache_set) sysfs_strtoul_bool(gc_always_rewrite, c->gc_always_rewrite); sysfs_strtoul_bool(btree_shrinker_disabled, c->shrinker_disabled); sysfs_strtoul_bool(copy_gc_enabled, c->copy_gc_enabled); + sysfs_strtoul_bool(idle_max_writeback_rate, + c->idle_max_writeback_rate_enabled); + /* * write gc_after_writeback here may overwrite an already set * BCH_DO_AUTO_GC, it doesn't matter because this flag will be @@ -954,6 +960,7 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_gc_always_rewrite, &sysfs_btree_shrinker_disabled, &sysfs_copy_gc_enabled, + &sysfs_idle_max_writeback_rate, &sysfs_gc_after_writeback, &sysfs_io_disable, &sysfs_cutoff_writeback, diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index d60268fe49e1..4a40f9eadeaf 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -122,6 +122,10 @@ static void __update_writeback_rate(struct cached_dev *dc) static bool set_at_max_writeback_rate(struct cache_set *c, struct cached_dev *dc) { + /* Don't sst max writeback rate if it is disabled */ + if (!c->idle_max_writeback_rate_enabled) + return false; + /* Don't set max writeback rate if gc is running */ if (!c->gc_mark_valid) return false; diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 2900fbde89b3..76587e9af0ef 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -280,7 +280,7 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) struct flakey_c *fc = ti->private; bio_set_dev(bio, fc->dev->bdev); - if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) + if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio))) bio->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector); } @@ -322,8 +322,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); pb->bio_submitted = false; - /* Do not fail reset zone */ - if (bio_op(bio) == REQ_OP_ZONE_RESET) + if (op_is_zone_mgmt(bio_op(bio))) goto map_bio; /* Are we alive ? */ @@ -384,7 +383,7 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, struct flakey_c *fc = ti->private; struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); - if (bio_op(bio) == REQ_OP_ZONE_RESET) + if (op_is_zone_mgmt(bio_op(bio))) return DM_ENDIO_DONE; if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) { diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index ecefe6703736..97acafd48c85 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -90,7 +90,7 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) struct linear_c *lc = ti->private; bio_set_dev(bio, lc->dev->bdev); - if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) + if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio))) bio->bi_iter.bi_sector = linear_map_sector(ti, bio->bi_iter.bi_sector); } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 1a5e328c443a..bc143c1b2333 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1174,7 +1174,8 @@ static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, /* * A target may call dm_accept_partial_bio only from the map routine. It is - * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET. + * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_RESET, + * REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH. * * dm_accept_partial_bio informs the dm that the target only wants to process * additional n_sectors sectors of the bio and the rest of the data should be @@ -1627,7 +1628,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, ci.sector_count = 0; error = __send_empty_flush(&ci); /* dec_pending submits any data associated with flush */ - } else if (bio_op(bio) == REQ_OP_ZONE_RESET) { + } else if (op_is_zone_mgmt(bio_op(bio))) { ci.bio = bio; ci.sector_count = 0; error = __split_and_process_non_flush(&ci); diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index b092c7b5282f..3ad18246fcb3 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2139,6 +2139,7 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, memcpy(page_address(store.sb_page), page_address(bitmap->storage.sb_page), sizeof(bitmap_super_t)); + spin_lock_irq(&bitmap->counts.lock); md_bitmap_file_unmap(&bitmap->storage); bitmap->storage = store; @@ -2154,7 +2155,6 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, blocks = min(old_counts.chunks << old_counts.chunkshift, chunks << chunkshift); - spin_lock_irq(&bitmap->counts.lock); /* For cluster raid, need to pre-allocate bitmap */ if (mddev_is_clustered(bitmap->mddev)) { unsigned long page; diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index c766c559d36d..26c75c0199fa 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -244,10 +244,9 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) sector_t start_sector, end_sector, data_offset; sector_t bio_sector = bio->bi_iter.bi_sector; - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } tmp_dev = which_dev(mddev, bio_sector); start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 6780938d2991..152f9e65a226 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -104,10 +104,9 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio) struct multipath_bh * mp_bh; struct multipath_info *multipath; - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } mp_bh = mempool_alloc(&conf->pool, GFP_NOIO); diff --git a/drivers/md/md.c b/drivers/md/md.c index 1be7abeb24fd..805b33e27496 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -550,7 +550,13 @@ static void md_submit_flush_data(struct work_struct *ws) } } -void md_flush_request(struct mddev *mddev, struct bio *bio) +/* + * Manages consolidation of flushes and submitting any flushes needed for + * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is + * being finished in another context. Returns false if the flushing is + * complete but still needs the I/O portion of the bio to be processed. + */ +bool md_flush_request(struct mddev *mddev, struct bio *bio) { ktime_t start = ktime_get_boottime(); spin_lock_irq(&mddev->lock); @@ -575,9 +581,10 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) bio_endio(bio); else { bio->bi_opf &= ~REQ_PREFLUSH; - mddev->pers->make_request(mddev, bio); + return false; } } + return true; } EXPORT_SYMBOL(md_flush_request); @@ -1098,6 +1105,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; mdp_super_t *sb; int ret; + bool spare_disk = true; /* * Calculate the position of the superblock (512byte sectors), @@ -1148,8 +1156,18 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor else rdev->desc_nr = sb->this_disk.number; + /* not spare disk, or LEVEL_MULTIPATH */ + if (sb->level == LEVEL_MULTIPATH || + (rdev->desc_nr >= 0 && + sb->disks[rdev->desc_nr].state & + ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))) + spare_disk = false; + if (!refdev) { - ret = 1; + if (!spare_disk) + ret = 1; + else + ret = 0; } else { __u64 ev1, ev2; mdp_super_t *refsb = page_address(refdev->sb_page); @@ -1165,7 +1183,8 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor } ev1 = md_event(sb); ev2 = md_event(refsb); - if (ev1 > ev2) + + if (!spare_disk && ev1 > ev2) ret = 1; else ret = 0; @@ -1525,6 +1544,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ sector_t sectors; char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; int bmask; + bool spare_disk = true; /* * Calculate the position of the superblock in 512byte sectors. @@ -1658,8 +1678,19 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ sb->level != 0) return -EINVAL; + /* not spare disk, or LEVEL_MULTIPATH */ + if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) || + (rdev->desc_nr >= 0 && + rdev->desc_nr < le32_to_cpu(sb->max_dev) && + (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))) + spare_disk = false; + if (!refdev) { - ret = 1; + if (!spare_disk) + ret = 1; + else + ret = 0; } else { __u64 ev1, ev2; struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); @@ -1676,7 +1707,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ ev1 = le64_to_cpu(sb->events); ev2 = le64_to_cpu(refsb->events); - if (ev1 > ev2) + if (!spare_disk && ev1 > ev2) ret = 1; else ret = 0; @@ -3597,7 +3628,7 @@ abort_free: * Check a full RAID array for plausibility */ -static void analyze_sbs(struct mddev *mddev) +static int analyze_sbs(struct mddev *mddev) { int i; struct md_rdev *rdev, *freshest, *tmp; @@ -3618,6 +3649,12 @@ static void analyze_sbs(struct mddev *mddev) md_kick_rdev_from_array(rdev); } + /* Cannot find a valid fresh disk */ + if (!freshest) { + pr_warn("md: cannot find a valid disk\n"); + return -EINVAL; + } + super_types[mddev->major_version]. validate_super(mddev, freshest); @@ -3652,6 +3689,8 @@ static void analyze_sbs(struct mddev *mddev) clear_bit(In_sync, &rdev->flags); } } + + return 0; } /* Read a fixed-point number. @@ -5570,7 +5609,9 @@ int md_run(struct mddev *mddev) if (!mddev->raid_disks) { if (!mddev->persistent) return -EINVAL; - analyze_sbs(mddev); + err = analyze_sbs(mddev); + if (err) + return -EINVAL; } if (mddev->level != LEVEL_NONE) diff --git a/drivers/md/md.h b/drivers/md/md.h index c5e3ff398b59..5f86f8adb0a4 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -550,7 +550,7 @@ struct md_personality int level; struct list_head list; struct module *owner; - bool (*make_request)(struct mddev *mddev, struct bio *bio); + bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio); /* * start up works that do NOT require md_thread. tasks that * requires md_thread should go into start() @@ -703,7 +703,7 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); extern int mddev_congested(struct mddev *mddev, int bits); -extern void md_flush_request(struct mddev *mddev, struct bio *bio); +extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, sector_t sector, int size, struct page *page); extern int md_super_wait(struct mddev *mddev); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 1e772287b1c8..b7c20979bd19 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -575,10 +575,9 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) unsigned chunk_sects; unsigned sectors; - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) { raid0_handle_discard(mddev, bio); @@ -615,7 +614,7 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) tmp_dev = map_sector(mddev, zone, sector, §or); break; default: - WARN("md/raid0:%s: Invalid layout\n", mdname(mddev)); + WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev)); bio_io_error(bio); return true; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 0466ee2453b4..a409ab6f30bc 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -819,6 +819,7 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio) else generic_make_request(bio); bio = next; + cond_resched(); } } @@ -1567,10 +1568,9 @@ static bool raid1_make_request(struct mddev *mddev, struct bio *bio) { sector_t sectors; - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } /* * There is a limit to the maximum size, but diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 299c7b1c9718..ec136e44aef7 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -191,7 +191,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) out_free_pages: while (--j >= 0) - resync_free_pages(&rps[j * 2]); + resync_free_pages(&rps[j]); j = 0; out_free_bio: @@ -1525,10 +1525,9 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio) int chunk_sects = chunk_mask + 1; int sectors = bio_sectors(bio); - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } if (!md_write_start(mddev, bio)) return false; diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 18a4064a61a8..cab5b1352892 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1404,7 +1404,7 @@ int ppl_init_log(struct r5conf *conf) atomic64_set(&ppl_conf->seq, 0); INIT_LIST_HEAD(&ppl_conf->no_mem_stripes); spin_lock_init(&ppl_conf->no_mem_stripes_lock); - ppl_conf->write_hint = RWF_WRITE_LIFE_NOT_SET; + ppl_conf->write_hint = RWH_WRITE_LIFE_NOT_SET; if (!mddev->external) { ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid)); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 223e97ab27e6..f0fc538bfe59 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1134,7 +1134,7 @@ again: bi->bi_iter.bi_size = STRIPE_SIZE; bi->bi_write_hint = sh->dev[i].write_hint; if (!rrdev) - sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET; + sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET; /* * If this is discard request, set bi_vcnt 0. We don't * want to confuse SCSI because SCSI will replace payload @@ -1187,7 +1187,7 @@ again: rbi->bi_io_vec[0].bv_offset = 0; rbi->bi_iter.bi_size = STRIPE_SIZE; rbi->bi_write_hint = sh->dev[i].write_hint; - sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET; + sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET; /* * If this is discard request, set bi_vcnt 0. We don't * want to confuse SCSI because SCSI will replace payload @@ -5592,8 +5592,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) if (ret == 0) return true; if (ret == -ENODEV) { - md_flush_request(mddev, bi); - return true; + if (md_flush_request(mddev, bi)) + return true; } /* ret == -EAGAIN, fallback */ /* |