diff options
Diffstat (limited to 'drivers/md')
43 files changed, 779 insertions, 739 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index aa98953f4462..d6d5ab23c088 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -38,9 +38,9 @@ config MD_AUTODETECT default y ---help--- If you say Y here, then the kernel will try to autodetect raid - arrays as part of its boot process. + arrays as part of its boot process. - If you don't use raid and say Y, this autodetection can cause + If you don't use raid and say Y, this autodetection can cause a several-second delay in the boot time due to various synchronisation steps that are part of this step. @@ -290,7 +290,7 @@ config DM_SNAPSHOT depends on BLK_DEV_DM select DM_BUFIO ---help--- - Allow volume managers to take writable snapshots of a device. + Allow volume managers to take writable snapshots of a device. config DM_THIN_PROVISIONING tristate "Thin provisioning target" @@ -298,7 +298,7 @@ config DM_THIN_PROVISIONING select DM_PERSISTENT_DATA select DM_BIO_PRISON ---help--- - Provides thin provisioning and snapshots that share a data store. + Provides thin provisioning and snapshots that share a data store. config DM_CACHE tristate "Cache target (EXPERIMENTAL)" @@ -307,23 +307,23 @@ config DM_CACHE select DM_PERSISTENT_DATA select DM_BIO_PRISON ---help--- - dm-cache attempts to improve performance of a block device by - moving frequently used data to a smaller, higher performance - device. Different 'policy' plugins can be used to change the - algorithms used to select which blocks are promoted, demoted, - cleaned etc. It supports writeback and writethrough modes. + dm-cache attempts to improve performance of a block device by + moving frequently used data to a smaller, higher performance + device. Different 'policy' plugins can be used to change the + algorithms used to select which blocks are promoted, demoted, + cleaned etc. It supports writeback and writethrough modes. config DM_CACHE_SMQ tristate "Stochastic MQ Cache Policy (EXPERIMENTAL)" depends on DM_CACHE default y ---help--- - A cache policy that uses a multiqueue ordered by recent hits - to select which blocks should be promoted and demoted. - This is meant to be a general purpose policy. It prioritises - reads over writes. This SMQ policy (vs MQ) offers the promise - of less memory utilization, improved performance and increased - adaptability in the face of changing workloads. + A cache policy that uses a multiqueue ordered by recent hits + to select which blocks should be promoted and demoted. + This is meant to be a general purpose policy. It prioritises + reads over writes. This SMQ policy (vs MQ) offers the promise + of less memory utilization, improved performance and increased + adaptability in the face of changing workloads. config DM_WRITECACHE tristate "Writecache target" @@ -343,9 +343,9 @@ config DM_ERA select DM_PERSISTENT_DATA select DM_BIO_PRISON ---help--- - dm-era tracks which parts of a block device are written to - over time. Useful for maintaining cache coherency when using - vendor snapshots. + dm-era tracks which parts of a block device are written to + over time. Useful for maintaining cache coherency when using + vendor snapshots. config DM_CLONE tristate "Clone target (EXPERIMENTAL)" @@ -353,20 +353,20 @@ config DM_CLONE default n select DM_PERSISTENT_DATA ---help--- - dm-clone produces a one-to-one copy of an existing, read-only source - device into a writable destination device. The cloned device is - visible/mountable immediately and the copy of the source device to the - destination device happens in the background, in parallel with user - I/O. + dm-clone produces a one-to-one copy of an existing, read-only source + device into a writable destination device. The cloned device is + visible/mountable immediately and the copy of the source device to the + destination device happens in the background, in parallel with user + I/O. - If unsure, say N. + If unsure, say N. config DM_MIRROR tristate "Mirror target" depends on BLK_DEV_DM ---help--- - Allow volume managers to mirror logical volumes, also - needed for live data migration tools such as 'pvmove'. + Allow volume managers to mirror logical volumes, also + needed for live data migration tools such as 'pvmove'. config DM_LOG_USERSPACE tristate "Mirror userspace logging" @@ -483,7 +483,7 @@ config DM_FLAKEY tristate "Flakey target" depends on BLK_DEV_DM ---help--- - A target that intermittently fails I/O for debugging purposes. + A target that intermittently fails I/O for debugging purposes. config DM_VERITY tristate "Verity target support" diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index d26b35195825..fd714628da6a 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -5,5 +5,3 @@ obj-$(CONFIG_BCACHE) += bcache.o bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ util.o writeback.o - -CFLAGS_request.o += -Iblock diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 6f776823b9ba..a1df0d95151c 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -377,7 +377,10 @@ retry_invalidate: if (!fifo_full(&ca->free_inc)) goto retry_invalidate; - bch_prio_write(ca); + if (bch_prio_write(ca, false) < 0) { + ca->invalidate_needs_gc = 1; + wake_up_gc(ca->set); + } } } out: diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 013e35a9e317..9198c1b480d9 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -582,6 +582,7 @@ struct cache_set { */ wait_queue_head_t btree_cache_wait; struct task_struct *btree_cache_alloc_lock; + spinlock_t btree_cannibalize_lock; /* * When we free a btree node, we increment the gen of the bucket the @@ -723,6 +724,7 @@ struct cache_set { unsigned int gc_always_rewrite:1; unsigned int shrinker_disabled:1; unsigned int copy_gc_enabled:1; + unsigned int idle_max_writeback_rate_enabled:1; #define BUCKET_HASH_BITS 12 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; @@ -977,7 +979,7 @@ bool bch_cached_dev_error(struct cached_dev *dc); __printf(2, 3) bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...); -void bch_prio_write(struct cache *ca); +int bch_prio_write(struct cache *ca, bool wait); void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent); extern struct workqueue_struct *bcache_wq; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 08768796b543..cffcdc9feefb 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -155,6 +155,7 @@ int __bch_keylist_realloc(struct keylist *l, unsigned int u64s) return 0; } +/* Pop the top key of keylist by pointing l->top to its previous key */ struct bkey *bch_keylist_pop(struct keylist *l) { struct bkey *k = l->keys; @@ -168,6 +169,7 @@ struct bkey *bch_keylist_pop(struct keylist *l) return l->top = k; } +/* Pop the bottom key of keylist and update l->top_p */ void bch_keylist_pop_front(struct keylist *l) { l->top_p -= bkey_u64s(l->keys); @@ -309,7 +311,6 @@ void bch_btree_keys_free(struct btree_keys *b) t->tree = NULL; t->data = NULL; } -EXPORT_SYMBOL(bch_btree_keys_free); int bch_btree_keys_alloc(struct btree_keys *b, unsigned int page_order, @@ -342,7 +343,6 @@ err: bch_btree_keys_free(b); return -ENOMEM; } -EXPORT_SYMBOL(bch_btree_keys_alloc); void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, bool *expensive_debug_checks) @@ -361,7 +361,6 @@ void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, * any more. */ } -EXPORT_SYMBOL(bch_btree_keys_init); /* Binary tree stuff for auxiliary search trees */ @@ -678,7 +677,6 @@ void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic) bch_bset_build_unwritten_tree(b); } -EXPORT_SYMBOL(bch_bset_init_next); /* * Build auxiliary binary tree 'struct bset_tree *t', this tree is used to @@ -732,7 +730,6 @@ void bch_bset_build_written_tree(struct btree_keys *b) j = inorder_next(j, t->size)) make_bfloat(t, j); } -EXPORT_SYMBOL(bch_bset_build_written_tree); /* Insert */ @@ -780,7 +777,6 @@ fix_right: do { j = j * 2 + 1; } while (j < t->size); } -EXPORT_SYMBOL(bch_bset_fix_invalidated_key); static void bch_bset_fix_lookup_table(struct btree_keys *b, struct bset_tree *t, @@ -855,7 +851,6 @@ bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r) return b->ops->key_merge(b, l, r); } -EXPORT_SYMBOL(bch_bkey_try_merge); void bch_bset_insert(struct btree_keys *b, struct bkey *where, struct bkey *insert) @@ -875,7 +870,6 @@ void bch_bset_insert(struct btree_keys *b, struct bkey *where, bkey_copy(where, insert); bch_bset_fix_lookup_table(b, t, where); } -EXPORT_SYMBOL(bch_bset_insert); unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, struct bkey *replace_key) @@ -931,7 +925,6 @@ copy: bkey_copy(m, k); merged: return status; } -EXPORT_SYMBOL(bch_btree_insert_key); /* Lookup */ @@ -1077,7 +1070,6 @@ struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, return i.l; } -EXPORT_SYMBOL(__bch_bset_search); /* Btree iterator */ @@ -1132,7 +1124,6 @@ struct bkey *bch_btree_iter_init(struct btree_keys *b, { return __bch_btree_iter_init(b, iter, search, b->set); } -EXPORT_SYMBOL(bch_btree_iter_init); static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, btree_iter_cmp_fn *cmp) @@ -1165,7 +1156,6 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter) return __bch_btree_iter_next(iter, btree_iter_cmp); } -EXPORT_SYMBOL(bch_btree_iter_next); struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, struct btree_keys *b, ptr_filter_fn fn) @@ -1196,7 +1186,6 @@ int bch_bset_sort_state_init(struct bset_sort_state *state, return mempool_init_page_pool(&state->pool, 1, page_order); } -EXPORT_SYMBOL(bch_bset_sort_state_init); static void btree_mergesort(struct btree_keys *b, struct bset *out, struct btree_iter *iter, @@ -1313,7 +1302,6 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); } -EXPORT_SYMBOL(bch_btree_sort_partial); void bch_btree_sort_and_fix_extents(struct btree_keys *b, struct btree_iter *iter, @@ -1366,7 +1354,6 @@ void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state) out: bch_bset_build_written_tree(b); } -EXPORT_SYMBOL(bch_btree_sort_lazy); void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats) { diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index ba434d9ac720..14d6c33b0957 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -543,6 +543,11 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) set_btree_node_dirty(b); + /* + * w->journal is always the oldest journal pin of all bkeys + * in the leaf node, to make sure the oldest jset seq won't + * be increased before this btree node is flushed. + */ if (journal_ref) { if (w->journal && journal_pin_cmp(b->c, w->journal, journal_ref)) { @@ -723,6 +728,8 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, * IO can always make forward progress: */ nr /= c->btree_pages; + if (nr == 0) + nr = 1; nr = min_t(unsigned long, nr, mca_can_free(c)); i = 0; @@ -884,15 +891,17 @@ out: static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op) { - struct task_struct *old; - - old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current); - if (old && old != current) { + spin_lock(&c->btree_cannibalize_lock); + if (likely(c->btree_cache_alloc_lock == NULL)) { + c->btree_cache_alloc_lock = current; + } else if (c->btree_cache_alloc_lock != current) { if (op) prepare_to_wait(&c->btree_cache_wait, &op->wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&c->btree_cannibalize_lock); return -EINTR; } + spin_unlock(&c->btree_cannibalize_lock); return 0; } @@ -927,10 +936,12 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, */ static void bch_cannibalize_unlock(struct cache_set *c) { + spin_lock(&c->btree_cannibalize_lock); if (c->btree_cache_alloc_lock == current) { c->btree_cache_alloc_lock = NULL; wake_up(&c->btree_cache_wait); } + spin_unlock(&c->btree_cannibalize_lock); } static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index c12cd809ab19..0164a1fe94a9 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -45,7 +45,6 @@ void closure_sub(struct closure *cl, int v) { closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); } -EXPORT_SYMBOL(closure_sub); /* * closure_put - decrement a closure's refcount @@ -54,7 +53,6 @@ void closure_put(struct closure *cl) { closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); } -EXPORT_SYMBOL(closure_put); /* * closure_wake_up - wake up all closures on a wait list, without memory barrier @@ -76,7 +74,6 @@ void __closure_wake_up(struct closure_waitlist *wait_list) closure_sub(cl, CLOSURE_WAITING + 1); } } -EXPORT_SYMBOL(__closure_wake_up); /** * closure_wait - add a closure to a waitlist @@ -96,7 +93,6 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) return true; } -EXPORT_SYMBOL(closure_wait); struct closure_syncer { struct task_struct *task; @@ -131,7 +127,6 @@ void __sched __closure_sync(struct closure *cl) __set_current_state(TASK_RUNNING); } -EXPORT_SYMBOL(__closure_sync); #ifdef CONFIG_BCACHE_CLOSURES_DEBUG @@ -149,7 +144,6 @@ void closure_debug_create(struct closure *cl) list_add(&cl->all, &closure_list); spin_unlock_irqrestore(&closure_list_lock, flags); } -EXPORT_SYMBOL(closure_debug_create); void closure_debug_destroy(struct closure *cl) { @@ -162,7 +156,6 @@ void closure_debug_destroy(struct closure *cl) list_del(&cl->all); spin_unlock_irqrestore(&closure_list_lock, flags); } -EXPORT_SYMBOL(closure_debug_destroy); static struct dentry *closure_debug; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 41adcd1546f1..73478a91a342 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -62,18 +62,6 @@ static void bch_data_insert_keys(struct closure *cl) struct bkey *replace_key = op->replace ? &op->replace_key : NULL; int ret; - /* - * If we're looping, might already be waiting on - * another journal write - can't wait on more than one journal write at - * a time - * - * XXX: this looks wrong - */ -#if 0 - while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING) - closure_sync(&s->cl); -#endif - if (!op->replace) journal_ref = bch_journal(op->c, &op->insert_keys, op->flush_journal ? cl : NULL); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 20ed838e9413..77e9869345e7 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -92,10 +92,11 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u", sb->version, sb->flags, sb->seq, sb->keys); - err = "Not a bcache superblock"; + err = "Not a bcache superblock (bad offset)"; if (sb->offset != SB_SECTOR) goto err; + err = "Not a bcache superblock (bad magic)"; if (memcmp(sb->magic, bcache_magic, 16)) goto err; @@ -529,12 +530,29 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, closure_sync(cl); } -void bch_prio_write(struct cache *ca) +int bch_prio_write(struct cache *ca, bool wait) { int i; struct bucket *b; struct closure cl; + pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu", + fifo_used(&ca->free[RESERVE_PRIO]), + fifo_used(&ca->free[RESERVE_NONE]), + fifo_used(&ca->free_inc)); + + /* + * Pre-check if there are enough free buckets. In the non-blocking + * scenario it's better to fail early rather than starting to allocate + * buckets and do a cleanup later in case of failure. + */ + if (!wait) { + size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) + + fifo_used(&ca->free[RESERVE_NONE]); + if (prio_buckets(ca) > avail) + return -ENOMEM; + } + closure_init_stack(&cl); lockdep_assert_held(&ca->set->bucket_lock); @@ -544,9 +562,6 @@ void bch_prio_write(struct cache *ca) atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), &ca->meta_sectors_written); - //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), - // fifo_used(&ca->free_inc), fifo_used(&ca->unused)); - for (i = prio_buckets(ca) - 1; i >= 0; --i) { long bucket; struct prio_set *p = ca->disk_buckets; @@ -564,7 +579,7 @@ void bch_prio_write(struct cache *ca) p->magic = pset_magic(&ca->sb); p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); - bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true); + bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait); BUG_ON(bucket == -1); mutex_unlock(&ca->set->bucket_lock); @@ -593,6 +608,7 @@ void bch_prio_write(struct cache *ca) ca->prio_last_buckets[i] = ca->prio_buckets[i]; } + return 0; } static void prio_read(struct cache *ca, uint64_t bucket) @@ -761,20 +777,28 @@ static inline int idx_to_first_minor(int idx) static void bcache_device_free(struct bcache_device *d) { + struct gendisk *disk = d->disk; + lockdep_assert_held(&bch_register_lock); - pr_info("%s stopped", d->disk->disk_name); + if (disk) + pr_info("%s stopped", disk->disk_name); + else + pr_err("bcache device (NULL gendisk) stopped"); if (d->c) bcache_device_detach(d); - if (d->disk && d->disk->flags & GENHD_FL_UP) - del_gendisk(d->disk); - if (d->disk && d->disk->queue) - blk_cleanup_queue(d->disk->queue); - if (d->disk) { + + if (disk) { + if (disk->flags & GENHD_FL_UP) + del_gendisk(disk); + + if (disk->queue) + blk_cleanup_queue(disk->queue); + ida_simple_remove(&bcache_device_idx, - first_minor_to_idx(d->disk->first_minor)); - put_disk(d->disk); + first_minor_to_idx(disk->first_minor)); + put_disk(disk); } bioset_exit(&d->bio_split); @@ -1769,6 +1793,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) sema_init(&c->sb_write_mutex, 1); mutex_init(&c->bucket_lock); init_waitqueue_head(&c->btree_cache_wait); + spin_lock_init(&c->btree_cannibalize_lock); init_waitqueue_head(&c->bucket_wait); init_waitqueue_head(&c->gc_wait); sema_init(&c->uuid_write_mutex, 1); @@ -1809,6 +1834,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->congested_read_threshold_us = 2000; c->congested_write_threshold_us = 20000; c->error_limit = DEFAULT_IO_ERROR_LIMIT; + c->idle_max_writeback_rate_enabled = 1; WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags)); return c; @@ -1954,7 +1980,7 @@ static int run_cache_set(struct cache_set *c) mutex_lock(&c->bucket_lock); for_each_cache(ca, c, i) - bch_prio_write(ca); + bch_prio_write(ca, true); mutex_unlock(&c->bucket_lock); err = "cannot allocate new UUID bucket"; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 627dcea0f5b6..733e2ddf3c78 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -134,6 +134,7 @@ rw_attribute(expensive_debug_checks); rw_attribute(cache_replacement_policy); rw_attribute(btree_shrinker_disabled); rw_attribute(copy_gc_enabled); +rw_attribute(idle_max_writeback_rate); rw_attribute(gc_after_writeback); rw_attribute(size); @@ -747,6 +748,8 @@ SHOW(__bch_cache_set) sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + sysfs_printf(idle_max_writeback_rate, "%i", + c->idle_max_writeback_rate_enabled); sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback); sysfs_printf(io_disable, "%i", test_bit(CACHE_SET_IO_DISABLE, &c->flags)); @@ -864,6 +867,9 @@ STORE(__bch_cache_set) sysfs_strtoul_bool(gc_always_rewrite, c->gc_always_rewrite); sysfs_strtoul_bool(btree_shrinker_disabled, c->shrinker_disabled); sysfs_strtoul_bool(copy_gc_enabled, c->copy_gc_enabled); + sysfs_strtoul_bool(idle_max_writeback_rate, + c->idle_max_writeback_rate_enabled); + /* * write gc_after_writeback here may overwrite an already set * BCH_DO_AUTO_GC, it doesn't matter because this flag will be @@ -954,6 +960,7 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_gc_always_rewrite, &sysfs_btree_shrinker_disabled, &sysfs_copy_gc_enabled, + &sysfs_idle_max_writeback_rate, &sysfs_gc_after_writeback, &sysfs_io_disable, &sysfs_cutoff_writeback, diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index d60268fe49e1..4a40f9eadeaf 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -122,6 +122,10 @@ static void __update_writeback_rate(struct cached_dev *dc) static bool set_at_max_writeback_rate(struct cache_set *c, struct cached_dev *dc) { + /* Don't sst max writeback rate if it is disabled */ + if (!c->idle_max_writeback_rate_enabled) + return false; + /* Don't set max writeback rate if gc is running */ if (!c->gc_mark_valid) return false; diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c index b5389890bbc3..1f8f98efd97a 100644 --- a/drivers/md/dm-bio-prison-v1.c +++ b/drivers/md/dm-bio-prison-v1.c @@ -150,11 +150,10 @@ static int bio_detain(struct dm_bio_prison *prison, struct dm_bio_prison_cell **cell_result) { int r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } @@ -198,11 +197,9 @@ void dm_cell_release(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell, struct bio_list *bios) { - unsigned long flags; - - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); __cell_release(prison, cell, bios); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); } EXPORT_SYMBOL_GPL(dm_cell_release); @@ -250,12 +247,10 @@ void dm_cell_visit_release(struct dm_bio_prison *prison, void *context, struct dm_bio_prison_cell *cell) { - unsigned long flags; - - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); visit_fn(context, cell); rb_erase(&cell->node, &prison->cells); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); } EXPORT_SYMBOL_GPL(dm_cell_visit_release); @@ -275,11 +270,10 @@ int dm_cell_promote_or_release(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell) { int r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __promote_or_release(prison, cell); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } @@ -379,10 +373,9 @@ EXPORT_SYMBOL_GPL(dm_deferred_entry_dec); int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work) { int r = 1; - unsigned long flags; unsigned next_entry; - spin_lock_irqsave(&ds->lock, flags); + spin_lock_irq(&ds->lock); if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->current_entry].count) r = 0; @@ -392,7 +385,7 @@ int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work) if (!ds->entries[next_entry].count) ds->current_entry = next_entry; } - spin_unlock_irqrestore(&ds->lock, flags); + spin_unlock_irq(&ds->lock); return r; } diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c index b092cdc8e1ae..8ee019eda32d 100644 --- a/drivers/md/dm-bio-prison-v2.c +++ b/drivers/md/dm-bio-prison-v2.c @@ -177,11 +177,10 @@ bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison, struct dm_bio_prison_cell_v2 **cell_result) { int r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __get(prison, key, lock_level, inmate, cell_prealloc, cell_result); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } @@ -261,11 +260,10 @@ int dm_cell_lock_v2(struct dm_bio_prison_v2 *prison, struct dm_bio_prison_cell_v2 **cell_result) { int r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __lock(prison, key, lock_level, cell_prealloc, cell_result); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } @@ -285,11 +283,9 @@ void dm_cell_quiesce_v2(struct dm_bio_prison_v2 *prison, struct dm_bio_prison_cell_v2 *cell, struct work_struct *continuation) { - unsigned long flags; - - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); __quiesce(prison, cell, continuation); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); } EXPORT_SYMBOL_GPL(dm_cell_quiesce_v2); @@ -309,11 +305,10 @@ int dm_cell_lock_promote_v2(struct dm_bio_prison_v2 *prison, unsigned new_lock_level) { int r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __promote(prison, cell, new_lock_level); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } @@ -342,11 +337,10 @@ bool dm_cell_unlock_v2(struct dm_bio_prison_v2 *prison, struct bio_list *bios) { bool r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __unlock(prison, cell, bios); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index d249cf8ac277..2d32821b3a5b 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -74,22 +74,19 @@ static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs) static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs) { bool r; - unsigned long flags; - spin_lock_irqsave(&iot->lock, flags); + spin_lock_irq(&iot->lock); r = __iot_idle_for(iot, jifs); - spin_unlock_irqrestore(&iot->lock, flags); + spin_unlock_irq(&iot->lock); return r; } static void iot_io_begin(struct io_tracker *iot, sector_t len) { - unsigned long flags; - - spin_lock_irqsave(&iot->lock, flags); + spin_lock_irq(&iot->lock); iot->in_flight += len; - spin_unlock_irqrestore(&iot->lock, flags); + spin_unlock_irq(&iot->lock); } static void __iot_io_end(struct io_tracker *iot, sector_t len) @@ -172,7 +169,6 @@ static void __commit(struct work_struct *_ws) { struct batcher *b = container_of(_ws, struct batcher, commit_work); blk_status_t r; - unsigned long flags; struct list_head work_items; struct work_struct *ws, *tmp; struct continuation *k; @@ -186,12 +182,12 @@ static void __commit(struct work_struct *_ws) * We have to grab these before the commit_op to avoid a race * condition. */ - spin_lock_irqsave(&b->lock, flags); + spin_lock_irq(&b->lock); list_splice_init(&b->work_items, &work_items); bio_list_merge(&bios, &b->bios); bio_list_init(&b->bios); b->commit_scheduled = false; - spin_unlock_irqrestore(&b->lock, flags); + spin_unlock_irq(&b->lock); r = b->commit_op(b->commit_context); @@ -238,13 +234,12 @@ static void async_commit(struct batcher *b) static void continue_after_commit(struct batcher *b, struct continuation *k) { - unsigned long flags; bool commit_scheduled; - spin_lock_irqsave(&b->lock, flags); + spin_lock_irq(&b->lock); commit_scheduled = b->commit_scheduled; list_add_tail(&k->ws.entry, &b->work_items); - spin_unlock_irqrestore(&b->lock, flags); + spin_unlock_irq(&b->lock); if (commit_scheduled) async_commit(b); @@ -255,13 +250,12 @@ static void continue_after_commit(struct batcher *b, struct continuation *k) */ static void issue_after_commit(struct batcher *b, struct bio *bio) { - unsigned long flags; bool commit_scheduled; - spin_lock_irqsave(&b->lock, flags); + spin_lock_irq(&b->lock); commit_scheduled = b->commit_scheduled; bio_list_add(&b->bios, bio); - spin_unlock_irqrestore(&b->lock, flags); + spin_unlock_irq(&b->lock); if (commit_scheduled) async_commit(b); @@ -273,12 +267,11 @@ static void issue_after_commit(struct batcher *b, struct bio *bio) static void schedule_commit(struct batcher *b) { bool immediate; - unsigned long flags; - spin_lock_irqsave(&b->lock, flags); + spin_lock_irq(&b->lock); immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios); b->commit_scheduled = true; - spin_unlock_irqrestore(&b->lock, flags); + spin_unlock_irq(&b->lock); if (immediate) async_commit(b); @@ -542,7 +535,7 @@ static void wake_migration_worker(struct cache *cache) static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) { - return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT); + return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO); } static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) @@ -554,9 +547,7 @@ static struct dm_cache_migration *alloc_migration(struct cache *cache) { struct dm_cache_migration *mg; - mg = mempool_alloc(&cache->migration_pool, GFP_NOWAIT); - if (!mg) - return NULL; + mg = mempool_alloc(&cache->migration_pool, GFP_NOIO); memset(mg, 0, sizeof(*mg)); @@ -632,23 +623,19 @@ static struct per_bio_data *init_per_bio_data(struct bio *bio) static void defer_bio(struct cache *cache, struct bio *bio) { - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); bio_list_add(&cache->deferred_bios, bio); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); wake_deferred_bio_worker(cache); } static void defer_bios(struct cache *cache, struct bio_list *bios) { - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); bio_list_merge(&cache->deferred_bios, bios); bio_list_init(bios); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); wake_deferred_bio_worker(cache); } @@ -664,10 +651,6 @@ static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bi struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ - if (!cell_prealloc) { - defer_bio(cache, bio); - return false; - } build_key(oblock, end, &key); r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); @@ -762,33 +745,27 @@ static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) static void set_discard(struct cache *cache, dm_dblock_t b) { - unsigned long flags; - BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); atomic_inc(&cache->stats.discard_count); - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); set_bit(from_dblock(b), cache->discard_bitset); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); } static void clear_discard(struct cache *cache, dm_dblock_t b) { - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); clear_bit(from_dblock(b), cache->discard_bitset); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); } static bool is_discarded(struct cache *cache, dm_dblock_t b) { int r; - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); r = test_bit(from_dblock(b), cache->discard_bitset); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); return r; } @@ -796,12 +773,10 @@ static bool is_discarded(struct cache *cache, dm_dblock_t b) static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) { int r; - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); r = test_bit(from_dblock(oblock_to_dblock(cache, b)), cache->discard_bitset); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); return r; } @@ -833,17 +808,16 @@ static void remap_to_cache(struct cache *cache, struct bio *bio, static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) { - unsigned long flags; struct per_bio_data *pb; - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && bio_op(bio) != REQ_OP_DISCARD) { pb = get_per_bio_data(bio); pb->tick = true; cache->need_tick_bio = false; } - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); } static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, @@ -1493,11 +1467,6 @@ static int mg_lock_writes(struct dm_cache_migration *mg) struct dm_bio_prison_cell_v2 *prealloc; prealloc = alloc_prison_cell(cache); - if (!prealloc) { - DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache)); - mg_complete(mg, false); - return -ENOMEM; - } /* * Prevent writes to the block, but allow reads to continue. @@ -1535,11 +1504,6 @@ static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio } mg = alloc_migration(cache); - if (!mg) { - policy_complete_background_work(cache->policy, op, false); - background_work_end(cache); - return -ENOMEM; - } mg->op = op; mg->overwrite_bio = bio; @@ -1628,10 +1592,6 @@ static int invalidate_lock(struct dm_cache_migration *mg) struct dm_bio_prison_cell_v2 *prealloc; prealloc = alloc_prison_cell(cache); - if (!prealloc) { - invalidate_complete(mg, false); - return -ENOMEM; - } build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); r = dm_cell_lock_v2(cache->prison, &key, @@ -1669,10 +1629,6 @@ static int invalidate_start(struct cache *cache, dm_cblock_t cblock, return -EPERM; mg = alloc_migration(cache); - if (!mg) { - background_work_end(cache); - return -ENOMEM; - } mg->overwrite_bio = bio; mg->invalidate_cblock = cblock; @@ -1913,17 +1869,16 @@ static void process_deferred_bios(struct work_struct *ws) { struct cache *cache = container_of(ws, struct cache, deferred_bio_worker); - unsigned long flags; bool commit_needed = false; struct bio_list bios; struct bio *bio; bio_list_init(&bios); - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); bio_list_merge(&bios, &cache->deferred_bios); bio_list_init(&cache->deferred_bios); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); while ((bio = bio_list_pop(&bios))) { if (bio->bi_opf & REQ_PREFLUSH) diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c index 6bc8c1d1c351..08c552e5e41b 100644 --- a/drivers/md/dm-clone-metadata.c +++ b/drivers/md/dm-clone-metadata.c @@ -712,7 +712,7 @@ static int __metadata_commit(struct dm_clone_metadata *cmd) static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap) { int r; - unsigned long word, flags; + unsigned long word; word = 0; do { @@ -736,9 +736,9 @@ static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap) return r; /* Update the changed flag */ - spin_lock_irqsave(&cmd->bitmap_lock, flags); + spin_lock_irq(&cmd->bitmap_lock); dmap->changed = 0; - spin_unlock_irqrestore(&cmd->bitmap_lock, flags); + spin_unlock_irq(&cmd->bitmap_lock); return 0; } @@ -746,7 +746,6 @@ static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap) int dm_clone_metadata_commit(struct dm_clone_metadata *cmd) { int r = -EPERM; - unsigned long flags; struct dirty_map *dmap, *next_dmap; down_write(&cmd->lock); @@ -770,9 +769,9 @@ int dm_clone_metadata_commit(struct dm_clone_metadata *cmd) } /* Swap dirty bitmaps */ - spin_lock_irqsave(&cmd->bitmap_lock, flags); + spin_lock_irq(&cmd->bitmap_lock); cmd->current_dmap = next_dmap; - spin_unlock_irqrestore(&cmd->bitmap_lock, flags); + spin_unlock_irq(&cmd->bitmap_lock); /* * No one is accessing the old dirty bitmap anymore, so we can flush @@ -817,9 +816,9 @@ int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start, { int r = 0; struct dirty_map *dmap; - unsigned long word, region_nr, flags; + unsigned long word, region_nr; - spin_lock_irqsave(&cmd->bitmap_lock, flags); + spin_lock_irq(&cmd->bitmap_lock); if (cmd->read_only) { r = -EPERM; @@ -836,7 +835,7 @@ int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start, } } out: - spin_unlock_irqrestore(&cmd->bitmap_lock, flags); + spin_unlock_irq(&cmd->bitmap_lock); return r; } @@ -903,13 +902,11 @@ out: void dm_clone_metadata_set_read_only(struct dm_clone_metadata *cmd) { - unsigned long flags; - down_write(&cmd->lock); - spin_lock_irqsave(&cmd->bitmap_lock, flags); + spin_lock_irq(&cmd->bitmap_lock); cmd->read_only = 1; - spin_unlock_irqrestore(&cmd->bitmap_lock, flags); + spin_unlock_irq(&cmd->bitmap_lock); if (!cmd->fail_io) dm_bm_set_read_only(cmd->bm); @@ -919,13 +916,11 @@ void dm_clone_metadata_set_read_only(struct dm_clone_metadata *cmd) void dm_clone_metadata_set_read_write(struct dm_clone_metadata *cmd) { - unsigned long flags; - down_write(&cmd->lock); - spin_lock_irqsave(&cmd->bitmap_lock, flags); + spin_lock_irq(&cmd->bitmap_lock); cmd->read_only = 0; - spin_unlock_irqrestore(&cmd->bitmap_lock, flags); + spin_unlock_irq(&cmd->bitmap_lock); if (!cmd->fail_io) dm_bm_set_read_write(cmd->bm); diff --git a/drivers/md/dm-clone-metadata.h b/drivers/md/dm-clone-metadata.h index 434bff08508b..3fe50a781c11 100644 --- a/drivers/md/dm-clone-metadata.h +++ b/drivers/md/dm-clone-metadata.h @@ -44,7 +44,9 @@ int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long re * @start: Starting region number * @nr_regions: Number of regions in the range * - * This function doesn't block, so it's safe to call it from interrupt context. + * This function doesn't block, but since it uses spin_lock_irq()/spin_unlock_irq() + * it's NOT safe to call it from any context where interrupts are disabled, e.g., + * from interrupt context. */ int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start, unsigned long nr_regions); diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index cd6f9e9fc98e..b3d89072d21c 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -332,8 +332,6 @@ static void submit_bios(struct bio_list *bios) */ static void issue_bio(struct clone *clone, struct bio *bio) { - unsigned long flags; - if (!bio_triggers_commit(clone, bio)) { generic_make_request(bio); return; @@ -352,9 +350,9 @@ static void issue_bio(struct clone *clone, struct bio *bio) * Batch together any bios that trigger commits and then issue a single * commit for them in process_deferred_flush_bios(). */ - spin_lock_irqsave(&clone->lock, flags); + spin_lock_irq(&clone->lock); bio_list_add(&clone->deferred_flush_bios, bio); - spin_unlock_irqrestore(&clone->lock, flags); + spin_unlock_irq(&clone->lock); wake_worker(clone); } @@ -469,7 +467,7 @@ static void complete_discard_bio(struct clone *clone, struct bio *bio, bool succ static void process_discard_bio(struct clone *clone, struct bio *bio) { - unsigned long rs, re, flags; + unsigned long rs, re; bio_region_range(clone, bio, &rs, &re); BUG_ON(re > clone->nr_regions); @@ -501,9 +499,9 @@ static void process_discard_bio(struct clone *clone, struct bio *bio) /* * Defer discard processing. */ - spin_lock_irqsave(&clone->lock, flags); + spin_lock_irq(&clone->lock); bio_list_add(&clone->deferred_discard_bios, bio); - spin_unlock_irqrestore(&clone->lock, flags); + spin_unlock_irq(&clone->lock); wake_worker(clone); } @@ -554,6 +552,12 @@ struct hash_table_bucket { #define bucket_unlock_irqrestore(bucket, flags) \ spin_unlock_irqrestore(&(bucket)->lock, flags) +#define bucket_lock_irq(bucket) \ + spin_lock_irq(&(bucket)->lock) + +#define bucket_unlock_irq(bucket) \ + spin_unlock_irq(&(bucket)->lock) + static int hash_table_init(struct clone *clone) { unsigned int i, sz; @@ -591,8 +595,8 @@ static struct hash_table_bucket *get_hash_table_bucket(struct clone *clone, * * NOTE: Must be called with the bucket lock held */ -struct dm_clone_region_hydration *__hash_find(struct hash_table_bucket *bucket, - unsigned long region_nr) +static struct dm_clone_region_hydration *__hash_find(struct hash_table_bucket *bucket, + unsigned long region_nr) { struct dm_clone_region_hydration *hd; @@ -851,7 +855,6 @@ static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio */ static void hydrate_bio_region(struct clone *clone, struct bio *bio) { - unsigned long flags; unsigned long region_nr; struct hash_table_bucket *bucket; struct dm_clone_region_hydration *hd, *hd2; @@ -859,19 +862,19 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio) region_nr = bio_to_region(clone, bio); bucket = get_hash_table_bucket(clone, region_nr); - bucket_lock_irqsave(bucket, flags); + bucket_lock_irq(bucket); hd = __hash_find(bucket, region_nr); if (hd) { /* Someone else is hydrating the region */ bio_list_add(&hd->deferred_bios, bio); - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); return; } if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) { /* The region has been hydrated */ - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); issue_bio(clone, bio); return; } @@ -880,16 +883,16 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio) * We must allocate a hydration descriptor and start the hydration of * the corresponding region. */ - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); hd = alloc_hydration(clone); hydration_init(hd, region_nr); - bucket_lock_irqsave(bucket, flags); + bucket_lock_irq(bucket); /* Check if the region has been hydrated in the meantime. */ if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) { - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); free_hydration(hd); issue_bio(clone, bio); return; @@ -899,7 +902,7 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio) if (hd2 != hd) { /* Someone else started the region's hydration. */ bio_list_add(&hd2->deferred_bios, bio); - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); free_hydration(hd); return; } @@ -911,7 +914,7 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio) */ if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { hlist_del(&hd->h); - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); free_hydration(hd); bio_io_error(bio); return; @@ -925,11 +928,11 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio) * to the destination device. */ if (is_overwrite_bio(clone, bio)) { - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); hydration_overwrite(hd, bio); } else { bio_list_add(&hd->deferred_bios, bio); - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); hydration_copy(hd, 1); } } @@ -996,7 +999,6 @@ static unsigned long __start_next_hydration(struct clone *clone, unsigned long offset, struct batch_info *batch) { - unsigned long flags; struct hash_table_bucket *bucket; struct dm_clone_region_hydration *hd; unsigned long nr_regions = clone->nr_regions; @@ -1010,13 +1012,13 @@ static unsigned long __start_next_hydration(struct clone *clone, break; bucket = get_hash_table_bucket(clone, offset); - bucket_lock_irqsave(bucket, flags); + bucket_lock_irq(bucket); if (!dm_clone_is_region_hydrated(clone->cmd, offset) && !__hash_find(bucket, offset)) { hydration_init(hd, offset); __insert_region_hydration(bucket, hd); - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); /* Batch hydration */ __batch_hydration(batch, hd); @@ -1024,7 +1026,7 @@ static unsigned long __start_next_hydration(struct clone *clone, return (offset + 1); } - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); } while (++offset < nr_regions); @@ -1140,13 +1142,13 @@ static void process_deferred_discards(struct clone *clone) int r = -EPERM; struct bio *bio; struct blk_plug plug; - unsigned long rs, re, flags; + unsigned long rs, re; struct bio_list discards = BIO_EMPTY_LIST; - spin_lock_irqsave(&clone->lock, flags); + spin_lock_irq(&clone->lock); bio_list_merge(&discards, &clone->deferred_discard_bios); bio_list_init(&clone->deferred_discard_bios); - spin_unlock_irqrestore(&clone->lock, flags); + spin_unlock_irq(&clone->lock); if (bio_list_empty(&discards)) return; @@ -1176,13 +1178,12 @@ out: static void process_deferred_bios(struct clone *clone) { - unsigned long flags; struct bio_list bios = BIO_EMPTY_LIST; - spin_lock_irqsave(&clone->lock, flags); + spin_lock_irq(&clone->lock); bio_list_merge(&bios, &clone->deferred_bios); bio_list_init(&clone->deferred_bios); - spin_unlock_irqrestore(&clone->lock, flags); + spin_unlock_irq(&clone->lock); if (bio_list_empty(&bios)) return; @@ -1193,7 +1194,6 @@ static void process_deferred_bios(struct clone *clone) static void process_deferred_flush_bios(struct clone *clone) { struct bio *bio; - unsigned long flags; struct bio_list bios = BIO_EMPTY_LIST; struct bio_list bio_completions = BIO_EMPTY_LIST; @@ -1201,13 +1201,13 @@ static void process_deferred_flush_bios(struct clone *clone) * If there are any deferred flush bios, we must commit the metadata * before issuing them or signaling their completion. */ - spin_lock_irqsave(&clone->lock, flags); + spin_lock_irq(&clone->lock); bio_list_merge(&bios, &clone->deferred_flush_bios); bio_list_init(&clone->deferred_flush_bios); bio_list_merge(&bio_completions, &clone->deferred_flush_completions); bio_list_init(&clone->deferred_flush_completions); - spin_unlock_irqrestore(&clone->lock, flags); + spin_unlock_irq(&clone->lock); if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) && !(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone))) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index f87f6495652f..eb9782fc93fe 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2700,21 +2700,18 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ret = -ENOMEM; - cc->io_queue = alloc_workqueue("kcryptd_io/%s", - WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, - 1, devname); + cc->io_queue = alloc_workqueue("kcryptd_io/%s", WQ_MEM_RECLAIM, 1, devname); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; goto bad; } if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) - cc->crypt_queue = alloc_workqueue("kcryptd/%s", - WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, + cc->crypt_queue = alloc_workqueue("kcryptd/%s", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1, devname); else cc->crypt_queue = alloc_workqueue("kcryptd/%s", - WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, + WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus(), devname); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index 8288887b7f94..eb37584427a4 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -17,6 +17,7 @@ struct badblock { struct rb_node node; sector_t bb; + unsigned char wr_fail_cnt; }; struct dust_device { @@ -101,7 +102,8 @@ static int dust_remove_block(struct dust_device *dd, unsigned long long block) return 0; } -static int dust_add_block(struct dust_device *dd, unsigned long long block) +static int dust_add_block(struct dust_device *dd, unsigned long long block, + unsigned char wr_fail_cnt) { struct badblock *bblock; unsigned long flags; @@ -115,6 +117,7 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block) spin_lock_irqsave(&dd->dust_lock, flags); bblock->bb = block; + bblock->wr_fail_cnt = wr_fail_cnt; if (!dust_rb_insert(&dd->badblocklist, bblock)) { if (!dd->quiet_mode) { DMERR("%s: block %llu already in badblocklist", @@ -126,8 +129,10 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block) } dd->badblock_count++; - if (!dd->quiet_mode) - DMINFO("%s: badblock added at block %llu", __func__, block); + if (!dd->quiet_mode) { + DMINFO("%s: badblock added at block %llu with write fail count %hhu", + __func__, block, wr_fail_cnt); + } spin_unlock_irqrestore(&dd->dust_lock, flags); return 0; @@ -163,22 +168,27 @@ static int dust_map_read(struct dust_device *dd, sector_t thisblock, bool fail_read_on_bb) { unsigned long flags; - int ret = DM_MAPIO_REMAPPED; + int r = DM_MAPIO_REMAPPED; if (fail_read_on_bb) { thisblock >>= dd->sect_per_block_shift; spin_lock_irqsave(&dd->dust_lock, flags); - ret = __dust_map_read(dd, thisblock); + r = __dust_map_read(dd, thisblock); spin_unlock_irqrestore(&dd->dust_lock, flags); } - return ret; + return r; } -static void __dust_map_write(struct dust_device *dd, sector_t thisblock) +static int __dust_map_write(struct dust_device *dd, sector_t thisblock) { struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock); + if (bblk && bblk->wr_fail_cnt > 0) { + bblk->wr_fail_cnt--; + return DM_MAPIO_KILL; + } + if (bblk) { rb_erase(&bblk->node, &dd->badblocklist); dd->badblock_count--; @@ -189,37 +199,40 @@ static void __dust_map_write(struct dust_device *dd, sector_t thisblock) (unsigned long long)thisblock); } } + + return DM_MAPIO_REMAPPED; } static int dust_map_write(struct dust_device *dd, sector_t thisblock, bool fail_read_on_bb) { unsigned long flags; + int ret = DM_MAPIO_REMAPPED; if (fail_read_on_bb) { thisblock >>= dd->sect_per_block_shift; spin_lock_irqsave(&dd->dust_lock, flags); - __dust_map_write(dd, thisblock); + ret = __dust_map_write(dd, thisblock); spin_unlock_irqrestore(&dd->dust_lock, flags); } - return DM_MAPIO_REMAPPED; + return ret; } static int dust_map(struct dm_target *ti, struct bio *bio) { struct dust_device *dd = ti->private; - int ret; + int r; bio_set_dev(bio, dd->dev->bdev); bio->bi_iter.bi_sector = dd->start + dm_target_offset(ti, bio->bi_iter.bi_sector); if (bio_data_dir(bio) == READ) - ret = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); + r = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); else - ret = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); + r = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); - return ret; + return r; } static bool __dust_clear_badblocks(struct rb_root *tree, @@ -375,8 +388,10 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, struct dust_device *dd = ti->private; sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT; bool invalid_msg = false; - int result = -EINVAL; + int r = -EINVAL; unsigned long long tmp, block; + unsigned char wr_fail_cnt; + unsigned int tmp_ui; unsigned long flags; char dummy; @@ -388,45 +403,69 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, } else if (!strcasecmp(argv[0], "disable")) { DMINFO("disabling read failures on bad sectors"); dd->fail_read_on_bb = false; - result = 0; + r = 0; } else if (!strcasecmp(argv[0], "enable")) { DMINFO("enabling read failures on bad sectors"); dd->fail_read_on_bb = true; - result = 0; + r = 0; } else if (!strcasecmp(argv[0], "countbadblocks")) { spin_lock_irqsave(&dd->dust_lock, flags); DMINFO("countbadblocks: %llu badblock(s) found", dd->badblock_count); spin_unlock_irqrestore(&dd->dust_lock, flags); - result = 0; + r = 0; } else if (!strcasecmp(argv[0], "clearbadblocks")) { - result = dust_clear_badblocks(dd); + r = dust_clear_badblocks(dd); } else if (!strcasecmp(argv[0], "quiet")) { if (!dd->quiet_mode) dd->quiet_mode = true; else dd->quiet_mode = false; - result = 0; + r = 0; } else { invalid_msg = true; } } else if (argc == 2) { if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) - return result; + return r; block = tmp; sector_div(size, dd->sect_per_block); if (block > size) { DMERR("selected block value out of range"); - return result; + return r; } if (!strcasecmp(argv[0], "addbadblock")) - result = dust_add_block(dd, block); + r = dust_add_block(dd, block, 0); else if (!strcasecmp(argv[0], "removebadblock")) - result = dust_remove_block(dd, block); + r = dust_remove_block(dd, block); else if (!strcasecmp(argv[0], "queryblock")) - result = dust_query_block(dd, block); + r = dust_query_block(dd, block); + else + invalid_msg = true; + + } else if (argc == 3) { + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) + return r; + + if (sscanf(argv[2], "%u%c", &tmp_ui, &dummy) != 1) + return r; + + block = tmp; + if (tmp_ui > 255) { + DMERR("selected write fail count out of range"); + return r; + } + wr_fail_cnt = tmp_ui; + sector_div(size, dd->sect_per_block); + if (block > size) { + DMERR("selected block value out of range"); + return r; + } + + if (!strcasecmp(argv[0], "addbadblock")) + r = dust_add_block(dd, block, wr_fail_cnt); else invalid_msg = true; @@ -436,7 +475,7 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, if (invalid_msg) DMERR("unrecognized message '%s' received", argv[0]); - return result; + return r; } static void dust_status(struct dm_target *ti, status_type_t type, @@ -499,12 +538,12 @@ static struct target_type dust_target = { static int __init dm_dust_init(void) { - int result = dm_register_target(&dust_target); + int r = dm_register_target(&dust_target); - if (result < 0) - DMERR("dm_register_target failed %d", result); + if (r < 0) + DMERR("dm_register_target failed %d", r); - return result; + return r; } static void __exit dm_dust_exit(void) diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 2900fbde89b3..a2cc9e45cbba 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -280,7 +280,7 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) struct flakey_c *fc = ti->private; bio_set_dev(bio, fc->dev->bdev); - if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) + if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio))) bio->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector); } @@ -322,8 +322,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); pb->bio_submitted = false; - /* Do not fail reset zone */ - if (bio_op(bio) == REQ_OP_ZONE_RESET) + if (op_is_zone_mgmt(bio_op(bio))) goto map_bio; /* Are we alive ? */ @@ -384,7 +383,7 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, struct flakey_c *fc = ti->private; struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); - if (bio_op(bio) == REQ_OP_ZONE_RESET) + if (op_is_zone_mgmt(bio_op(bio))) return DM_ENDIO_DONE; if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) { @@ -460,21 +459,15 @@ static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev } #ifdef CONFIG_BLK_DEV_ZONED -static int flakey_report_zones(struct dm_target *ti, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones) +static int flakey_report_zones(struct dm_target *ti, + struct dm_report_zones_args *args, unsigned int nr_zones) { struct flakey_c *fc = ti->private; - int ret; + sector_t sector = flakey_map_sector(ti, args->next_sector); - /* Do report and remap it */ - ret = blkdev_report_zones(fc->dev->bdev, flakey_map_sector(ti, sector), - zones, nr_zones); - if (ret != 0) - return ret; - - if (*nr_zones) - dm_remap_zone_report(ti, fc->start, zones, nr_zones); - return 0; + args->start = fc->start; + return blkdev_report_zones(fc->dev->bdev, sector, nr_zones, + dm_report_zones_cb, args); } #endif diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index dab4446fe7d8..b225b3e445fa 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -53,6 +53,7 @@ #define SB_VERSION_1 1 #define SB_VERSION_2 2 #define SB_VERSION_3 3 +#define SB_VERSION_4 4 #define SB_SECTORS 8 #define MAX_SECTORS_PER_BLOCK 8 @@ -73,6 +74,7 @@ struct superblock { #define SB_FLAG_HAVE_JOURNAL_MAC 0x1 #define SB_FLAG_RECALCULATING 0x2 #define SB_FLAG_DIRTY_BITMAP 0x4 +#define SB_FLAG_FIXED_PADDING 0x8 #define JOURNAL_ENTRY_ROUNDUP 8 @@ -250,6 +252,7 @@ struct dm_integrity_c { bool journal_uptodate; bool just_formatted; bool recalculate_flag; + bool fix_padding; struct alg_spec internal_hash_alg; struct alg_spec journal_crypt_alg; @@ -463,7 +466,9 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr) static void sb_set_version(struct dm_integrity_c *ic) { - if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) + if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) + ic->sb->version = SB_VERSION_4; + else if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) ic->sb->version = SB_VERSION_3; else if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) ic->sb->version = SB_VERSION_2; @@ -2955,6 +2960,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, arg_count += !!ic->internal_hash_alg.alg_string; arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; + arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0; DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start, ic->tag_size, ic->mode, arg_count); if (ic->meta_dev) @@ -2974,6 +2980,8 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" sectors_per_bit:%llu", (unsigned long long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit); DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval)); } + if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0) + DMEMIT(" fix_padding"); #define EMIT_ALG(a, n) \ do { \ @@ -3042,8 +3050,14 @@ static int calculate_device_limits(struct dm_integrity_c *ic) if (!ic->meta_dev) { sector_t last_sector, last_area, last_offset; - ic->metadata_run = roundup((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block), - (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT; + /* we have to maintain excessive padding for compatibility with existing volumes */ + __u64 metadata_run_padding = + ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING) ? + (__u64)(METADATA_PADDING_SECTORS << SECTOR_SHIFT) : + (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS); + + ic->metadata_run = round_up((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block), + metadata_run_padding) >> SECTOR_SHIFT; if (!(ic->metadata_run & (ic->metadata_run - 1))) ic->log2_metadata_run = __ffs(ic->metadata_run); else @@ -3086,6 +3100,8 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec journal_sections = 1; if (!ic->meta_dev) { + if (ic->fix_padding) + ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_PADDING); ic->sb->journal_sections = cpu_to_le32(journal_sections); if (!interleave_sectors) interleave_sectors = DEFAULT_INTERLEAVE_SECTORS; @@ -3725,6 +3741,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } else if (!strcmp(opt_string, "recalculate")) { ic->recalculate_flag = true; + } else if (!strcmp(opt_string, "fix_padding")) { + ic->fix_padding = true; } else { r = -EINVAL; ti->error = "Invalid argument"; @@ -3867,7 +3885,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) should_write_sb = true; } - if (!ic->sb->version || ic->sb->version > SB_VERSION_3) { + if (!ic->sb->version || ic->sb->version > SB_VERSION_4) { r = -EINVAL; ti->error = "Unknown version"; goto bad; @@ -4182,7 +4200,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 3, 0}, + .version = {1, 4, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index ecefe6703736..8d07fdf63a47 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -90,7 +90,7 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio) struct linear_c *lc = ti->private; bio_set_dev(bio, lc->dev->bdev); - if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) + if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio))) bio->bi_iter.bi_sector = linear_map_sector(ti, bio->bi_iter.bi_sector); } @@ -136,21 +136,15 @@ static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev } #ifdef CONFIG_BLK_DEV_ZONED -static int linear_report_zones(struct dm_target *ti, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones) +static int linear_report_zones(struct dm_target *ti, + struct dm_report_zones_args *args, unsigned int nr_zones) { - struct linear_c *lc = (struct linear_c *) ti->private; - int ret; - - /* Do report and remap it */ - ret = blkdev_report_zones(lc->dev->bdev, linear_map_sector(ti, sector), - zones, nr_zones); - if (ret != 0) - return ret; + struct linear_c *lc = ti->private; + sector_t sector = linear_map_sector(ti, args->next_sector); - if (*nr_zones) - dm_remap_zone_report(ti, lc->start, zones, nr_zones); - return 0; + args->start = lc->start; + return blkdev_report_zones(lc->dev->bdev, sector, nr_zones, + dm_report_zones_cb, args); } #endif diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index b0aa595e4375..c412eaa975fc 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -209,6 +209,7 @@ struct raid_dev { #define RT_FLAG_RS_SUSPENDED 5 #define RT_FLAG_RS_IN_SYNC 6 #define RT_FLAG_RS_RESYNCING 7 +#define RT_FLAG_RS_GROW 8 /* Array elements of 64 bit needed for rebuild/failed disk bits */ #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) @@ -241,6 +242,9 @@ struct raid_set { struct raid_type *raid_type; struct dm_target_callbacks callbacks; + sector_t array_sectors; + sector_t dev_sectors; + /* Optional raid4/5/6 journal device */ struct journal_dev { struct dm_dev *dev; @@ -616,7 +620,6 @@ static int raid10_format_to_md_layout(struct raid_set *rs, } else if (algorithm == ALGORITHM_RAID10_FAR) { f = copies; - r = !RAID10_OFFSET; if (!test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) r |= RAID10_USE_FAR_SETS; @@ -1615,13 +1618,12 @@ static int _check_data_dev_sectors(struct raid_set *rs) } /* Calculate the sectors per device and per array used for @rs */ -static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) +static int rs_set_dev_and_array_sectors(struct raid_set *rs, sector_t sectors, bool use_mddev) { int delta_disks; unsigned int data_stripes; + sector_t array_sectors = sectors, dev_sectors = sectors; struct mddev *mddev = &rs->md; - struct md_rdev *rdev; - sector_t array_sectors = rs->ti->len, dev_sectors = rs->ti->len; if (use_mddev) { delta_disks = mddev->delta_disks; @@ -1656,12 +1658,9 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) /* Striped layouts */ array_sectors = (data_stripes + delta_disks) * dev_sectors; - rdev_for_each(rdev, mddev) - if (!test_bit(Journal, &rdev->flags)) - rdev->sectors = dev_sectors; - mddev->array_sectors = array_sectors; mddev->dev_sectors = dev_sectors; + rs_set_rdev_sectors(rs); return _check_data_dev_sectors(rs); bad: @@ -1670,7 +1669,7 @@ bad: } /* Setup recovery on @rs */ -static void __rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) +static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) { /* raid0 does not recover */ if (rs_is_raid0(rs)) @@ -1691,22 +1690,6 @@ static void __rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) ? MaxSector : dev_sectors; } -/* Setup recovery on @rs based on raid type, device size and 'nosync' flag */ -static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) -{ - if (!dev_sectors) - /* New raid set or 'sync' flag provided */ - __rs_setup_recovery(rs, 0); - else if (dev_sectors == MaxSector) - /* Prevent recovery */ - __rs_setup_recovery(rs, MaxSector); - else if (__rdev_sectors(rs) < dev_sectors) - /* Grown raid set */ - __rs_setup_recovery(rs, __rdev_sectors(rs)); - else - __rs_setup_recovery(rs, MaxSector); -} - static void do_table_event(struct work_struct *ws) { struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); @@ -2474,7 +2457,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev) return -EINVAL; } - /* Enable bitmap creation for RAID levels != 0 */ + /* Enable bitmap creation on @rs unless no metadevs or raid0 or journaled raid4/5/6 set. */ mddev->bitmap_info.offset = (rt_is_raid0(rs->raid_type) || rs->journal_dev.dev) ? 0 : to_sector(4096); mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; @@ -2911,7 +2894,7 @@ static int rs_setup_reshape(struct raid_set *rs) /* Remove disk(s) */ } else if (rs->delta_disks < 0) { - r = rs_set_dev_and_array_sectors(rs, true); + r = rs_set_dev_and_array_sectors(rs, rs->ti->len, true); mddev->reshape_backwards = 1; /* removing disk(s) -> backward reshape */ /* Change layout and/or chunk size */ @@ -3008,7 +2991,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) bool resize = false; struct raid_type *rt; unsigned int num_raid_params, num_raid_devs; - sector_t calculated_dev_sectors, rdev_sectors, reshape_sectors; + sector_t sb_array_sectors, rdev_sectors, reshape_sectors; struct raid_set *rs = NULL; const char *arg; struct rs_layout rs_layout; @@ -3067,11 +3050,13 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) * * Any existing superblock will overwrite the array and device sizes */ - r = rs_set_dev_and_array_sectors(rs, false); + r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false); if (r) goto bad; - calculated_dev_sectors = rs->md.dev_sectors; + /* Memorize just calculated, potentially larger sizes to grow the raid set in preresume */ + rs->array_sectors = rs->md.array_sectors; + rs->dev_sectors = rs->md.dev_sectors; /* * Backup any new raid set level, layout, ... @@ -3084,6 +3069,8 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) goto bad; + /* All in-core metadata now as of current superblocks after calling analyse_superblocks() */ + sb_array_sectors = rs->md.array_sectors; rdev_sectors = __rdev_sectors(rs); if (!rdev_sectors) { ti->error = "Invalid rdev size"; @@ -3093,8 +3080,11 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) reshape_sectors = _get_reshape_sectors(rs); - if (calculated_dev_sectors != rdev_sectors) - resize = calculated_dev_sectors != (reshape_sectors ? rdev_sectors - reshape_sectors : rdev_sectors); + if (rs->dev_sectors != rdev_sectors) { + resize = (rs->dev_sectors != rdev_sectors - reshape_sectors); + if (rs->dev_sectors > rdev_sectors - reshape_sectors) + set_bit(RT_FLAG_RS_GROW, &rs->runtime_flags); + } INIT_WORK(&rs->md.event_work, do_table_event); ti->private = rs; @@ -3121,13 +3111,8 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); rs_set_new(rs); } else if (rs_is_recovering(rs)) { - /* Rebuild particular devices */ - if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) { - set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); - rs_setup_recovery(rs, MaxSector); - } /* A recovering raid set may be resized */ - ; /* skip setup rs */ + goto size_check; } else if (rs_is_reshaping(rs)) { /* Have to reject size change request during reshape */ if (resize) { @@ -3171,6 +3156,9 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) rs_setup_recovery(rs, MaxSector); rs_set_new(rs); } else if (rs_reshape_requested(rs)) { + /* Only request grow on raid set size extensions, not on reshapes. */ + clear_bit(RT_FLAG_RS_GROW, &rs->runtime_flags); + /* * No need to check for 'ongoing' takeover here, because takeover * is an instant operation as oposed to an ongoing reshape. @@ -3201,13 +3189,31 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) } rs_set_cur(rs); } else { +size_check: /* May not set recovery when a device rebuild is requested */ if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) { - rs_setup_recovery(rs, MaxSector); + clear_bit(RT_FLAG_RS_GROW, &rs->runtime_flags); set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); - } else - rs_setup_recovery(rs, test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) ? - 0 : (resize ? calculated_dev_sectors : MaxSector)); + rs_setup_recovery(rs, MaxSector); + } else if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) { + /* + * Set raid set to current size, i.e. size as of + * superblocks to grow to larger size in preresume. + */ + r = rs_set_dev_and_array_sectors(rs, sb_array_sectors, false); + if (r) + goto bad; + + rs_setup_recovery(rs, rs->md.recovery_cp < rs->md.dev_sectors ? rs->md.recovery_cp : rs->md.dev_sectors); + } else { + /* This is no size change or it is shrinking, update size and record in superblocks */ + r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false); + if (r) + goto bad; + + if (sb_array_sectors > rs->array_sectors) + set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); + } rs_set_cur(rs); } @@ -3406,10 +3412,9 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev) /* Helper to return resync/reshape progress for @rs and runtime flags for raid set in sync / resynching */ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, - sector_t resync_max_sectors) + enum sync_state state, sector_t resync_max_sectors) { sector_t r; - enum sync_state state; struct mddev *mddev = &rs->md; clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); @@ -3420,8 +3425,6 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); } else { - state = decipher_sync_action(mddev, recovery); - if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery)) r = mddev->recovery_cp; else @@ -3439,18 +3442,14 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, /* * In case we are recovering, the array is not in sync * and health chars should show the recovering legs. + * + * Already retrieved recovery offset from curr_resync_completed above. */ ; - else if (state == st_resync) - /* - * If "resync" is occurring, the raid set - * is or may be out of sync hence the health - * characters shall be 'a'. - */ - set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); - else if (state == st_reshape) + + else if (state == st_resync || state == st_reshape) /* - * If "reshape" is occurring, the raid set + * If "resync/reshape" is occurring, the raid set * is or may be out of sync hence the health * characters shall be 'a'. */ @@ -3464,22 +3463,22 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, */ set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); - else { - struct md_rdev *rdev; - + else if (test_bit(MD_RECOVERY_NEEDED, &recovery)) /* * We are idle and recovery is needed, prevent 'A' chars race * caused by components still set to in-sync by constructor. */ - if (test_bit(MD_RECOVERY_NEEDED, &recovery)) - set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + else { /* - * The raid set may be doing an initial sync, or it may - * be rebuilding individual components. If all the - * devices are In_sync, then it is the raid set that is - * being initialized. + * We are idle and the raid set may be doing an initial + * sync, or it may be rebuilding individual components. + * If all the devices are In_sync, then it is the raid set + * that is being initialized. */ + struct md_rdev *rdev; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); rdev_for_each(rdev, mddev) if (!test_bit(Journal, &rdev->flags) && @@ -3512,7 +3511,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, unsigned int rebuild_disks; unsigned int write_mostly_params = 0; sector_t progress, resync_max_sectors, resync_mismatches; - const char *sync_action; + enum sync_state state; struct raid_type *rt; switch (type) { @@ -3526,14 +3525,14 @@ static void raid_status(struct dm_target *ti, status_type_t type, /* Access most recent mddev properties for status output */ smp_rmb(); - recovery = rs->md.recovery; /* Get sensible max sectors even if raid set not yet started */ resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ? mddev->resync_max_sectors : mddev->dev_sectors; - progress = rs_get_progress(rs, recovery, resync_max_sectors); + recovery = rs->md.recovery; + state = decipher_sync_action(mddev, recovery); + progress = rs_get_progress(rs, recovery, state, resync_max_sectors); resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ? atomic64_read(&mddev->resync_mismatches) : 0; - sync_action = sync_str(decipher_sync_action(&rs->md, recovery)); /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */ for (i = 0; i < rs->raid_disks; i++) @@ -3561,7 +3560,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, * See Documentation/admin-guide/device-mapper/dm-raid.rst for * information on each of these states. */ - DMEMIT(" %s", sync_action); + DMEMIT(" %s", sync_str(state)); /* * v1.5.0+: @@ -3955,11 +3954,22 @@ static int raid_preresume(struct dm_target *ti) if (r) return r; - /* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) */ - if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap && - mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)) { - r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, - to_bytes(rs->requested_bitmap_chunk_sectors), 0); + /* We are extending the raid set size, adjust mddev/md_rdev sizes and set capacity. */ + if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) { + mddev->array_sectors = rs->array_sectors; + mddev->dev_sectors = rs->dev_sectors; + rs_set_rdev_sectors(rs); + rs_set_capacity(rs); + } + + /* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) or grown device size */ + if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap && + (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags) || + (rs->requested_bitmap_chunk_sectors && + mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) { + int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize; + + r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, chunksize, 0); if (r) DMERR("Failed to resize bitmap"); } @@ -3968,8 +3978,10 @@ static int raid_preresume(struct dm_target *ti) /* Be prepared for mddev_resume() in raid_resume() */ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) { - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); mddev->resync_min = mddev->recovery_cp; + if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) + mddev->resync_max_sectors = mddev->dev_sectors; } /* Check for any reshape request unless new raid set */ @@ -4017,7 +4029,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 14, 0}, + .version = {1, 15, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index f150f5c5492b..4fb1a40e68a0 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -18,7 +18,6 @@ #include <linux/vmalloc.h> #include <linux/log2.h> #include <linux/dm-kcopyd.h> -#include <linux/semaphore.h> #include "dm.h" @@ -107,8 +106,8 @@ struct dm_snapshot { /* The on disk metadata handler */ struct dm_exception_store *store; - /* Maximum number of in-flight COW jobs. */ - struct semaphore cow_count; + unsigned in_progress; + struct wait_queue_head in_progress_wait; struct dm_kcopyd_client *kcopyd_client; @@ -162,8 +161,8 @@ struct dm_snapshot { */ #define DEFAULT_COW_THRESHOLD 2048 -static int cow_threshold = DEFAULT_COW_THRESHOLD; -module_param_named(snapshot_cow_threshold, cow_threshold, int, 0644); +static unsigned cow_threshold = DEFAULT_COW_THRESHOLD; +module_param_named(snapshot_cow_threshold, cow_threshold, uint, 0644); MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write"); DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, @@ -1327,7 +1326,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_hash_tables; } - sema_init(&s->cow_count, (cow_threshold > 0) ? cow_threshold : INT_MAX); + init_waitqueue_head(&s->in_progress_wait); s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle); if (IS_ERR(s->kcopyd_client)) { @@ -1509,9 +1508,56 @@ static void snapshot_dtr(struct dm_target *ti) dm_put_device(ti, s->origin); + WARN_ON(s->in_progress); + kfree(s); } +static void account_start_copy(struct dm_snapshot *s) +{ + spin_lock(&s->in_progress_wait.lock); + s->in_progress++; + spin_unlock(&s->in_progress_wait.lock); +} + +static void account_end_copy(struct dm_snapshot *s) +{ + spin_lock(&s->in_progress_wait.lock); + BUG_ON(!s->in_progress); + s->in_progress--; + if (likely(s->in_progress <= cow_threshold) && + unlikely(waitqueue_active(&s->in_progress_wait))) + wake_up_locked(&s->in_progress_wait); + spin_unlock(&s->in_progress_wait.lock); +} + +static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins) +{ + if (unlikely(s->in_progress > cow_threshold)) { + spin_lock(&s->in_progress_wait.lock); + if (likely(s->in_progress > cow_threshold)) { + /* + * NOTE: this throttle doesn't account for whether + * the caller is servicing an IO that will trigger a COW + * so excess throttling may result for chunks not required + * to be COW'd. But if cow_threshold was reached, extra + * throttling is unlikely to negatively impact performance. + */ + DECLARE_WAITQUEUE(wait, current); + __add_wait_queue(&s->in_progress_wait, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&s->in_progress_wait.lock); + if (unlock_origins) + up_read(&_origins_lock); + io_schedule(); + remove_wait_queue(&s->in_progress_wait, &wait); + return false; + } + spin_unlock(&s->in_progress_wait.lock); + } + return true; +} + /* * Flush a list of buffers. */ @@ -1527,7 +1573,7 @@ static void flush_bios(struct bio *bio) } } -static int do_origin(struct dm_dev *origin, struct bio *bio); +static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit); /* * Flush a list of buffers. @@ -1540,7 +1586,7 @@ static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio) while (bio) { n = bio->bi_next; bio->bi_next = NULL; - r = do_origin(s->origin, bio); + r = do_origin(s->origin, bio, false); if (r == DM_MAPIO_REMAPPED) generic_make_request(bio); bio = n; @@ -1732,7 +1778,7 @@ static void copy_callback(int read_err, unsigned long write_err, void *context) rb_link_node(&pe->out_of_order_node, parent, p); rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree); } - up(&s->cow_count); + account_end_copy(s); } /* @@ -1756,7 +1802,7 @@ static void start_copy(struct dm_snap_pending_exception *pe) dest.count = src.count; /* Hand over to kcopyd */ - down(&s->cow_count); + account_start_copy(s); dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe); } @@ -1776,7 +1822,7 @@ static void start_full_bio(struct dm_snap_pending_exception *pe, pe->full_bio = bio; pe->full_bio_end_io = bio->bi_end_io; - down(&s->cow_count); + account_start_copy(s); callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, copy_callback, pe); @@ -1866,7 +1912,7 @@ static void zero_callback(int read_err, unsigned long write_err, void *context) struct bio *bio = context; struct dm_snapshot *s = bio->bi_private; - up(&s->cow_count); + account_end_copy(s); bio->bi_status = write_err ? BLK_STS_IOERR : 0; bio_endio(bio); } @@ -1880,7 +1926,7 @@ static void zero_exception(struct dm_snapshot *s, struct dm_exception *e, dest.sector = bio->bi_iter.bi_sector; dest.count = s->store->chunk_size; - down(&s->cow_count); + account_start_copy(s); WARN_ON_ONCE(bio->bi_private); bio->bi_private = s; dm_kcopyd_zero(s->kcopyd_client, 1, &dest, 0, zero_callback, bio); @@ -1916,6 +1962,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (!s->valid) return DM_MAPIO_KILL; + if (bio_data_dir(bio) == WRITE) { + while (unlikely(!wait_for_in_progress(s, false))) + ; /* wait_for_in_progress() has slept */ + } + down_read(&s->lock); dm_exception_table_lock(&lock); @@ -2112,7 +2163,7 @@ redirect_to_origin: if (bio_data_dir(bio) == WRITE) { up_write(&s->lock); - return do_origin(s->origin, bio); + return do_origin(s->origin, bio, false); } out_unlock: @@ -2487,15 +2538,24 @@ next_snapshot: /* * Called on a write from the origin driver. */ -static int do_origin(struct dm_dev *origin, struct bio *bio) +static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit) { struct origin *o; int r = DM_MAPIO_REMAPPED; +again: down_read(&_origins_lock); o = __lookup_origin(origin->bdev); - if (o) + if (o) { + if (limit) { + struct dm_snapshot *s; + list_for_each_entry(s, &o->snapshots, list) + if (unlikely(!wait_for_in_progress(s, true))) + goto again; + } + r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio); + } up_read(&_origins_lock); return r; @@ -2608,7 +2668,7 @@ static int origin_map(struct dm_target *ti, struct bio *bio) dm_accept_partial_bio(bio, available_sectors); /* Only tell snapshots if this is a write */ - return do_origin(o->dev, bio); + return do_origin(o->dev, bio, true); } /* diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 8547d7594338..63bbcc20f49a 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -55,19 +55,6 @@ static void trigger_event(struct work_struct *work) dm_table_event(sc->ti->table); } -static inline struct stripe_c *alloc_context(unsigned int stripes) -{ - size_t len; - - if (dm_array_too_big(sizeof(struct stripe_c), sizeof(struct stripe), - stripes)) - return NULL; - - len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes); - - return kmalloc(len, GFP_KERNEL); -} - /* * Parse a single <dev> <sector> pair */ @@ -142,7 +129,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -EINVAL; } - sc = alloc_context(stripes); + sc = kmalloc(struct_size(sc, stripe, stripes), GFP_KERNEL); if (!sc) { ti->error = "Memory allocation for striped context " "failed"; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 52e049554f5c..2ae0c1913766 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -918,21 +918,15 @@ bool dm_table_supports_dax(struct dm_table *t, static bool dm_table_does_not_support_partial_completion(struct dm_table *t); -struct verify_rq_based_data { - unsigned sq_count; - unsigned mq_count; -}; - -static int device_is_rq_based(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { - struct request_queue *q = bdev_get_queue(dev->bdev); - struct verify_rq_based_data *v = data; + struct block_device *bdev = dev->bdev; + struct request_queue *q = bdev_get_queue(bdev); - if (queue_is_mq(q)) - v->mq_count++; - else - v->sq_count++; + /* request-based cannot stack on partitions! */ + if (bdev != bdev->bd_contains) + return false; return queue_is_mq(q); } @@ -941,7 +935,6 @@ static int dm_table_determine_type(struct dm_table *t) { unsigned i; unsigned bio_based = 0, request_based = 0, hybrid = 0; - struct verify_rq_based_data v = {.sq_count = 0, .mq_count = 0}; struct dm_target *tgt; struct list_head *devices = dm_table_get_devices(t); enum dm_queue_mode live_md_type = dm_get_md_type(t->md); @@ -1045,14 +1038,10 @@ verify_rq_based: /* Non-request-stackable devices can't be used for request-based dm */ if (!tgt->type->iterate_devices || - !tgt->type->iterate_devices(tgt, device_is_rq_based, &v)) { + !tgt->type->iterate_devices(tgt, device_is_rq_stackable, NULL)) { DMERR("table load rejected: including non-request-stackable devices"); return -EINVAL; } - if (v.sq_count > 0) { - DMERR("table load rejected: not all devices are blk-mq request-stackable"); - return -EINVAL; - } return 0; } diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index fcd887703f95..5a2c494cb552 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -609,13 +609,12 @@ static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, blk_status_t error) { struct bio_list bios; - unsigned long flags; bio_list_init(&bios); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); __merge_bio_list(&bios, master); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); error_bio_list(&bios, error); } @@ -623,15 +622,14 @@ static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, static void requeue_deferred_cells(struct thin_c *tc) { struct pool *pool = tc->pool; - unsigned long flags; struct list_head cells; struct dm_bio_prison_cell *cell, *tmp; INIT_LIST_HEAD(&cells); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); list_splice_init(&tc->deferred_cells, &cells); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); list_for_each_entry_safe(cell, tmp, &cells, user_list) cell_requeue(pool, cell); @@ -640,14 +638,13 @@ static void requeue_deferred_cells(struct thin_c *tc) static void requeue_io(struct thin_c *tc) { struct bio_list bios; - unsigned long flags; bio_list_init(&bios); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); __merge_bio_list(&bios, &tc->deferred_bio_list); __merge_bio_list(&bios, &tc->retry_on_resume_list); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); error_bio_list(&bios, BLK_STS_DM_REQUEUE); requeue_deferred_cells(tc); @@ -756,7 +753,6 @@ static void inc_all_io_entry(struct pool *pool, struct bio *bio) static void issue(struct thin_c *tc, struct bio *bio) { struct pool *pool = tc->pool; - unsigned long flags; if (!bio_triggers_commit(tc, bio)) { generic_make_request(bio); @@ -777,9 +773,9 @@ static void issue(struct thin_c *tc, struct bio *bio) * Batch together any bios that trigger commits and then issue a * single commit for them in process_deferred_bios(). */ - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); bio_list_add(&pool->deferred_flush_bios, bio); - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); } static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) @@ -886,12 +882,15 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c { struct pool *pool = tc->pool; unsigned long flags; + int has_work; spin_lock_irqsave(&tc->lock, flags); cell_release_no_holder(pool, cell, &tc->deferred_bio_list); + has_work = !bio_list_empty(&tc->deferred_bio_list); spin_unlock_irqrestore(&tc->lock, flags); - wake_worker(pool); + if (has_work) + wake_worker(pool); } static void thin_defer_bio(struct thin_c *tc, struct bio *bio); @@ -960,7 +959,6 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio) { struct pool *pool = tc->pool; - unsigned long flags; /* * If the bio has the REQ_FUA flag set we must commit the metadata @@ -985,9 +983,9 @@ static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio) * Batch together any bios that trigger commits and then issue a * single commit for them in process_deferred_bios(). */ - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); bio_list_add(&pool->deferred_flush_completions, bio); - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); } static void process_prepared_mapping(struct dm_thin_new_mapping *m) @@ -1226,14 +1224,13 @@ static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m) static void process_prepared(struct pool *pool, struct list_head *head, process_mapping_fn *fn) { - unsigned long flags; struct list_head maps; struct dm_thin_new_mapping *m, *tmp; INIT_LIST_HEAD(&maps); - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); list_splice_init(head, &maps); - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); list_for_each_entry_safe(m, tmp, &maps, list) (*fn)(m); @@ -1510,14 +1507,12 @@ static int commit(struct pool *pool) static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks) { - unsigned long flags; - if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { DMWARN("%s: reached low water mark for data device: sending event.", dm_device_name(pool->pool_md)); - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); pool->low_water_triggered = true; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); dm_table_event(pool->ti->table); } } @@ -1593,11 +1588,10 @@ static void retry_on_resume(struct bio *bio) { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); struct thin_c *tc = h->tc; - unsigned long flags; - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); bio_list_add(&tc->retry_on_resume_list, bio); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); } static blk_status_t should_error_unserviceable_bio(struct pool *pool) @@ -2170,7 +2164,6 @@ static void __sort_thin_deferred_bios(struct thin_c *tc) static void process_thin_deferred_bios(struct thin_c *tc) { struct pool *pool = tc->pool; - unsigned long flags; struct bio *bio; struct bio_list bios; struct blk_plug plug; @@ -2184,10 +2177,10 @@ static void process_thin_deferred_bios(struct thin_c *tc) bio_list_init(&bios); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); if (bio_list_empty(&tc->deferred_bio_list)) { - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); return; } @@ -2196,7 +2189,7 @@ static void process_thin_deferred_bios(struct thin_c *tc) bio_list_merge(&bios, &tc->deferred_bio_list); bio_list_init(&tc->deferred_bio_list); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); blk_start_plug(&plug); while ((bio = bio_list_pop(&bios))) { @@ -2206,10 +2199,10 @@ static void process_thin_deferred_bios(struct thin_c *tc) * prepared mappings to process. */ if (ensure_next_mapping(pool)) { - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); bio_list_add(&tc->deferred_bio_list, bio); bio_list_merge(&tc->deferred_bio_list, &bios); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); break; } @@ -2264,16 +2257,15 @@ static unsigned sort_cells(struct pool *pool, struct list_head *cells) static void process_thin_deferred_cells(struct thin_c *tc) { struct pool *pool = tc->pool; - unsigned long flags; struct list_head cells; struct dm_bio_prison_cell *cell; unsigned i, j, count; INIT_LIST_HEAD(&cells); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); list_splice_init(&tc->deferred_cells, &cells); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); if (list_empty(&cells)) return; @@ -2294,9 +2286,9 @@ static void process_thin_deferred_cells(struct thin_c *tc) for (j = i; j < count; j++) list_add(&pool->cell_sort_array[j]->user_list, &cells); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); list_splice(&cells, &tc->deferred_cells); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); return; } @@ -2349,7 +2341,6 @@ static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc) static void process_deferred_bios(struct pool *pool) { - unsigned long flags; struct bio *bio; struct bio_list bios, bio_completions; struct thin_c *tc; @@ -2368,13 +2359,13 @@ static void process_deferred_bios(struct pool *pool) bio_list_init(&bios); bio_list_init(&bio_completions); - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); bio_list_merge(&bios, &pool->deferred_flush_bios); bio_list_init(&pool->deferred_flush_bios); bio_list_merge(&bio_completions, &pool->deferred_flush_completions); bio_list_init(&pool->deferred_flush_completions); - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) && !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool))) @@ -2657,12 +2648,11 @@ static void metadata_operation_failed(struct pool *pool, const char *op, int r) */ static void thin_defer_bio(struct thin_c *tc, struct bio *bio) { - unsigned long flags; struct pool *pool = tc->pool; - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); bio_list_add(&tc->deferred_bio_list, bio); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); wake_worker(pool); } @@ -2678,13 +2668,12 @@ static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio) static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) { - unsigned long flags; struct pool *pool = tc->pool; throttle_lock(&pool->throttle); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); list_add_tail(&cell->user_list, &tc->deferred_cells); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); throttle_unlock(&pool->throttle); wake_worker(pool); @@ -2810,15 +2799,14 @@ static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) static void requeue_bios(struct pool *pool) { - unsigned long flags; struct thin_c *tc; rcu_read_lock(); list_for_each_entry_rcu(tc, &pool->active_thins, list) { - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list); bio_list_init(&tc->retry_on_resume_list); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); } rcu_read_unlock(); } @@ -3412,15 +3400,14 @@ static int pool_map(struct dm_target *ti, struct bio *bio) int r; struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - unsigned long flags; /* * As this is a singleton target, ti->begin is always zero. */ - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); bio_set_dev(bio, pt->data_dev->bdev); r = DM_MAPIO_REMAPPED; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); return r; } @@ -3591,7 +3578,6 @@ static void pool_resume(struct dm_target *ti) { struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - unsigned long flags; /* * Must requeue active_thins' bios and then resume @@ -3600,10 +3586,10 @@ static void pool_resume(struct dm_target *ti) requeue_bios(pool); pool_resume_active_thins(pool); - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); pool->low_water_triggered = false; pool->suspended = false; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); do_waker(&pool->waker.work); } @@ -3612,11 +3598,10 @@ static void pool_presuspend(struct dm_target *ti) { struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - unsigned long flags; - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); pool->suspended = true; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); pool_suspend_active_thins(pool); } @@ -3625,13 +3610,12 @@ static void pool_presuspend_undo(struct dm_target *ti) { struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - unsigned long flags; pool_resume_active_thins(pool); - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); pool->suspended = false; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); } static void pool_postsuspend(struct dm_target *ti) @@ -4110,11 +4094,10 @@ static void thin_put(struct thin_c *tc) static void thin_dtr(struct dm_target *ti) { struct thin_c *tc = ti->private; - unsigned long flags; - spin_lock_irqsave(&tc->pool->lock, flags); + spin_lock_irq(&tc->pool->lock); list_del_rcu(&tc->list); - spin_unlock_irqrestore(&tc->pool->lock, flags); + spin_unlock_irq(&tc->pool->lock); synchronize_rcu(); thin_put(tc); @@ -4150,7 +4133,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) struct thin_c *tc; struct dm_dev *pool_dev, *origin_dev; struct mapped_device *pool_md; - unsigned long flags; mutex_lock(&dm_thin_pool_table.mutex); @@ -4244,9 +4226,9 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) mutex_unlock(&dm_thin_pool_table.mutex); - spin_lock_irqsave(&tc->pool->lock, flags); + spin_lock_irq(&tc->pool->lock); if (tc->pool->suspended) { - spin_unlock_irqrestore(&tc->pool->lock, flags); + spin_unlock_irq(&tc->pool->lock); mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */ ti->error = "Unable to activate thin device while pool is suspended"; r = -EINVAL; @@ -4255,7 +4237,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) refcount_set(&tc->refcount, 1); init_completion(&tc->can_destroy); list_add_tail_rcu(&tc->list, &tc->pool->active_thins); - spin_unlock_irqrestore(&tc->pool->lock, flags); + spin_unlock_irq(&tc->pool->lock); /* * This synchronize_rcu() call is needed here otherwise we risk a * wake_worker() call finding no bios to process (because the newly diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index d06b8aa41e26..7d727a72aa13 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1218,7 +1218,8 @@ bio_copy: } } while (bio->bi_iter.bi_size); - if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) + if (unlikely(bio->bi_opf & REQ_FUA || + wc->uncommitted_blocks >= wc->autocommit_blocks)) writecache_flush(wc); else writecache_schedule_autocommit(wc); @@ -1561,7 +1562,7 @@ static void writecache_writeback(struct work_struct *work) { struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work); struct blk_plug plug; - struct wc_entry *f, *g, *e = NULL; + struct wc_entry *f, *uninitialized_var(g), *e = NULL; struct rb_node *node, *next_node; struct list_head skipped; struct writeback_list wbl; diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 595a73110e17..22b3cb0050a7 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -554,6 +554,7 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd, TASK_UNINTERRUPTIBLE); if (test_bit(DMZ_META_ERROR, &mblk->state)) { dmz_release_mblock(zmd, mblk); + dmz_check_bdev(zmd->dev); return ERR_PTR(-EIO); } @@ -625,6 +626,8 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block, ret = submit_bio_wait(bio); bio_put(bio); + if (ret) + dmz_check_bdev(zmd->dev); return ret; } @@ -691,6 +694,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, TASK_UNINTERRUPTIBLE); if (test_bit(DMZ_META_ERROR, &mblk->state)) { clear_bit(DMZ_META_ERROR, &mblk->state); + dmz_check_bdev(zmd->dev); ret = -EIO; } nr_mblks_submitted--; @@ -768,7 +772,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) /* If there are no dirty metadata blocks, just flush the device cache */ if (list_empty(&write_list)) { ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); - goto out; + goto err; } /* @@ -778,7 +782,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) */ ret = dmz_log_dirty_mblocks(zmd, &write_list); if (ret) - goto out; + goto err; /* * The log is on disk. It is now safe to update in place @@ -786,11 +790,11 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) */ ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary); if (ret) - goto out; + goto err; ret = dmz_write_sb(zmd, zmd->mblk_primary); if (ret) - goto out; + goto err; while (!list_empty(&write_list)) { mblk = list_first_entry(&write_list, struct dmz_mblock, link); @@ -805,16 +809,20 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) zmd->sb_gen++; out: - if (ret && !list_empty(&write_list)) { - spin_lock(&zmd->mblk_lock); - list_splice(&write_list, &zmd->mblk_dirty_list); - spin_unlock(&zmd->mblk_lock); - } - dmz_unlock_flush(zmd); up_write(&zmd->mblk_sem); return ret; + +err: + if (!list_empty(&write_list)) { + spin_lock(&zmd->mblk_lock); + list_splice(&write_list, &zmd->mblk_dirty_list); + spin_unlock(&zmd->mblk_lock); + } + if (!dmz_check_bdev(zmd->dev)) + ret = -EIO; + goto out; } /* @@ -1080,9 +1088,10 @@ static int dmz_load_sb(struct dmz_metadata *zmd) /* * Initialize a zone descriptor. */ -static int dmz_init_zone(struct dmz_metadata *zmd, struct dm_zone *zone, - struct blk_zone *blkz) +static int dmz_init_zone(struct blk_zone *blkz, unsigned int idx, void *data) { + struct dmz_metadata *zmd = data; + struct dm_zone *zone = &zmd->zones[idx]; struct dmz_dev *dev = zmd->dev; /* Ignore the eventual last runt (smaller) zone */ @@ -1096,26 +1105,29 @@ static int dmz_init_zone(struct dmz_metadata *zmd, struct dm_zone *zone, atomic_set(&zone->refcount, 0); zone->chunk = DMZ_MAP_UNMAPPED; - if (blkz->type == BLK_ZONE_TYPE_CONVENTIONAL) { + switch (blkz->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: set_bit(DMZ_RND, &zone->flags); zmd->nr_rnd_zones++; - } else if (blkz->type == BLK_ZONE_TYPE_SEQWRITE_REQ || - blkz->type == BLK_ZONE_TYPE_SEQWRITE_PREF) { + break; + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: set_bit(DMZ_SEQ, &zone->flags); - } else + break; + default: return -ENXIO; - - if (blkz->cond == BLK_ZONE_COND_OFFLINE) - set_bit(DMZ_OFFLINE, &zone->flags); - else if (blkz->cond == BLK_ZONE_COND_READONLY) - set_bit(DMZ_READ_ONLY, &zone->flags); + } if (dmz_is_rnd(zone)) zone->wp_block = 0; else zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start); - if (!dmz_is_offline(zone) && !dmz_is_readonly(zone)) { + if (blkz->cond == BLK_ZONE_COND_OFFLINE) + set_bit(DMZ_OFFLINE, &zone->flags); + else if (blkz->cond == BLK_ZONE_COND_READONLY) + set_bit(DMZ_READ_ONLY, &zone->flags); + else { zmd->nr_useable_zones++; if (dmz_is_rnd(zone)) { zmd->nr_rnd_zones++; @@ -1139,23 +1151,13 @@ static void dmz_drop_zones(struct dmz_metadata *zmd) } /* - * The size of a zone report in number of zones. - * This results in 4096*64B=256KB report zones commands. - */ -#define DMZ_REPORT_NR_ZONES 4096 - -/* * Allocate and initialize zone descriptors using the zone * information from disk. */ static int dmz_init_zones(struct dmz_metadata *zmd) { struct dmz_dev *dev = zmd->dev; - struct dm_zone *zone; - struct blk_zone *blkz; - unsigned int nr_blkz; - sector_t sector = 0; - int i, ret = 0; + int ret; /* Init */ zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3; @@ -1169,54 +1171,38 @@ static int dmz_init_zones(struct dmz_metadata *zmd) dmz_dev_info(dev, "Using %zu B for zone information", sizeof(struct dm_zone) * dev->nr_zones); - /* Get zone information */ - nr_blkz = DMZ_REPORT_NR_ZONES; - blkz = kcalloc(nr_blkz, sizeof(struct blk_zone), GFP_KERNEL); - if (!blkz) { - ret = -ENOMEM; - goto out; - } - /* - * Get zone information and initialize zone descriptors. - * At the same time, determine where the super block - * should be: first block of the first randomly writable - * zone. + * Get zone information and initialize zone descriptors. At the same + * time, determine where the super block should be: first block of the + * first randomly writable zone. */ - zone = zmd->zones; - while (sector < dev->capacity) { - /* Get zone information */ - nr_blkz = DMZ_REPORT_NR_ZONES; - ret = blkdev_report_zones(dev->bdev, sector, blkz, &nr_blkz); - if (ret) { - dmz_dev_err(dev, "Report zones failed %d", ret); - goto out; - } + ret = blkdev_report_zones(dev->bdev, 0, BLK_ALL_ZONES, dmz_init_zone, + zmd); + if (ret < 0) { + dmz_drop_zones(zmd); + return ret; + } - if (!nr_blkz) - break; + return 0; +} - /* Process report */ - for (i = 0; i < nr_blkz; i++) { - ret = dmz_init_zone(zmd, zone, &blkz[i]); - if (ret) - goto out; - sector += dev->zone_nr_sectors; - zone++; - } - } +static int dmz_update_zone_cb(struct blk_zone *blkz, unsigned int idx, + void *data) +{ + struct dm_zone *zone = data; - /* The entire zone configuration of the disk should now be known */ - if (sector < dev->capacity) { - dmz_dev_err(dev, "Failed to get correct zone information"); - ret = -ENXIO; - } -out: - kfree(blkz); - if (ret) - dmz_drop_zones(zmd); + clear_bit(DMZ_OFFLINE, &zone->flags); + clear_bit(DMZ_READ_ONLY, &zone->flags); + if (blkz->cond == BLK_ZONE_COND_OFFLINE) + set_bit(DMZ_OFFLINE, &zone->flags); + else if (blkz->cond == BLK_ZONE_COND_READONLY) + set_bit(DMZ_READ_ONLY, &zone->flags); - return ret; + if (dmz_is_seq(zone)) + zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start); + else + zone->wp_block = 0; + return 0; } /* @@ -1224,9 +1210,7 @@ out: */ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) { - unsigned int nr_blkz = 1; unsigned int noio_flag; - struct blk_zone blkz; int ret; /* @@ -1236,29 +1220,19 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) * GFP_NOIO was specified. */ noio_flag = memalloc_noio_save(); - ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone), - &blkz, &nr_blkz); + ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone), 1, + dmz_update_zone_cb, zone); memalloc_noio_restore(noio_flag); - if (!nr_blkz) + + if (ret == 0) ret = -EIO; - if (ret) { + if (ret < 0) { dmz_dev_err(zmd->dev, "Get zone %u report failed", dmz_id(zmd, zone)); + dmz_check_bdev(zmd->dev); return ret; } - clear_bit(DMZ_OFFLINE, &zone->flags); - clear_bit(DMZ_READ_ONLY, &zone->flags); - if (blkz.cond == BLK_ZONE_COND_OFFLINE) - set_bit(DMZ_OFFLINE, &zone->flags); - else if (blkz.cond == BLK_ZONE_COND_READONLY) - set_bit(DMZ_READ_ONLY, &zone->flags); - - if (dmz_is_seq(zone)) - zone->wp_block = dmz_sect2blk(blkz.wp - blkz.start); - else - zone->wp_block = 0; - return 0; } @@ -1312,9 +1286,9 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) { struct dmz_dev *dev = zmd->dev; - ret = blkdev_reset_zones(dev->bdev, - dmz_start_sect(zmd, zone), - dev->zone_nr_sectors, GFP_NOIO); + ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET, + dmz_start_sect(zmd, zone), + dev->zone_nr_sectors, GFP_NOIO); if (ret) { dmz_dev_err(dev, "Reset zone %u failed %d", dmz_id(zmd, zone), ret); diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index d240d7ca8a8a..e7ace908a9b7 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -82,6 +82,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone, "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d", dmz_id(zmd, zone), (unsigned long long)wp_block, (unsigned long long)block, nr_blocks, ret); + dmz_check_bdev(zrc->dev); return ret; } @@ -489,12 +490,7 @@ static void dmz_reclaim_work(struct work_struct *work) ret = dmz_do_reclaim(zrc); if (ret) { dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret); - if (ret == -EIO) - /* - * LLD might be performing some error handling sequence - * at the underlying device. To not interfere, do not - * attempt to schedule the next reclaim run immediately. - */ + if (!dmz_check_bdev(zrc->dev)) return; } diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index d3bcc4197f5d..4574e0dedbd6 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -80,6 +80,8 @@ static inline void dmz_bio_endio(struct bio *bio, blk_status_t status) if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK) bio->bi_status = status; + if (bio->bi_status != BLK_STS_OK) + bioctx->target->dev->flags |= DMZ_CHECK_BDEV; if (refcount_dec_and_test(&bioctx->ref)) { struct dm_zone *zone = bioctx->zone; @@ -565,32 +567,52 @@ out: } /* - * Check the backing device availability. If it's on the way out, + * Check if the backing device is being removed. If it's on the way out, * start failing I/O. Reclaim and metadata components also call this * function to cleanly abort operation in the event of such failure. */ bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev) { - struct gendisk *disk; + if (dmz_dev->flags & DMZ_BDEV_DYING) + return true; - if (!(dmz_dev->flags & DMZ_BDEV_DYING)) { - disk = dmz_dev->bdev->bd_disk; - if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) { - dmz_dev_warn(dmz_dev, "Backing device queue dying"); - dmz_dev->flags |= DMZ_BDEV_DYING; - } else if (disk->fops->check_events) { - if (disk->fops->check_events(disk, 0) & - DISK_EVENT_MEDIA_CHANGE) { - dmz_dev_warn(dmz_dev, "Backing device offline"); - dmz_dev->flags |= DMZ_BDEV_DYING; - } - } + if (dmz_dev->flags & DMZ_CHECK_BDEV) + return !dmz_check_bdev(dmz_dev); + + if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) { + dmz_dev_warn(dmz_dev, "Backing device queue dying"); + dmz_dev->flags |= DMZ_BDEV_DYING; } return dmz_dev->flags & DMZ_BDEV_DYING; } /* + * Check the backing device availability. This detects such events as + * backing device going offline due to errors, media removals, etc. + * This check is less efficient than dmz_bdev_is_dying() and should + * only be performed as a part of error handling. + */ +bool dmz_check_bdev(struct dmz_dev *dmz_dev) +{ + struct gendisk *disk; + + dmz_dev->flags &= ~DMZ_CHECK_BDEV; + + if (dmz_bdev_is_dying(dmz_dev)) + return false; + + disk = dmz_dev->bdev->bd_disk; + if (disk->fops->check_events && + disk->fops->check_events(disk, 0) & DISK_EVENT_MEDIA_CHANGE) { + dmz_dev_warn(dmz_dev, "Backing device offline"); + dmz_dev->flags |= DMZ_BDEV_DYING; + } + + return !(dmz_dev->flags & DMZ_BDEV_DYING); +} + +/* * Process a new BIO. */ static int dmz_map(struct dm_target *ti, struct bio *bio) @@ -902,8 +924,8 @@ static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { struct dmz_target *dmz = ti->private; - if (dmz_bdev_is_dying(dmz->dev)) - return -ENODEV; + if (!dmz_check_bdev(dmz->dev)) + return -EIO; *bdev = dmz->dev->bdev; diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index d8e70b0ade35..5b5e493d479c 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -72,6 +72,7 @@ struct dmz_dev { /* Device flags. */ #define DMZ_BDEV_DYING (1 << 0) +#define DMZ_CHECK_BDEV (2 << 0) /* * Zone descriptor. @@ -255,5 +256,6 @@ void dmz_schedule_reclaim(struct dmz_reclaim *zrc); * Functions defined in dm-zoned-target.c */ bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev); +bool dmz_check_bdev(struct dmz_dev *dmz_dev); #endif /* DM_ZONED_H */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 1a5e328c443a..e8f9661a10a1 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -440,14 +440,48 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) return dm_get_geometry(md, geo); } +#ifdef CONFIG_BLK_DEV_ZONED +int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data) +{ + struct dm_report_zones_args *args = data; + sector_t sector_diff = args->tgt->begin - args->start; + + /* + * Ignore zones beyond the target range. + */ + if (zone->start >= args->start + args->tgt->len) + return 0; + + /* + * Remap the start sector and write pointer position of the zone + * to match its position in the target range. + */ + zone->start += sector_diff; + if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) { + if (zone->cond == BLK_ZONE_COND_FULL) + zone->wp = zone->start + zone->len; + else if (zone->cond == BLK_ZONE_COND_EMPTY) + zone->wp = zone->start; + else + zone->wp += sector_diff; + } + + args->next_sector = zone->start + zone->len; + return args->orig_cb(zone, args->zone_idx++, args->orig_data); +} +EXPORT_SYMBOL_GPL(dm_report_zones_cb); + static int dm_blk_report_zones(struct gendisk *disk, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones) + unsigned int nr_zones, report_zones_cb cb, void *data) { -#ifdef CONFIG_BLK_DEV_ZONED struct mapped_device *md = disk->private_data; - struct dm_target *tgt; struct dm_table *map; int srcu_idx, ret; + struct dm_report_zones_args args = { + .next_sector = sector, + .orig_data = data, + .orig_cb = cb, + }; if (dm_suspended_md(md)) return -EAGAIN; @@ -456,38 +490,30 @@ static int dm_blk_report_zones(struct gendisk *disk, sector_t sector, if (!map) return -EIO; - tgt = dm_table_find_target(map, sector); - if (!tgt) { - ret = -EIO; - goto out; - } + do { + struct dm_target *tgt; - /* - * If we are executing this, we already know that the block device - * is a zoned device and so each target should have support for that - * type of drive. A missing report_zones method means that the target - * driver has a problem. - */ - if (WARN_ON(!tgt->type->report_zones)) { - ret = -EIO; - goto out; - } + tgt = dm_table_find_target(map, args.next_sector); + if (WARN_ON_ONCE(!tgt->type->report_zones)) { + ret = -EIO; + goto out; + } - /* - * blkdev_report_zones() will loop and call this again to cover all the - * zones of the target, eventually moving on to the next target. - * So there is no need to loop here trying to fill the entire array - * of zones. - */ - ret = tgt->type->report_zones(tgt, sector, zones, nr_zones); + args.tgt = tgt; + ret = tgt->type->report_zones(tgt, &args, nr_zones); + if (ret < 0) + goto out; + } while (args.zone_idx < nr_zones && + args.next_sector < get_capacity(disk)); + ret = args.zone_idx; out: dm_put_live_table(md, srcu_idx); return ret; -#else - return -ENOTSUPP; -#endif } +#else +#define dm_blk_report_zones NULL +#endif /* CONFIG_BLK_DEV_ZONED */ static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, struct block_device **bdev) @@ -1174,7 +1200,8 @@ static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, /* * A target may call dm_accept_partial_bio only from the map routine. It is - * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET. + * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_RESET, + * REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH. * * dm_accept_partial_bio informs the dm that the target only wants to process * additional n_sectors sectors of the bio and the rest of the data should be @@ -1212,54 +1239,6 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) } EXPORT_SYMBOL_GPL(dm_accept_partial_bio); -/* - * The zone descriptors obtained with a zone report indicate - * zone positions within the underlying device of the target. The zone - * descriptors must be remapped to match their position within the dm device. - * The caller target should obtain the zones information using - * blkdev_report_zones() to ensure that remapping for partition offset is - * already handled. - */ -void dm_remap_zone_report(struct dm_target *ti, sector_t start, - struct blk_zone *zones, unsigned int *nr_zones) -{ -#ifdef CONFIG_BLK_DEV_ZONED - struct blk_zone *zone; - unsigned int nrz = *nr_zones; - int i; - - /* - * Remap the start sector and write pointer position of the zones in - * the array. Since we may have obtained from the target underlying - * device more zones that the target size, also adjust the number - * of zones. - */ - for (i = 0; i < nrz; i++) { - zone = zones + i; - if (zone->start >= start + ti->len) { - memset(zone, 0, sizeof(struct blk_zone) * (nrz - i)); - break; - } - - zone->start = zone->start + ti->begin - start; - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) - continue; - - if (zone->cond == BLK_ZONE_COND_FULL) - zone->wp = zone->start + zone->len; - else if (zone->cond == BLK_ZONE_COND_EMPTY) - zone->wp = zone->start; - else - zone->wp = zone->wp + ti->begin - start; - } - - *nr_zones = i; -#else /* !CONFIG_BLK_DEV_ZONED */ - *nr_zones = 0; -#endif -} -EXPORT_SYMBOL_GPL(dm_remap_zone_report); - static blk_qc_t __map_bio(struct dm_target_io *tio) { int r; @@ -1627,7 +1606,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, ci.sector_count = 0; error = __send_empty_flush(&ci); /* dec_pending submits any data associated with flush */ - } else if (bio_op(bio) == REQ_OP_ZONE_RESET) { + } else if (op_is_zone_mgmt(bio_op(bio))) { ci.bio = bio; ci.sector_count = 0; error = __split_and_process_non_flush(&ci); diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index b092c7b5282f..3ad18246fcb3 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2139,6 +2139,7 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, memcpy(page_address(store.sb_page), page_address(bitmap->storage.sb_page), sizeof(bitmap_super_t)); + spin_lock_irq(&bitmap->counts.lock); md_bitmap_file_unmap(&bitmap->storage); bitmap->storage = store; @@ -2154,7 +2155,6 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, blocks = min(old_counts.chunks << old_counts.chunkshift, chunks << chunkshift); - spin_lock_irq(&bitmap->counts.lock); /* For cluster raid, need to pre-allocate bitmap */ if (mddev_is_clustered(bitmap->mddev)) { unsigned long page; diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index c766c559d36d..26c75c0199fa 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -244,10 +244,9 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) sector_t start_sector, end_sector, data_offset; sector_t bio_sector = bio->bi_iter.bi_sector; - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } tmp_dev = which_dev(mddev, bio_sector); start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 6780938d2991..152f9e65a226 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -104,10 +104,9 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio) struct multipath_bh * mp_bh; struct multipath_info *multipath; - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } mp_bh = mempool_alloc(&conf->pool, GFP_NOIO); diff --git a/drivers/md/md.c b/drivers/md/md.c index 1be7abeb24fd..805b33e27496 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -550,7 +550,13 @@ static void md_submit_flush_data(struct work_struct *ws) } } -void md_flush_request(struct mddev *mddev, struct bio *bio) +/* + * Manages consolidation of flushes and submitting any flushes needed for + * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is + * being finished in another context. Returns false if the flushing is + * complete but still needs the I/O portion of the bio to be processed. + */ +bool md_flush_request(struct mddev *mddev, struct bio *bio) { ktime_t start = ktime_get_boottime(); spin_lock_irq(&mddev->lock); @@ -575,9 +581,10 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) bio_endio(bio); else { bio->bi_opf &= ~REQ_PREFLUSH; - mddev->pers->make_request(mddev, bio); + return false; } } + return true; } EXPORT_SYMBOL(md_flush_request); @@ -1098,6 +1105,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; mdp_super_t *sb; int ret; + bool spare_disk = true; /* * Calculate the position of the superblock (512byte sectors), @@ -1148,8 +1156,18 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor else rdev->desc_nr = sb->this_disk.number; + /* not spare disk, or LEVEL_MULTIPATH */ + if (sb->level == LEVEL_MULTIPATH || + (rdev->desc_nr >= 0 && + sb->disks[rdev->desc_nr].state & + ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))) + spare_disk = false; + if (!refdev) { - ret = 1; + if (!spare_disk) + ret = 1; + else + ret = 0; } else { __u64 ev1, ev2; mdp_super_t *refsb = page_address(refdev->sb_page); @@ -1165,7 +1183,8 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor } ev1 = md_event(sb); ev2 = md_event(refsb); - if (ev1 > ev2) + + if (!spare_disk && ev1 > ev2) ret = 1; else ret = 0; @@ -1525,6 +1544,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ sector_t sectors; char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; int bmask; + bool spare_disk = true; /* * Calculate the position of the superblock in 512byte sectors. @@ -1658,8 +1678,19 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ sb->level != 0) return -EINVAL; + /* not spare disk, or LEVEL_MULTIPATH */ + if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) || + (rdev->desc_nr >= 0 && + rdev->desc_nr < le32_to_cpu(sb->max_dev) && + (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))) + spare_disk = false; + if (!refdev) { - ret = 1; + if (!spare_disk) + ret = 1; + else + ret = 0; } else { __u64 ev1, ev2; struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); @@ -1676,7 +1707,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ ev1 = le64_to_cpu(sb->events); ev2 = le64_to_cpu(refsb->events); - if (ev1 > ev2) + if (!spare_disk && ev1 > ev2) ret = 1; else ret = 0; @@ -3597,7 +3628,7 @@ abort_free: * Check a full RAID array for plausibility */ -static void analyze_sbs(struct mddev *mddev) +static int analyze_sbs(struct mddev *mddev) { int i; struct md_rdev *rdev, *freshest, *tmp; @@ -3618,6 +3649,12 @@ static void analyze_sbs(struct mddev *mddev) md_kick_rdev_from_array(rdev); } + /* Cannot find a valid fresh disk */ + if (!freshest) { + pr_warn("md: cannot find a valid disk\n"); + return -EINVAL; + } + super_types[mddev->major_version]. validate_super(mddev, freshest); @@ -3652,6 +3689,8 @@ static void analyze_sbs(struct mddev *mddev) clear_bit(In_sync, &rdev->flags); } } + + return 0; } /* Read a fixed-point number. @@ -5570,7 +5609,9 @@ int md_run(struct mddev *mddev) if (!mddev->raid_disks) { if (!mddev->persistent) return -EINVAL; - analyze_sbs(mddev); + err = analyze_sbs(mddev); + if (err) + return -EINVAL; } if (mddev->level != LEVEL_NONE) diff --git a/drivers/md/md.h b/drivers/md/md.h index c5e3ff398b59..5f86f8adb0a4 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -550,7 +550,7 @@ struct md_personality int level; struct list_head list; struct module *owner; - bool (*make_request)(struct mddev *mddev, struct bio *bio); + bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio); /* * start up works that do NOT require md_thread. tasks that * requires md_thread should go into start() @@ -703,7 +703,7 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); extern int mddev_congested(struct mddev *mddev, int bits); -extern void md_flush_request(struct mddev *mddev, struct bio *bio); +extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, sector_t sector, int size, struct page *page); extern int md_super_wait(struct mddev *mddev); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f61693e59684..b7c20979bd19 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -154,7 +154,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } else { pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n", mdname(mddev)); - pr_err("md/raid0: please set raid.default_layout to 1 or 2\n"); + pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n"); err = -ENOTSUPP; goto abort; } @@ -575,10 +575,9 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) unsigned chunk_sects; unsigned sectors; - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) { raid0_handle_discard(mddev, bio); @@ -615,7 +614,7 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) tmp_dev = map_sector(mddev, zone, sector, §or); break; default: - WARN("md/raid0:%s: Invalid layout\n", mdname(mddev)); + WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev)); bio_io_error(bio); return true; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 0466ee2453b4..a409ab6f30bc 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -819,6 +819,7 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio) else generic_make_request(bio); bio = next; + cond_resched(); } } @@ -1567,10 +1568,9 @@ static bool raid1_make_request(struct mddev *mddev, struct bio *bio) { sector_t sectors; - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } /* * There is a limit to the maximum size, but diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 299c7b1c9718..ec136e44aef7 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -191,7 +191,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) out_free_pages: while (--j >= 0) - resync_free_pages(&rps[j * 2]); + resync_free_pages(&rps[j]); j = 0; out_free_bio: @@ -1525,10 +1525,9 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio) int chunk_sects = chunk_mask + 1; int sectors = bio_sectors(bio); - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } if (!md_write_start(mddev, bio)) return false; diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 18a4064a61a8..cab5b1352892 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1404,7 +1404,7 @@ int ppl_init_log(struct r5conf *conf) atomic64_set(&ppl_conf->seq, 0); INIT_LIST_HEAD(&ppl_conf->no_mem_stripes); spin_lock_init(&ppl_conf->no_mem_stripes_lock); - ppl_conf->write_hint = RWF_WRITE_LIFE_NOT_SET; + ppl_conf->write_hint = RWH_WRITE_LIFE_NOT_SET; if (!mddev->external) { ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid)); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 223e97ab27e6..f0fc538bfe59 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1134,7 +1134,7 @@ again: bi->bi_iter.bi_size = STRIPE_SIZE; bi->bi_write_hint = sh->dev[i].write_hint; if (!rrdev) - sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET; + sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET; /* * If this is discard request, set bi_vcnt 0. We don't * want to confuse SCSI because SCSI will replace payload @@ -1187,7 +1187,7 @@ again: rbi->bi_io_vec[0].bv_offset = 0; rbi->bi_iter.bi_size = STRIPE_SIZE; rbi->bi_write_hint = sh->dev[i].write_hint; - sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET; + sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET; /* * If this is discard request, set bi_vcnt 0. We don't * want to confuse SCSI because SCSI will replace payload @@ -5592,8 +5592,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) if (ret == 0) return true; if (ret == -ENODEV) { - md_flush_request(mddev, bi); - return true; + if (md_flush_request(mddev, bi)) + return true; } /* ret == -EAGAIN, fallback */ /* |