Diffstat (limited to 'drivers/md')
52 files changed, 2095 insertions, 900 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 83b9362be09c..2c8ac3688815 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -269,6 +269,13 @@ config DM_BIO_PRISON source "drivers/md/persistent-data/Kconfig" +config DM_UNSTRIPED + tristate "Unstriped target" + depends on BLK_DEV_DM + ---help--- + Unstripes I/O so it is issued solely on a single drive in a HW + RAID0 or dm-striped target. + config DM_CRYPT tristate "Crypt target support" depends on BLK_DEV_DM diff --git a/drivers/md/Makefile b/drivers/md/Makefile index f701bb211783..63255f3ebd97 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_BCACHE) += bcache/ obj-$(CONFIG_BLK_DEV_MD) += md-mod.o obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o obj-$(CONFIG_BLK_DEV_DM_BUILTIN) += dm-builtin.o +obj-$(CONFIG_DM_UNSTRIPED) += dm-unstripe.o obj-$(CONFIG_DM_BUFIO) += dm-bufio.o obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index a0cc1bc6d884..458e1d38577d 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -287,8 +287,10 @@ do { \ break; \ \ mutex_unlock(&(ca)->set->bucket_lock); \ - if (kthread_should_stop()) \ + if (kthread_should_stop()) { \ + set_current_state(TASK_RUNNING); \ return 0; \ + } \ \ schedule(); \ mutex_lock(&(ca)->set->bucket_lock); \ @@ -525,15 +527,21 @@ struct open_bucket { /* * We keep multiple buckets open for writes, and try to segregate different - * write streams for better cache utilization: first we look for a bucket where - * the last write to it was sequential with the current write, and failing that - * we look for a bucket that was last used by the same task. + * write streams for better cache utilization: first we try to segregate flash + * only volume write streams from cached devices, secondly we look for a bucket + * where the last write to it was sequential with the current write, and + * failing that we look for a bucket that was last used by the same task. * * The ideas is if you've got multiple tasks pulling data into the cache at the * same time, you'll get better cache utilization if you try to segregate their * data and preserve locality. * - * For example, say you've starting Firefox at the same time you're copying a + * For example, dirty sectors of flash only volume is not reclaimable, if their + * dirty sectors mixed with dirty sectors of cached device, such buckets will + * be marked as dirty and won't be reclaimed, though the dirty data of cached + * device have been written back to backend device. + * + * And say you've starting Firefox at the same time you're copying a * bunch of files. Firefox will likely end up being fairly hot and stay in the * cache awhile, but the data you copied might not be; if you wrote all that * data to the same buckets it'd get invalidated at the same time. 
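Editor's note: the bch_allocator_thread() hunk above (and the matching bch_writeback_thread() change later in this series) fixes the same kthread idiom: once set_current_state(TASK_INTERRUPTIBLE) has been called, every exit path, including the kthread_should_stop() one, must restore TASK_RUNNING before returning. A minimal, generic sketch of that pattern follows; it is not bcache code, and work_available()/do_work() are hypothetical placeholders.

#include <linux/kthread.h>
#include <linux/sched.h>

/* Generic kthread loop: restore TASK_RUNNING on every exit path. */
static int example_thread(void *arg)
{
	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);

		if (kthread_should_stop()) {
			/* The fix above: don't return while still "sleeping". */
			__set_current_state(TASK_RUNNING);
			return 0;
		}

		if (work_available(arg)) {		/* hypothetical helper */
			__set_current_state(TASK_RUNNING);
			do_work(arg);			/* hypothetical helper */
			continue;
		}

		schedule();
	}
}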
@@ -550,7 +558,10 @@ static struct open_bucket *pick_data_bucket(struct cache_set *c, struct open_bucket *ret, *ret_task = NULL; list_for_each_entry_reverse(ret, &c->data_buckets, list) - if (!bkey_cmp(&ret->key, search)) + if (UUID_FLASH_ONLY(&c->uuids[KEY_INODE(&ret->key)]) != + UUID_FLASH_ONLY(&c->uuids[KEY_INODE(search)])) + continue; + else if (!bkey_cmp(&ret->key, search)) goto found; else if (ret->last_write_point == write_point) ret_task = ret; diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 843877e017e1..12e5197f186c 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -320,14 +320,15 @@ struct cached_dev { */ atomic_t has_dirty; - struct bch_ratelimit writeback_rate; - struct delayed_work writeback_rate_update; - /* - * Internal to the writeback code, so read_dirty() can keep track of - * where it's at. + * Set to zero by things that touch the backing volume-- except + * writeback. Incremented by writeback. Used to determine when to + * accelerate idle writeback. */ - sector_t last_read; + atomic_t backing_idle; + + struct bch_ratelimit writeback_rate; + struct delayed_work writeback_rate_update; /* Limit number of writeback bios in flight */ struct semaphore in_flight; @@ -336,6 +337,14 @@ struct cached_dev { struct keybuf writeback_keys; + /* + * Order the write-half of writeback operations strongly in dispatch + * order. (Maintain LBA order; don't allow reads completing out of + * order to re-order the writes...) + */ + struct closure_waitlist writeback_ordering_wait; + atomic_t writeback_sequence_next; + /* For tracking sequential IO */ #define RECENT_IO_BITS 7 #define RECENT_IO (1 << RECENT_IO_BITS) @@ -488,6 +497,7 @@ struct cache_set { int caches_loaded; struct bcache_device **devices; + unsigned devices_max_used; struct list_head cached_devs; uint64_t cached_dev_sectors; struct closure caching; @@ -648,10 +658,15 @@ struct cache_set { atomic_long_t writeback_keys_done; atomic_long_t writeback_keys_failed; + atomic_long_t reclaim; + atomic_long_t flush_write; + atomic_long_t retry_flush_write; + enum { ON_ERROR_UNREGISTER, ON_ERROR_PANIC, } on_error; +#define DEFAULT_IO_ERROR_LIMIT 8 unsigned error_limit; unsigned error_decay; @@ -665,6 +680,8 @@ struct cache_set { #define BUCKET_HASH_BITS 12 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; + + DECLARE_HEAP(struct btree *, flush_btree); }; struct bbio { @@ -852,7 +869,7 @@ static inline void wake_up_allocators(struct cache_set *c) /* Forward declarations */ -void bch_count_io_errors(struct cache *, blk_status_t, const char *); +void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); void bch_bbio_count_io_errors(struct cache_set *, struct bio *, blk_status_t, const char *); void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t, @@ -907,7 +924,7 @@ void bcache_write_super(struct cache_set *); int bch_flash_dev_create(struct cache_set *c, uint64_t size); -int bch_cached_dev_attach(struct cached_dev *, struct cache_set *); +int bch_cached_dev_attach(struct cached_dev *, struct cache_set *, uint8_t *); void bch_cached_dev_detach(struct cached_dev *); void bch_cached_dev_run(struct cached_dev *); void bcache_device_stop(struct bcache_device *); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 81e8dc3dbe5e..fad9fe8817eb 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -419,7 +419,7 @@ static void do_btree_node_write(struct btree *b) SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + 
bset_sector_offset(&b->keys, i)); - if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { + if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { int j; struct bio_vec *bv; void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); @@ -432,6 +432,7 @@ static void do_btree_node_write(struct btree *b) continue_at(cl, btree_node_write_done, NULL); } else { + /* No problem for multipage bvec since the bio is just allocated */ b->bio->bi_vcnt = 0; bch_bio_map(b->bio, i); @@ -1678,7 +1679,7 @@ static void bch_btree_gc_finish(struct cache_set *c) /* don't reclaim buckets to which writeback keys point */ rcu_read_lock(); - for (i = 0; i < c->nr_uuids; i++) { + for (i = 0; i < c->devices_max_used; i++) { struct bcache_device *d = c->devices[i]; struct cached_dev *dc; struct keybuf_key *w, *n; @@ -1803,10 +1804,7 @@ static int bch_gc_thread(void *arg) int bch_gc_thread_start(struct cache_set *c) { c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); - if (IS_ERR(c->gc_thread)) - return PTR_ERR(c->gc_thread); - - return 0; + return PTR_ERR_OR_ZERO(c->gc_thread); } /* Initial partial gc */ @@ -1871,14 +1869,17 @@ void bch_initial_gc_finish(struct cache_set *c) */ for_each_cache(ca, c, i) { for_each_bucket(b, ca) { - if (fifo_full(&ca->free[RESERVE_PRIO])) + if (fifo_full(&ca->free[RESERVE_PRIO]) && + fifo_full(&ca->free[RESERVE_BTREE])) break; if (bch_can_invalidate_bucket(ca, b) && !GC_MARK(b)) { __bch_invalidate_one_bucket(ca, b); - fifo_push(&ca->free[RESERVE_PRIO], - b - ca->buckets); + if (!fifo_push(&ca->free[RESERVE_PRIO], + b - ca->buckets)) + fifo_push(&ca->free[RESERVE_BTREE], + b - ca->buckets); } } } diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 1841d0359bac..7f12920c14f7 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -8,6 +8,7 @@ #include <linux/debugfs.h> #include <linux/module.h> #include <linux/seq_file.h> +#include <linux/sched/debug.h> #include "closure.h" @@ -18,10 +19,6 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) BUG_ON(flags & CLOSURE_GUARD_MASK); BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); - /* Must deliver precisely one wakeup */ - if (r == 1 && (flags & CLOSURE_SLEEPING)) - wake_up_process(cl->task); - if (!r) { if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { atomic_set(&cl->remaining, @@ -100,28 +97,34 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) } EXPORT_SYMBOL(closure_wait); -/** - * closure_sync - sleep until a closure has nothing left to wait on - * - * Sleeps until the refcount hits 1 - the thread that's running the closure owns - * the last refcount. 
- */ -void closure_sync(struct closure *cl) +struct closure_syncer { + struct task_struct *task; + int done; +}; + +static void closure_sync_fn(struct closure *cl) { - while (1) { - __closure_start_sleep(cl); - closure_set_ret_ip(cl); + cl->s->done = 1; + wake_up_process(cl->s->task); +} - if ((atomic_read(&cl->remaining) & - CLOSURE_REMAINING_MASK) == 1) - break; +void __sched __closure_sync(struct closure *cl) +{ + struct closure_syncer s = { .task = current }; + cl->s = &s; + continue_at(cl, closure_sync_fn, NULL); + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (s.done) + break; schedule(); } - __closure_end_sleep(cl); + __set_current_state(TASK_RUNNING); } -EXPORT_SYMBOL(closure_sync); +EXPORT_SYMBOL(__closure_sync); #ifdef CONFIG_BCACHE_CLOSURES_DEBUG @@ -168,12 +171,10 @@ static int debug_seq_show(struct seq_file *f, void *data) cl, (void *) cl->ip, cl->fn, cl->parent, r & CLOSURE_REMAINING_MASK); - seq_printf(f, "%s%s%s%s\n", + seq_printf(f, "%s%s\n", test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&cl->work)) ? "Q" : "", - r & CLOSURE_RUNNING ? "R" : "", - r & CLOSURE_STACK ? "S" : "", - r & CLOSURE_SLEEPING ? "Sl" : ""); + r & CLOSURE_RUNNING ? "R" : ""); if (r & CLOSURE_WAITING) seq_printf(f, " W %pF\n", diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index ccfbea6f9f6b..3b9dfc9962ad 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -103,6 +103,7 @@ */ struct closure; +struct closure_syncer; typedef void (closure_fn) (struct closure *); struct closure_waitlist { @@ -115,10 +116,6 @@ enum closure_state { * the thread that owns the closure, and cleared by the thread that's * waking up the closure. * - * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep - * - indicates that cl->task is valid and closure_put() may wake it up. - * Only set or cleared by the thread that owns the closure. - * * The rest are for debugging and don't affect behaviour: * * CLOSURE_RUNNING: Set when a closure is running (i.e. by @@ -128,22 +125,16 @@ enum closure_state { * continue_at() and closure_return() clear it for you, if you're doing * something unusual you can use closure_set_dead() which also helps * annotate where references are being transferred. 
- * - * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a - * closure with this flag set */ - CLOSURE_BITS_START = (1 << 23), - CLOSURE_DESTRUCTOR = (1 << 23), - CLOSURE_WAITING = (1 << 25), - CLOSURE_SLEEPING = (1 << 27), - CLOSURE_RUNNING = (1 << 29), - CLOSURE_STACK = (1 << 31), + CLOSURE_BITS_START = (1U << 26), + CLOSURE_DESTRUCTOR = (1U << 26), + CLOSURE_WAITING = (1U << 28), + CLOSURE_RUNNING = (1U << 30), }; #define CLOSURE_GUARD_MASK \ - ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING| \ - CLOSURE_RUNNING|CLOSURE_STACK) << 1) + ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) #define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) #define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) @@ -152,7 +143,7 @@ struct closure { union { struct { struct workqueue_struct *wq; - struct task_struct *task; + struct closure_syncer *s; struct llist_node list; closure_fn *fn; }; @@ -178,7 +169,19 @@ void closure_sub(struct closure *cl, int v); void closure_put(struct closure *cl); void __closure_wake_up(struct closure_waitlist *list); bool closure_wait(struct closure_waitlist *list, struct closure *cl); -void closure_sync(struct closure *cl); +void __closure_sync(struct closure *cl); + +/** + * closure_sync - sleep until a closure a closure has nothing left to wait on + * + * Sleeps until the refcount hits 1 - the thread that's running the closure owns + * the last refcount. + */ +static inline void closure_sync(struct closure *cl) +{ + if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) + __closure_sync(cl); +} #ifdef CONFIG_BCACHE_CLOSURES_DEBUG @@ -215,24 +218,6 @@ static inline void closure_set_waiting(struct closure *cl, unsigned long f) #endif } -static inline void __closure_end_sleep(struct closure *cl) -{ - __set_current_state(TASK_RUNNING); - - if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) - atomic_sub(CLOSURE_SLEEPING, &cl->remaining); -} - -static inline void __closure_start_sleep(struct closure *cl) -{ - closure_set_ip(cl); - cl->task = current; - set_current_state(TASK_UNINTERRUPTIBLE); - - if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) - atomic_add(CLOSURE_SLEEPING, &cl->remaining); -} - static inline void closure_set_stopped(struct closure *cl) { atomic_sub(CLOSURE_RUNNING, &cl->remaining); @@ -241,7 +226,6 @@ static inline void closure_set_stopped(struct closure *cl) static inline void set_closure_fn(struct closure *cl, closure_fn *fn, struct workqueue_struct *wq) { - BUG_ON(object_is_on_stack(cl)); closure_set_ip(cl); cl->fn = fn; cl->wq = wq; @@ -300,7 +284,7 @@ static inline void closure_init(struct closure *cl, struct closure *parent) static inline void closure_init_stack(struct closure *cl) { memset(cl, 0, sizeof(struct closure)); - atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK); + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); } /** @@ -322,6 +306,8 @@ static inline void closure_wake_up(struct closure_waitlist *list) * This is because after calling continue_at() you no longer have a ref on @cl, * and whatever @cl owns may be freed out from under you - a running closure fn * has a ref on its own closure which continue_at() drops. + * + * Note you are expected to immediately return after using this macro. 
*/ #define continue_at(_cl, _fn, _wq) \ do { \ diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index c7a02c4900da..af89408befe8 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -116,7 +116,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) return; check->bi_opf = REQ_OP_READ; - if (bio_alloc_pages(check, GFP_NOIO)) + if (bch_bio_alloc_pages(check, GFP_NOIO)) goto out_put; submit_bio_wait(check); @@ -251,8 +251,7 @@ void bch_debug_exit(void) int __init bch_debug_init(struct kobject *kobj) { - int ret = 0; - debug = debugfs_create_dir("bcache", NULL); - return ret; + + return IS_ERR_OR_NULL(debug); } diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index fac97ec2d0e2..a783c5a41ff1 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -51,7 +51,10 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c, /* IO errors */ -void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m) +void bch_count_io_errors(struct cache *ca, + blk_status_t error, + int is_read, + const char *m) { /* * The halflife of an error is: @@ -94,8 +97,9 @@ void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m) errors >>= IO_ERROR_SHIFT; if (errors < ca->set->error_limit) - pr_err("%s: IO error on %s, recovering", - bdevname(ca->bdev, buf), m); + pr_err("%s: IO error on %s%s", + bdevname(ca->bdev, buf), m, + is_read ? ", recovering." : "."); else bch_cache_set_error(ca->set, "%s: too many IO errors %s", @@ -108,6 +112,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, { struct bbio *b = container_of(bio, struct bbio, bio); struct cache *ca = PTR_CACHE(c, &b->key, 0); + int is_read = (bio_data_dir(bio) == READ ? 1 : 0); unsigned threshold = op_is_write(bio_op(bio)) ? 
c->congested_write_threshold_us @@ -129,7 +134,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, atomic_inc(&c->congested); } - bch_count_io_errors(ca, error, m); + bch_count_io_errors(ca, error, is_read, m); } void bch_bbio_endio(struct cache_set *c, struct bio *bio, diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index a87165c1d8e5..1b736b860739 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -368,6 +368,12 @@ err: } /* Journalling */ +#define journal_max_cmp(l, r) \ + (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \ + fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) +#define journal_min_cmp(l, r) \ + (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \ + fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) static void btree_flush_write(struct cache_set *c) { @@ -375,28 +381,41 @@ static void btree_flush_write(struct cache_set *c) * Try to find the btree node with that references the oldest journal * entry, best is our current candidate and is locked if non NULL: */ - struct btree *b, *best; - unsigned i; + struct btree *b; + int i; + + atomic_long_inc(&c->flush_write); + retry: - best = NULL; - - for_each_cached_btree(b, c, i) - if (btree_current_write(b)->journal) { - if (!best) - best = b; - else if (journal_pin_cmp(c, - btree_current_write(best)->journal, - btree_current_write(b)->journal)) { - best = b; + spin_lock(&c->journal.lock); + if (heap_empty(&c->flush_btree)) { + for_each_cached_btree(b, c, i) + if (btree_current_write(b)->journal) { + if (!heap_full(&c->flush_btree)) + heap_add(&c->flush_btree, b, + journal_max_cmp); + else if (journal_max_cmp(b, + heap_peek(&c->flush_btree))) { + c->flush_btree.data[0] = b; + heap_sift(&c->flush_btree, 0, + journal_max_cmp); + } } - } - b = best; + for (i = c->flush_btree.used / 2 - 1; i >= 0; --i) + heap_sift(&c->flush_btree, i, journal_min_cmp); + } + + b = NULL; + heap_pop(&c->flush_btree, b, journal_min_cmp); + spin_unlock(&c->journal.lock); + if (b) { mutex_lock(&b->write_lock); if (!btree_current_write(b)->journal) { mutex_unlock(&b->write_lock); /* We raced */ + atomic_long_inc(&c->retry_flush_write); goto retry; } @@ -476,6 +495,8 @@ static void journal_reclaim(struct cache_set *c) unsigned iter, n = 0; atomic_t p; + atomic_long_inc(&c->reclaim); + while (!atomic_read(&fifo_front(&c->journal.pin))) fifo_pop(&c->journal.pin, p); @@ -819,7 +840,8 @@ int bch_journal_alloc(struct cache_set *c) j->w[0].c = c; j->w[1].c = c; - if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || + if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) || + !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) return -ENOMEM; diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index d50c1c97da68..a24c3a95b2c0 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -162,7 +162,7 @@ static void read_moving(struct cache_set *c) bio_set_op_attrs(bio, REQ_OP_READ, 0); bio->bi_end_io = read_moving_endio; - if (bio_alloc_pages(bio, GFP_KERNEL)) + if (bch_bio_alloc_pages(bio, GFP_KERNEL)) goto err; trace_bcache_gc_copy(&w->key); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 643c3021624f..1a46b41dac70 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -576,6 +576,7 @@ static void cache_lookup(struct closure *cl) 
{ struct search *s = container_of(cl, struct search, iop.cl); struct bio *bio = &s->bio.bio; + struct cached_dev *dc; int ret; bch_btree_op_init(&s->op, -1); @@ -588,6 +589,27 @@ static void cache_lookup(struct closure *cl) return; } + /* + * We might meet err when searching the btree, If that happens, we will + * get negative ret, in this scenario we should not recover data from + * backing device (when cache device is dirty) because we don't know + * whether bkeys the read request covered are all clean. + * + * And after that happened, s->iop.status is still its initial value + * before we submit s->bio.bio + */ + if (ret < 0) { + BUG_ON(ret == -EINTR); + if (s->d && s->d->c && + !UUID_FLASH_ONLY(&s->d->c->uuids[s->d->id])) { + dc = container_of(s->d, struct cached_dev, disk); + if (dc && atomic_read(&dc->has_dirty)) + s->recoverable = false; + } + if (!s->iop.status) + s->iop.status = BLK_STS_IOERR; + } + closure_return(cl); } @@ -611,8 +633,8 @@ static void request_endio(struct bio *bio) static void bio_complete(struct search *s) { if (s->orig_bio) { - struct request_queue *q = s->orig_bio->bi_disk->queue; - generic_end_io_acct(q, bio_data_dir(s->orig_bio), + generic_end_io_acct(s->d->disk->queue, + bio_data_dir(s->orig_bio), &s->d->disk->part0, s->start_time); trace_bcache_request_end(s->d, s->orig_bio); @@ -841,7 +863,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, cache_bio->bi_private = &s->cl; bch_bio_map(cache_bio, NULL); - if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) + if (bch_bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO)) goto out_put; if (reada) @@ -974,6 +996,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, struct cached_dev *dc = container_of(d, struct cached_dev, disk); int rw = bio_data_dir(bio); + atomic_set(&dc->backing_idle, 0); generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0); bio_set_dev(bio, dc->bdev); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index b4d28928dec5..312895788036 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -211,7 +211,7 @@ static void write_bdev_super_endio(struct bio *bio) static void __write_super(struct cache_sb *sb, struct bio *bio) { - struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page); + struct cache_sb *out = page_address(bio_first_page_all(bio)); unsigned i; bio->bi_iter.bi_sector = SB_SECTOR; @@ -274,7 +274,9 @@ static void write_super_endio(struct bio *bio) { struct cache *ca = bio->bi_private; - bch_count_io_errors(ca, bio->bi_status, "writing superblock"); + /* is_read = 0 */ + bch_count_io_errors(ca, bio->bi_status, 0, + "writing superblock"); closure_put(&ca->set->sb_write); } @@ -721,6 +723,9 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, d->c = c; c->devices[id] = d; + if (id >= c->devices_max_used) + c->devices_max_used = id + 1; + closure_get(&c->caching); } @@ -906,6 +911,12 @@ static void cached_dev_detach_finish(struct work_struct *w) mutex_lock(&bch_register_lock); + cancel_delayed_work_sync(&dc->writeback_rate_update); + if (!IS_ERR_OR_NULL(dc->writeback_thread)) { + kthread_stop(dc->writeback_thread); + dc->writeback_thread = NULL; + } + memset(&dc->sb.set_uuid, 0, 16); SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); @@ -946,7 +957,8 @@ void bch_cached_dev_detach(struct cached_dev *dc) cached_dev_put(dc); } -int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) +int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, + uint8_t 
*set_uuid) { uint32_t rtime = cpu_to_le32(get_seconds()); struct uuid_entry *u; @@ -954,7 +966,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) bdevname(dc->bdev, buf); - if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)) + if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) || + (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))) return -ENOENT; if (dc->disk.c) { @@ -1166,7 +1179,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, dc->bdev->bd_holder = dc; bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1); - dc->sb_bio.bi_io_vec[0].bv_page = sb_page; + bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page; get_page(sb_page); if (cached_dev_init(dc, sb->block_size << 9)) @@ -1183,7 +1196,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, list_add(&dc->list, &uncached_devices); list_for_each_entry(c, &bch_cache_sets, list) - bch_cached_dev_attach(dc, c); + bch_cached_dev_attach(dc, c, NULL); if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE || BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) @@ -1261,7 +1274,7 @@ static int flash_devs_run(struct cache_set *c) struct uuid_entry *u; for (u = c->uuids; - u < c->uuids + c->nr_uuids && !ret; + u < c->uuids + c->devices_max_used && !ret; u++) if (UUID_FLASH_ONLY(u)) ret = flash_dev_run(c, u); @@ -1427,7 +1440,7 @@ static void __cache_set_unregister(struct closure *cl) mutex_lock(&bch_register_lock); - for (i = 0; i < c->nr_uuids; i++) + for (i = 0; i < c->devices_max_used; i++) if (c->devices[i]) { if (!UUID_FLASH_ONLY(&c->uuids[i]) && test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { @@ -1490,7 +1503,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->bucket_bits = ilog2(sb->bucket_size); c->block_bits = ilog2(sb->block_size); c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); - + c->devices_max_used = 0; c->btree_pages = bucket_pages(c); if (c->btree_pages > BTREE_MAX_PAGES) c->btree_pages = max_t(int, c->btree_pages / 4, @@ -1542,7 +1555,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->congested_read_threshold_us = 2000; c->congested_write_threshold_us = 20000; - c->error_limit = 8 << IO_ERROR_SHIFT; + c->error_limit = DEFAULT_IO_ERROR_LIMIT; return c; err: @@ -1705,7 +1718,7 @@ static void run_cache_set(struct cache_set *c) bcache_write_super(c); list_for_each_entry_safe(dc, t, &uncached_devices, list) - bch_cached_dev_attach(dc, c); + bch_cached_dev_attach(dc, c, NULL); flash_devs_run(c); @@ -1810,7 +1823,7 @@ void bch_cache_release(struct kobject *kobj) free_fifo(&ca->free[i]); if (ca->sb_bio.bi_inline_vecs[0].bv_page) - put_page(ca->sb_bio.bi_io_vec[0].bv_page); + put_page(bio_first_page_all(&ca->sb_bio)); if (!IS_ERR_OR_NULL(ca->bdev)) blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); @@ -1822,6 +1835,7 @@ void bch_cache_release(struct kobject *kobj) static int cache_alloc(struct cache *ca) { size_t free; + size_t btree_buckets; struct bucket *b; __module_get(THIS_MODULE); @@ -1829,9 +1843,19 @@ static int cache_alloc(struct cache *ca) bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8); + /* + * when ca->sb.njournal_buckets is not zero, journal exists, + * and in bch_journal_replay(), tree node may split, + * so bucket of RESERVE_BTREE type is needed, + * the worst situation is all journal buckets are valid journal, + * and all the keys need to replay, + * so the number of RESERVE_BTREE type buckets should be as much + * as journal buckets + */ + btree_buckets = ca->sb.njournal_buckets ?: 8; free = 
roundup_pow_of_two(ca->sb.nbuckets) >> 10; - if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) || + if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets, GFP_KERNEL) || !init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) || @@ -1864,7 +1888,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, ca->bdev->bd_holder = ca; bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1); - ca->sb_bio.bi_io_vec[0].bv_page = sb_page; + bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page; get_page(sb_page); if (blk_queue_discard(bdev_get_queue(ca->bdev))) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index b4184092c727..78cd7bd50fdd 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -65,6 +65,9 @@ read_attribute(bset_tree_stats); read_attribute(state); read_attribute(cache_read_races); +read_attribute(reclaim); +read_attribute(flush_write); +read_attribute(retry_flush_write); read_attribute(writeback_keys_done); read_attribute(writeback_keys_failed); read_attribute(io_errors); @@ -195,7 +198,7 @@ STORE(__cached_dev) { struct cached_dev *dc = container_of(kobj, struct cached_dev, disk.kobj); - ssize_t v = size; + ssize_t v; struct cache_set *c; struct kobj_uevent_env *env; @@ -215,7 +218,9 @@ STORE(__cached_dev) sysfs_strtoul_clamp(writeback_rate, dc->writeback_rate.rate, 1, INT_MAX); - d_strtoul_nonzero(writeback_rate_update_seconds); + sysfs_strtoul_clamp(writeback_rate_update_seconds, + dc->writeback_rate_update_seconds, + 1, WRITEBACK_RATE_UPDATE_SECS_MAX); d_strtoul(writeback_rate_i_term_inverse); d_strtoul_nonzero(writeback_rate_p_term_inverse); @@ -267,17 +272,20 @@ STORE(__cached_dev) } if (attr == &sysfs_attach) { - if (bch_parse_uuid(buf, dc->sb.set_uuid) < 16) + uint8_t set_uuid[16]; + + if (bch_parse_uuid(buf, set_uuid) < 16) return -EINVAL; + v = -ENOENT; list_for_each_entry(c, &bch_cache_sets, list) { - v = bch_cached_dev_attach(dc, c); + v = bch_cached_dev_attach(dc, c, set_uuid); if (!v) return size; } pr_err("Can't attach %s: cache set not found", buf); - size = v; + return v; } if (attr == &sysfs_detach && dc->disk.c) @@ -545,6 +553,15 @@ SHOW(__bch_cache_set) sysfs_print(cache_read_races, atomic_long_read(&c->cache_read_races)); + sysfs_print(reclaim, + atomic_long_read(&c->reclaim)); + + sysfs_print(flush_write, + atomic_long_read(&c->flush_write)); + + sysfs_print(retry_flush_write, + atomic_long_read(&c->retry_flush_write)); + sysfs_print(writeback_keys_done, atomic_long_read(&c->writeback_keys_done)); sysfs_print(writeback_keys_failed, @@ -556,7 +573,7 @@ SHOW(__bch_cache_set) /* See count_io_errors for why 88 */ sysfs_print(io_error_halflife, c->error_decay * 88); - sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); + sysfs_print(io_error_limit, c->error_limit); sysfs_hprint(congested, ((uint64_t) bch_get_congested(c)) << 9); @@ -656,7 +673,7 @@ STORE(__bch_cache_set) } if (attr == &sysfs_io_error_limit) - c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; + c->error_limit = strtoul_or_return(buf); /* See count_io_errors() for why 88 */ if (attr == &sysfs_io_error_halflife) @@ -731,6 +748,9 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_bset_tree_stats, &sysfs_cache_read_races, + &sysfs_reclaim, + &sysfs_flush_write, + &sysfs_retry_flush_write, &sysfs_writeback_keys_done, &sysfs_writeback_keys_failed, diff --git a/drivers/md/bcache/util.c 
b/drivers/md/bcache/util.c index e548b8b51322..a23cd6a14b74 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -249,6 +249,13 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) : 0; } +/* + * Generally it isn't good to access .bi_io_vec and .bi_vcnt directly, + * the preferred way is bio_add_page, but in this case, bch_bio_map() + * supposes that the bvec table is empty, so it is safe to access + * .bi_vcnt & .bi_io_vec in this way even after multipage bvec is + * supported. + */ void bch_bio_map(struct bio *bio, void *base) { size_t size = bio->bi_iter.bi_size; @@ -276,6 +283,33 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, } } +/** + * bch_bio_alloc_pages - allocates a single page for each bvec in a bio + * @bio: bio to allocate pages for + * @gfp_mask: flags for allocation + * + * Allocates pages up to @bio->bi_vcnt. + * + * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are + * freed. + */ +int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) +{ + int i; + struct bio_vec *bv; + + bio_for_each_segment_all(bv, bio, i) { + bv->bv_page = alloc_page(gfp_mask); + if (!bv->bv_page) { + while (--bv >= bio->bi_io_vec) + __free_page(bv->bv_page); + return -ENOMEM; + } + } + + return 0; +} + /* * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any * use permitted, subject to terms of PostgreSQL license; see.) diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index ed5e8a412eb8..a6763db7f061 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -112,6 +112,8 @@ do { \ #define heap_full(h) ((h)->used == (h)->size) +#define heap_empty(h) ((h)->used == 0) + #define DECLARE_FIFO(type, name) \ struct { \ size_t front, back, size, mask; \ @@ -558,6 +560,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) } void bch_bio_map(struct bio *bio, void *base); +int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask); static inline sector_t bdev_sectors(struct block_device *bdev) { diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 56a37884ca8b..f1d2fc15abcc 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -18,17 +18,39 @@ #include <trace/events/bcache.h> /* Rate limiting */ - -static void __update_writeback_rate(struct cached_dev *dc) +static uint64_t __calc_target_rate(struct cached_dev *dc) { struct cache_set *c = dc->disk.c; + + /* + * This is the size of the cache, minus the amount used for + * flash-only devices + */ uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - bcache_flash_devs_sectors_dirty(c); + + /* + * Unfortunately there is no control of global dirty data. If the + * user states that they want 10% dirty data in the cache, and has, + * e.g., 5 backing volumes of equal size, we try and ensure each + * backing volume uses about 2% of the cache for dirty data. 
+ */ + uint32_t bdev_share = + div64_u64(bdev_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT, + c->cached_dev_sectors); + uint64_t cache_dirty_target = div_u64(cache_sectors * dc->writeback_percent, 100); - int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), - c->cached_dev_sectors); + /* Ensure each backing dev gets at least one dirty share */ + if (bdev_share < 1) + bdev_share = 1; + + return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT; +} + +static void __update_writeback_rate(struct cached_dev *dc) +{ /* * PI controller: * Figures out the amount that should be written per second. @@ -49,6 +71,7 @@ static void __update_writeback_rate(struct cached_dev *dc) * This acts as a slow, long-term average that is not subject to * variations in usage like the p term. */ + int64_t target = __calc_target_rate(dc); int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); int64_t error = dirty - target; int64_t proportional_scaled = @@ -116,6 +139,7 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) struct dirty_io { struct closure cl; struct cached_dev *dc; + uint16_t sequence; struct bio bio; }; @@ -194,6 +218,27 @@ static void write_dirty(struct closure *cl) { struct dirty_io *io = container_of(cl, struct dirty_io, cl); struct keybuf_key *w = io->bio.bi_private; + struct cached_dev *dc = io->dc; + + uint16_t next_sequence; + + if (atomic_read(&dc->writeback_sequence_next) != io->sequence) { + /* Not our turn to write; wait for a write to complete */ + closure_wait(&dc->writeback_ordering_wait, cl); + + if (atomic_read(&dc->writeback_sequence_next) == io->sequence) { + /* + * Edge case-- it happened in indeterminate order + * relative to when we were added to wait list.. + */ + closure_wake_up(&dc->writeback_ordering_wait); + } + + continue_at(cl, write_dirty, io->dc->writeback_write_wq); + return; + } + + next_sequence = io->sequence + 1; /* * IO errors are signalled using the dirty bit on the key. @@ -211,6 +256,9 @@ static void write_dirty(struct closure *cl) closure_bio_submit(&io->bio, cl); } + atomic_set(&dc->writeback_sequence_next, next_sequence); + closure_wake_up(&dc->writeback_ordering_wait); + continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); } @@ -219,8 +267,10 @@ static void read_dirty_endio(struct bio *bio) struct keybuf_key *w = bio->bi_private; struct dirty_io *io = w->private; + /* is_read = 1 */ bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), - bio->bi_status, "reading dirty data from cache"); + bio->bi_status, 1, + "reading dirty data from cache"); dirty_endio(bio); } @@ -237,10 +287,15 @@ static void read_dirty_submit(struct closure *cl) static void read_dirty(struct cached_dev *dc) { unsigned delay = 0; - struct keybuf_key *w; + struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w; + size_t size; + int nk, i; struct dirty_io *io; struct closure cl; + uint16_t sequence = 0; + BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list)); + atomic_set(&dc->writeback_sequence_next, sequence); closure_init_stack(&cl); /* @@ -248,45 +303,109 @@ static void read_dirty(struct cached_dev *dc) * mempools. 
*/ - while (!kthread_should_stop()) { - - w = bch_keybuf_next(&dc->writeback_keys); - if (!w) - break; - - BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); - - if (KEY_START(&w->key) != dc->last_read || - jiffies_to_msecs(delay) > 50) - while (!kthread_should_stop() && delay) - delay = schedule_timeout_interruptible(delay); - - dc->last_read = KEY_OFFSET(&w->key); - - io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) - * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), - GFP_KERNEL); - if (!io) - goto err; - - w->private = io; - io->dc = dc; - - dirty_init(w); - bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); - io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); - bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev); - io->bio.bi_end_io = read_dirty_endio; - - if (bio_alloc_pages(&io->bio, GFP_KERNEL)) - goto err_free; + next = bch_keybuf_next(&dc->writeback_keys); + + while (!kthread_should_stop() && next) { + size = 0; + nk = 0; + + do { + BUG_ON(ptr_stale(dc->disk.c, &next->key, 0)); + + /* + * Don't combine too many operations, even if they + * are all small. + */ + if (nk >= MAX_WRITEBACKS_IN_PASS) + break; + + /* + * If the current operation is very large, don't + * further combine operations. + */ + if (size >= MAX_WRITESIZE_IN_PASS) + break; + + /* + * Operations are only eligible to be combined + * if they are contiguous. + * + * TODO: add a heuristic willing to fire a + * certain amount of non-contiguous IO per pass, + * so that we can benefit from backing device + * command queueing. + */ + if ((nk != 0) && bkey_cmp(&keys[nk-1]->key, + &START_KEY(&next->key))) + break; + + size += KEY_SIZE(&next->key); + keys[nk++] = next; + } while ((next = bch_keybuf_next(&dc->writeback_keys))); + + /* Now we have gathered a set of 1..5 keys to write back. */ + for (i = 0; i < nk; i++) { + w = keys[i]; + + io = kzalloc(sizeof(struct dirty_io) + + sizeof(struct bio_vec) * + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); + if (!io) + goto err; + + w->private = io; + io->dc = dc; + io->sequence = sequence++; + + dirty_init(w); + bio_set_op_attrs(&io->bio, REQ_OP_READ, 0); + io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0); + bio_set_dev(&io->bio, + PTR_CACHE(dc->disk.c, &w->key, 0)->bdev); + io->bio.bi_end_io = read_dirty_endio; + + if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) + goto err_free; + + trace_bcache_writeback(&w->key); + + down(&dc->in_flight); + + /* We've acquired a semaphore for the maximum + * simultaneous number of writebacks; from here + * everything happens asynchronously. + */ + closure_call(&io->cl, read_dirty_submit, NULL, &cl); + } - trace_bcache_writeback(&w->key); + delay = writeback_delay(dc, size); - down(&dc->in_flight); - closure_call(&io->cl, read_dirty_submit, NULL, &cl); + /* If the control system would wait for at least half a + * second, and there's been no reqs hitting the backing disk + * for awhile: use an alternate mode where we have at most + * one contiguous set of writebacks in flight at a time. If + * someone wants to do IO it will be quick, as it will only + * have to contend with one operation in flight, and we'll + * be round-tripping data to the backing disk as quickly as + * it can accept it. + */ + if (delay >= HZ / 2) { + /* 3 means at least 1.5 seconds, up to 7.5 if we + * have slowed way down. + */ + if (atomic_inc_return(&dc->backing_idle) >= 3) { + /* Wait for current I/Os to finish */ + closure_sync(&cl); + /* And immediately launch a new set. 
*/ + delay = 0; + } + } - delay = writeback_delay(dc, KEY_SIZE(&w->key)); + while (!kthread_should_stop() && delay) { + schedule_timeout_interruptible(delay); + delay = writeback_delay(dc, 0); + } } if (0) { @@ -445,18 +564,21 @@ static int bch_writeback_thread(void *arg) while (!kthread_should_stop()) { down_write(&dc->writeback_lock); + set_current_state(TASK_INTERRUPTIBLE); if (!atomic_read(&dc->has_dirty) || (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && !dc->writeback_running)) { up_write(&dc->writeback_lock); - set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) + if (kthread_should_stop()) { + set_current_state(TASK_RUNNING); return 0; + } schedule(); continue; } + set_current_state(TASK_RUNNING); searched_full_index = refill_dirty(dc); @@ -533,7 +655,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_rate.rate = 1024; dc->writeback_rate_minimum = 8; - dc->writeback_rate_update_seconds = 5; + dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; dc->writeback_rate_p_term_inverse = 40; dc->writeback_rate_i_term_inverse = 10000; diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index a9e3ffb4b03c..587b25599856 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -5,6 +5,19 @@ #define CUTOFF_WRITEBACK 40 #define CUTOFF_WRITEBACK_SYNC 70 +#define MAX_WRITEBACKS_IN_PASS 5 +#define MAX_WRITESIZE_IN_PASS 5000 /* *512b */ + +#define WRITEBACK_RATE_UPDATE_SECS_MAX 60 +#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5 + +/* + * 14 (16384ths) is chosen here as something that each backing device + * should be a reasonable fraction of the share, and not to blow up + * until individual backing devices are a petabyte. + */ +#define WRITEBACK_SHARE_SHIFT 14 + static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) { uint64_t i, ret = 0; @@ -21,7 +34,7 @@ static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c) mutex_lock(&bch_register_lock); - for (i = 0; i < c->nr_uuids; i++) { + for (i = 0; i < c->devices_max_used; i++) { struct bcache_device *d = c->devices[i]; if (!d || !UUID_FLASH_ONLY(&c->uuids[i])) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index c546b567f3b5..414c9af54ded 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -662,7 +662,7 @@ static void submit_io(struct dm_buffer *b, int rw, bio_end_io_t *end_io) sector = (b->block << b->c->sectors_per_block_bits) + b->c->start; - if (rw != WRITE) { + if (rw != REQ_OP_WRITE) { n_sectors = 1 << b->c->sectors_per_block_bits; offset = 0; } else { @@ -740,7 +740,7 @@ static void __write_dirty_buffer(struct dm_buffer *b, b->write_end = b->dirty_end; if (!write_list) - submit_io(b, WRITE, write_endio); + submit_io(b, REQ_OP_WRITE, write_endio); else list_add_tail(&b->write_list, write_list); } @@ -753,7 +753,7 @@ static void __flush_write_list(struct list_head *write_list) struct dm_buffer *b = list_entry(write_list->next, struct dm_buffer, write_list); list_del(&b->write_list); - submit_io(b, WRITE, write_endio); + submit_io(b, REQ_OP_WRITE, write_endio); cond_resched(); } blk_finish_plug(&plug); @@ -1123,7 +1123,7 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, return NULL; if (need_submit) - submit_io(b, READ, read_endio); + submit_io(b, REQ_OP_READ, read_endio); wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE); @@ -1193,7 +1193,7 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, dm_bufio_unlock(c); if (need_submit) - submit_io(b, 
READ, read_endio); + submit_io(b, REQ_OP_READ, read_endio); dm_bufio_release(b); cond_resched(); @@ -1454,7 +1454,7 @@ retry: old_block = b->block; __unlink_buffer(b); __link_buffer(b, new_block, b->list_mode); - submit_io(b, WRITE, write_endio); + submit_io(b, REQ_OP_WRITE, write_endio); wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE); __unlink_buffer(b); @@ -1716,7 +1716,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign if (!DM_BUFIO_CACHE_NAME(c)) { r = -ENOMEM; mutex_unlock(&dm_bufio_clients_lock); - goto bad_cache; + goto bad; } } @@ -1727,7 +1727,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign if (!DM_BUFIO_CACHE(c)) { r = -ENOMEM; mutex_unlock(&dm_bufio_clients_lock); - goto bad_cache; + goto bad; } } } @@ -1738,27 +1738,28 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign if (!b) { r = -ENOMEM; - goto bad_buffer; + goto bad; } __free_buffer_wake(b); } + c->shrinker.count_objects = dm_bufio_shrink_count; + c->shrinker.scan_objects = dm_bufio_shrink_scan; + c->shrinker.seeks = 1; + c->shrinker.batch = 0; + r = register_shrinker(&c->shrinker); + if (r) + goto bad; + mutex_lock(&dm_bufio_clients_lock); dm_bufio_client_count++; list_add(&c->client_list, &dm_bufio_all_clients); __cache_size_refresh(); mutex_unlock(&dm_bufio_clients_lock); - c->shrinker.count_objects = dm_bufio_shrink_count; - c->shrinker.scan_objects = dm_bufio_shrink_scan; - c->shrinker.seeks = 1; - c->shrinker.batch = 0; - register_shrinker(&c->shrinker); - return c; -bad_buffer: -bad_cache: +bad: while (!list_empty(&c->reserved_buffers)) { struct dm_buffer *b = list_entry(c->reserved_buffers.next, struct dm_buffer, lru_list); @@ -1767,6 +1768,7 @@ bad_cache: } dm_io_client_destroy(c->dm_io); bad_dm_io: + mutex_destroy(&c->lock); kfree(c); bad_client: return ERR_PTR(r); @@ -1811,6 +1813,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) BUG_ON(c->n_buffers[i]); dm_io_client_destroy(c->dm_io); + mutex_destroy(&c->lock); kfree(c); } EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 6a14f945783c..3222e21cbbf8 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -91,8 +91,7 @@ struct mapped_device { /* * io objects are allocated from here. 
*/ - mempool_t *io_pool; - + struct bio_set *io_bs; struct bio_set *bs; /* @@ -130,8 +129,6 @@ struct mapped_device { struct srcu_struct io_barrier; }; -void dm_init_md_queue(struct mapped_device *md); -void dm_init_normal_md_queue(struct mapped_device *md); int md_in_flight(struct mapped_device *md); void disable_write_same(struct mapped_device *md); void disable_write_zeroes(struct mapped_device *md); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 9fc12f556534..8168f737590e 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1446,7 +1446,6 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) bio_for_each_segment_all(bv, clone, i) { BUG_ON(!bv->bv_page); mempool_free(bv->bv_page, cc->page_pool); - bv->bv_page = NULL; } } @@ -1954,10 +1953,15 @@ static int crypt_setkey(struct crypt_config *cc) /* Ignore extra keys (which are used for IV etc) */ subkey_size = crypt_subkey_size(cc); - if (crypt_integrity_hmac(cc)) + if (crypt_integrity_hmac(cc)) { + if (subkey_size < cc->key_mac_size) + return -EINVAL; + crypt_copy_authenckey(cc->authenc_key, cc->key, subkey_size - cc->key_mac_size, cc->key_mac_size); + } + for (i = 0; i < cc->tfms_count; i++) { if (crypt_integrity_hmac(cc)) r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i], @@ -2053,9 +2057,6 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string ret = crypt_setkey(cc); - /* wipe the kernel key payload copy in each case */ - memset(cc->key, 0, cc->key_size * sizeof(u8)); - if (!ret) { set_bit(DM_CRYPT_KEY_VALID, &cc->flags); kzfree(cc->key_string); @@ -2192,6 +2193,8 @@ static void crypt_dtr(struct dm_target *ti) kzfree(cc->cipher_auth); kzfree(cc->authenc_key); + mutex_destroy(&cc->bio_alloc_lock); + /* Must zero key material before freeing */ kzfree(cc); } @@ -2523,6 +2526,10 @@ static int crypt_ctr_cipher(struct dm_target *ti, char *cipher_in, char *key) } } + /* wipe the kernel key payload copy */ + if (cc->key_string) + memset(cc->key, 0, cc->key_size * sizeof(u8)); + return ret; } @@ -2697,8 +2704,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - cc->bs = bioset_create(MIN_IOS, 0, (BIOSET_NEED_BVECS | - BIOSET_NEED_RESCUER)); + cc->bs = bioset_create(MIN_IOS, 0, BIOSET_NEED_BVECS); if (!cc->bs) { ti->error = "Cannot allocate crypt bioset"; goto bad; @@ -2740,6 +2746,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->tag_pool_max_sectors * cc->on_disk_tag_size); if (!cc->tag_pool) { ti->error = "Cannot allocate integrity tags mempool"; + ret = -ENOMEM; goto bad; } @@ -2961,6 +2968,9 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) return ret; if (cc->iv_gen_ops && cc->iv_gen_ops->init) ret = cc->iv_gen_ops->init(cc); + /* wipe the kernel key payload copy */ + if (cc->key_string) + memset(cc->key, 0, cc->key_size * sizeof(u8)); return ret; } if (argc == 2 && !strcasecmp(argv[1], "wipe")) { @@ -3007,7 +3017,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type crypt_target = { .name = "crypt", - .version = {1, 18, 0}, + .version = {1, 18, 1}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 288386bfbfb5..1783d80c9cad 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -229,6 +229,8 @@ static void delay_dtr(struct dm_target *ti) if (dc->dev_write) dm_put_device(ti, dc->dev_write); + 
mutex_destroy(&dc->timer_lock); + kfree(dc); } diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index b82cb1ab1eaa..1b907b15f5c3 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -70,6 +70,11 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, arg_name = dm_shift_arg(as); argc--; + if (!arg_name) { + ti->error = "Insufficient feature arguments"; + return -EINVAL; + } + /* * drop_writes */ diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 05c7bfd0c9d9..46d7c8749222 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2559,7 +2559,8 @@ static int create_journal(struct dm_integrity_c *ic, char **error) int r = 0; unsigned i; __u64 journal_pages, journal_desc_size, journal_tree_size; - unsigned char *crypt_data = NULL; + unsigned char *crypt_data = NULL, *crypt_iv = NULL; + struct skcipher_request *req = NULL; ic->commit_ids[0] = cpu_to_le64(0x1111111111111111ULL); ic->commit_ids[1] = cpu_to_le64(0x2222222222222222ULL); @@ -2617,9 +2618,20 @@ static int create_journal(struct dm_integrity_c *ic, char **error) if (blocksize == 1) { struct scatterlist *sg; - SKCIPHER_REQUEST_ON_STACK(req, ic->journal_crypt); - unsigned char iv[ivsize]; - skcipher_request_set_tfm(req, ic->journal_crypt); + + req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL); + if (!req) { + *error = "Could not allocate crypt request"; + r = -ENOMEM; + goto bad; + } + + crypt_iv = kmalloc(ivsize, GFP_KERNEL); + if (!crypt_iv) { + *error = "Could not allocate iv"; + r = -ENOMEM; + goto bad; + } ic->journal_xor = dm_integrity_alloc_page_list(ic); if (!ic->journal_xor) { @@ -2641,9 +2653,9 @@ static int create_journal(struct dm_integrity_c *ic, char **error) sg_set_buf(&sg[i], va, PAGE_SIZE); } sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids); - memset(iv, 0x00, ivsize); + memset(crypt_iv, 0x00, ivsize); - skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, iv); + skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv); init_completion(&comp.comp); comp.in_flight = (atomic_t)ATOMIC_INIT(1); if (do_crypt(true, req, &comp)) @@ -2659,10 +2671,22 @@ static int create_journal(struct dm_integrity_c *ic, char **error) crypto_free_skcipher(ic->journal_crypt); ic->journal_crypt = NULL; } else { - SKCIPHER_REQUEST_ON_STACK(req, ic->journal_crypt); - unsigned char iv[ivsize]; unsigned crypt_len = roundup(ivsize, blocksize); + req = skcipher_request_alloc(ic->journal_crypt, GFP_KERNEL); + if (!req) { + *error = "Could not allocate crypt request"; + r = -ENOMEM; + goto bad; + } + + crypt_iv = kmalloc(ivsize, GFP_KERNEL); + if (!crypt_iv) { + *error = "Could not allocate iv"; + r = -ENOMEM; + goto bad; + } + crypt_data = kmalloc(crypt_len, GFP_KERNEL); if (!crypt_data) { *error = "Unable to allocate crypt data"; @@ -2670,8 +2694,6 @@ static int create_journal(struct dm_integrity_c *ic, char **error) goto bad; } - skcipher_request_set_tfm(req, ic->journal_crypt); - ic->journal_scatterlist = dm_integrity_alloc_journal_scatterlist(ic, ic->journal); if (!ic->journal_scatterlist) { *error = "Unable to allocate sg list"; @@ -2695,12 +2717,12 @@ static int create_journal(struct dm_integrity_c *ic, char **error) struct skcipher_request *section_req; __u32 section_le = cpu_to_le32(i); - memset(iv, 0x00, ivsize); + memset(crypt_iv, 0x00, ivsize); memset(crypt_data, 0x00, crypt_len); memcpy(crypt_data, §ion_le, min((size_t)crypt_len, sizeof(section_le))); 
sg_init_one(&sg, crypt_data, crypt_len); - skcipher_request_set_crypt(req, &sg, &sg, crypt_len, iv); + skcipher_request_set_crypt(req, &sg, &sg, crypt_len, crypt_iv); init_completion(&comp.comp); comp.in_flight = (atomic_t)ATOMIC_INIT(1); if (do_crypt(true, req, &comp)) @@ -2758,6 +2780,9 @@ retest_commit_id: } bad: kfree(crypt_data); + kfree(crypt_iv); + skcipher_request_free(req); + return r; } diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index b4357ed4d541..a8d914d5abbe 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -58,8 +58,7 @@ struct dm_io_client *dm_io_client_create(void) if (!client->pool) goto bad; - client->bios = bioset_create(min_ios, 0, (BIOSET_NEED_BVECS | - BIOSET_NEED_RESCUER)); + client->bios = bioset_create(min_ios, 0, BIOSET_NEED_BVECS); if (!client->bios) goto bad; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index e52676fa9832..a89fd8f44453 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1929,15 +1929,15 @@ static int dm_release(struct inode *inode, struct file *filp) return 0; } -static unsigned dm_poll(struct file *filp, poll_table *wait) +static __poll_t dm_poll(struct file *filp, poll_table *wait) { struct dm_file *priv = filp->private_data; - unsigned mask = 0; + __poll_t mask = 0; poll_wait(filp, &dm_global_eventq, wait); if ((int)(atomic_read(&dm_global_event_nr) - priv->global_event_nr) > 0) - mask |= POLLIN; + mask |= EPOLLIN; return mask; } diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index eb45cc3df31d..e6e7c686646d 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -477,8 +477,10 @@ static int run_complete_job(struct kcopyd_job *job) * If this is the master job, the sub jobs have already * completed so we can free everything. */ - if (job->master_job == job) + if (job->master_job == job) { + mutex_destroy(&job->lock); mempool_free(job, kc->job_pool); + } fn(read_err, write_err, context); if (atomic_dec_and_test(&kc->nr_jobs)) @@ -750,6 +752,7 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, * followed by SPLIT_COUNT sub jobs. */ job = mempool_alloc(kc->job_pool, GFP_NOIO); + mutex_init(&job->lock); /* * set up for the read. 
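Editor's note: the dm-integrity create_journal() changes above replace SKCIPHER_REQUEST_ON_STACK and the variable-length on-stack IV with heap allocations. The following is a minimal sketch of that allocation pattern, assuming a one-shot synchronous encrypt; the function name and flow are illustrative, only the crypto API calls themselves are real.

#include <crypto/skcipher.h>
#include <linux/crypto.h>
#include <linux/slab.h>

/* Sketch: heap-allocated skcipher request and IV instead of on-stack VLAs. */
static int encrypt_one(struct crypto_skcipher *tfm, struct scatterlist *sg,
		       unsigned int len)
{
	DECLARE_CRYPTO_WAIT(wait);
	unsigned int ivsize = crypto_skcipher_ivsize(tfm);
	struct skcipher_request *req;
	u8 *iv;
	int r;

	req = skcipher_request_alloc(tfm, GFP_KERNEL);
	iv = kzalloc(ivsize, GFP_KERNEL);
	if (!req || !iv) {
		r = -ENOMEM;
		goto out;
	}

	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
				      crypto_req_done, &wait);
	skcipher_request_set_crypt(req, sg, sg, len, iv);
	r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
out:
	kfree(iv);
	skcipher_request_free(req);
	return r;
}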
@@ -811,7 +814,6 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from, if (job->source.count <= SUB_JOB_SIZE) dispatch_job(job); else { - mutex_init(&job->lock); job->progress = 0; split_job(job); } diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 189badbeddaf..3362d866793b 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -594,7 +594,7 @@ static int log_mark(struct log_writes_c *lc, char *data) return -ENOMEM; } - block->data = kstrndup(data, maxsize, GFP_KERNEL); + block->data = kstrndup(data, maxsize - 1, GFP_KERNEL); if (!block->data) { DMERR("Error copying mark data"); kfree(block); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index f7810cc869ac..7d3e572072f5 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -64,36 +64,30 @@ struct priority_group { /* Multipath context */ struct multipath { - struct list_head list; - struct dm_target *ti; - - const char *hw_handler_name; - char *hw_handler_params; + unsigned long flags; /* Multipath state flags */ spinlock_t lock; - - unsigned nr_priority_groups; - struct list_head priority_groups; - - wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ + enum dm_queue_mode queue_mode; struct pgpath *current_pgpath; struct priority_group *current_pg; struct priority_group *next_pg; /* Switch to this PG if set */ - unsigned long flags; /* Multipath state flags */ + atomic_t nr_valid_paths; /* Total number of usable paths */ + unsigned nr_priority_groups; + struct list_head priority_groups; + const char *hw_handler_name; + char *hw_handler_params; + wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ unsigned pg_init_retries; /* Number of times to retry pg_init */ unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */ - - atomic_t nr_valid_paths; /* Total number of usable paths */ atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ atomic_t pg_init_count; /* Number of times pg_init called */ - enum dm_queue_mode queue_mode; - struct mutex work_mutex; struct work_struct trigger_event; + struct dm_target *ti; struct work_struct process_queued_bios; struct bio_list queued_bios; @@ -135,10 +129,10 @@ static struct pgpath *alloc_pgpath(void) { struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL); - if (pgpath) { - pgpath->is_active = true; - INIT_DELAYED_WORK(&pgpath->activate_path, activate_path_work); - } + if (!pgpath) + return NULL; + + pgpath->is_active = true; return pgpath; } @@ -193,13 +187,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti) if (m) { INIT_LIST_HEAD(&m->priority_groups); spin_lock_init(&m->lock); - set_bit(MPATHF_QUEUE_IO, &m->flags); atomic_set(&m->nr_valid_paths, 0); - atomic_set(&m->pg_init_in_progress, 0); - atomic_set(&m->pg_init_count, 0); - m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; INIT_WORK(&m->trigger_event, trigger_event); - init_waitqueue_head(&m->pg_init_wait); mutex_init(&m->work_mutex); m->queue_mode = DM_TYPE_NONE; @@ -221,13 +210,26 @@ static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) m->queue_mode = DM_TYPE_MQ_REQUEST_BASED; else m->queue_mode = DM_TYPE_REQUEST_BASED; - } else if (m->queue_mode == DM_TYPE_BIO_BASED) { + + } else if (m->queue_mode == DM_TYPE_BIO_BASED || + m->queue_mode == DM_TYPE_NVME_BIO_BASED) { INIT_WORK(&m->process_queued_bios, process_queued_bios); - /* - * bio-based doesn't support any direct scsi_dh management; - * it just discovers if a scsi_dh is attached. 
- */ - set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); + + if (m->queue_mode == DM_TYPE_BIO_BASED) { + /* + * bio-based doesn't support any direct scsi_dh management; + * it just discovers if a scsi_dh is attached. + */ + set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); + } + } + + if (m->queue_mode != DM_TYPE_NVME_BIO_BASED) { + set_bit(MPATHF_QUEUE_IO, &m->flags); + atomic_set(&m->pg_init_in_progress, 0); + atomic_set(&m->pg_init_count, 0); + m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; + init_waitqueue_head(&m->pg_init_wait); } dm_table_set_type(ti->table, m->queue_mode); @@ -246,6 +248,7 @@ static void free_multipath(struct multipath *m) kfree(m->hw_handler_name); kfree(m->hw_handler_params); + mutex_destroy(&m->work_mutex); kfree(m); } @@ -264,29 +267,23 @@ static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio) return dm_per_bio_data(bio, multipath_per_bio_data_size()); } -static struct dm_bio_details *get_bio_details_from_bio(struct bio *bio) +static struct dm_bio_details *get_bio_details_from_mpio(struct dm_mpath_io *mpio) { /* dm_bio_details is immediately after the dm_mpath_io in bio's per-bio-data */ - struct dm_mpath_io *mpio = get_mpio_from_bio(bio); void *bio_details = mpio + 1; - return bio_details; } -static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p, - struct dm_bio_details **bio_details_p) +static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p) { struct dm_mpath_io *mpio = get_mpio_from_bio(bio); - struct dm_bio_details *bio_details = get_bio_details_from_bio(bio); + struct dm_bio_details *bio_details = get_bio_details_from_mpio(mpio); - memset(mpio, 0, sizeof(*mpio)); - memset(bio_details, 0, sizeof(*bio_details)); - dm_bio_record(bio_details, bio); + mpio->nr_bytes = bio->bi_iter.bi_size; + mpio->pgpath = NULL; + *mpio_p = mpio; - if (mpio_p) - *mpio_p = mpio; - if (bio_details_p) - *bio_details_p = bio_details; + dm_bio_record(bio_details, bio); } /*----------------------------------------------- @@ -340,6 +337,9 @@ static void __switch_pg(struct multipath *m, struct priority_group *pg) { m->current_pg = pg; + if (m->queue_mode == DM_TYPE_NVME_BIO_BASED) + return; + /* Must we initialise the PG first, and queue I/O till it's ready? 
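The get_bio_details_from_mpio() helper above relies on the per-bio data being laid out as a struct dm_mpath_io immediately followed by a struct dm_bio_details, so the details sit at (mpio + 1). A stand-alone sketch of that layout with hypothetical struct names (the real buffer is handed out by DM core, not malloc):

#include <stdio.h>
#include <stdlib.h>

struct mpio_ex    { unsigned nr_bytes; void *pgpath; };
struct details_ex { void *bdev; unsigned long sector; };

int main(void)
{
	/* one per-bio allocation holding both structures back to back */
	void *per_bio = malloc(sizeof(struct mpio_ex) + sizeof(struct details_ex));
	struct mpio_ex *mpio;
	struct details_ex *details;

	if (!per_bio)
		return 1;
	mpio = per_bio;
	details = (struct details_ex *)(mpio + 1);

	printf("mpio at %p, details at %p (offset %zu)\n",
	       (void *)mpio, (void *)details, sizeof(struct mpio_ex));
	free(per_bio);
	return 0;
}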
*/ if (m->hw_handler_name) { set_bit(MPATHF_PG_INIT_REQUIRED, &m->flags); @@ -385,7 +385,8 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) unsigned bypassed = 1; if (!atomic_read(&m->nr_valid_paths)) { - clear_bit(MPATHF_QUEUE_IO, &m->flags); + if (m->queue_mode != DM_TYPE_NVME_BIO_BASED) + clear_bit(MPATHF_QUEUE_IO, &m->flags); goto failed; } @@ -516,12 +517,10 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, return DM_MAPIO_KILL; } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { - if (pg_init_all_paths(m)) - return DM_MAPIO_DELAY_REQUEUE; - return DM_MAPIO_REQUEUE; + pg_init_all_paths(m); + return DM_MAPIO_DELAY_REQUEUE; } - memset(mpio, 0, sizeof(*mpio)); mpio->pgpath = pgpath; mpio->nr_bytes = nr_bytes; @@ -530,12 +529,23 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, GFP_ATOMIC); if (IS_ERR(clone)) { /* EBUSY, ENODEV or EWOULDBLOCK: requeue */ - bool queue_dying = blk_queue_dying(q); - if (queue_dying) { + if (blk_queue_dying(q)) { atomic_inc(&m->pg_init_in_progress); activate_or_offline_path(pgpath); + return DM_MAPIO_DELAY_REQUEUE; } - return DM_MAPIO_DELAY_REQUEUE; + + /* + * blk-mq's SCHED_RESTART can cover this requeue, so we + * needn't deal with it by DELAY_REQUEUE. More importantly, + * we have to return DM_MAPIO_REQUEUE so that blk-mq can + * get the queue busy feedback (via BLK_STS_RESOURCE), + * otherwise I/O merging can suffer. + */ + if (q->mq_ops) + return DM_MAPIO_REQUEUE; + else + return DM_MAPIO_DELAY_REQUEUE; } clone->bio = clone->biotail = NULL; clone->rq_disk = bdev->bd_disk; @@ -557,9 +567,9 @@ static void multipath_release_clone(struct request *clone) /* * Map cloned bios (bio-based multipath) */ -static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_mpath_io *mpio) + +static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) { - size_t nr_bytes = bio->bi_iter.bi_size; struct pgpath *pgpath; unsigned long flags; bool queue_io; @@ -568,7 +578,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m pgpath = READ_ONCE(m->current_pgpath); queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); if (!pgpath || !queue_io) - pgpath = choose_pgpath(m, nr_bytes); + pgpath = choose_pgpath(m, bio->bi_iter.bi_size); if ((pgpath && queue_io) || (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) { @@ -576,14 +586,62 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m spin_lock_irqsave(&m->lock, flags); bio_list_add(&m->queued_bios, bio); spin_unlock_irqrestore(&m->lock, flags); + /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */ if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) pg_init_all_paths(m); else if (!queue_io) queue_work(kmultipathd, &m->process_queued_bios); - return DM_MAPIO_SUBMITTED; + + return ERR_PTR(-EAGAIN); + } + + return pgpath; +} + +static struct pgpath *__map_bio_nvme(struct multipath *m, struct bio *bio) +{ + struct pgpath *pgpath; + unsigned long flags; + + /* Do we need to select a new pgpath? 
*/ + /* + * FIXME: currently only switching path if no path (due to failure, etc) + * - which negates the point of using a path selector + */ + pgpath = READ_ONCE(m->current_pgpath); + if (!pgpath) + pgpath = choose_pgpath(m, bio->bi_iter.bi_size); + + if (!pgpath) { + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + /* Queue for the daemon to resubmit */ + spin_lock_irqsave(&m->lock, flags); + bio_list_add(&m->queued_bios, bio); + spin_unlock_irqrestore(&m->lock, flags); + queue_work(kmultipathd, &m->process_queued_bios); + + return ERR_PTR(-EAGAIN); + } + return NULL; } + return pgpath; +} + +static int __multipath_map_bio(struct multipath *m, struct bio *bio, + struct dm_mpath_io *mpio) +{ + struct pgpath *pgpath; + + if (m->queue_mode == DM_TYPE_NVME_BIO_BASED) + pgpath = __map_bio_nvme(m, bio); + else + pgpath = __map_bio(m, bio); + + if (IS_ERR(pgpath)) + return DM_MAPIO_SUBMITTED; + if (!pgpath) { if (must_push_back_bio(m)) return DM_MAPIO_REQUEUE; @@ -592,7 +650,6 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m } mpio->pgpath = pgpath; - mpio->nr_bytes = nr_bytes; bio->bi_status = 0; bio_set_dev(bio, pgpath->path.dev->bdev); @@ -601,7 +658,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m if (pgpath->pg->ps.type->start_io) pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path, - nr_bytes); + mpio->nr_bytes); return DM_MAPIO_REMAPPED; } @@ -610,8 +667,7 @@ static int multipath_map_bio(struct dm_target *ti, struct bio *bio) struct multipath *m = ti->private; struct dm_mpath_io *mpio = NULL; - multipath_init_per_bio_data(bio, &mpio, NULL); - + multipath_init_per_bio_data(bio, &mpio); return __multipath_map_bio(m, bio, mpio); } @@ -619,7 +675,8 @@ static void process_queued_io_list(struct multipath *m) { if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED) dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table)); - else if (m->queue_mode == DM_TYPE_BIO_BASED) + else if (m->queue_mode == DM_TYPE_BIO_BASED || + m->queue_mode == DM_TYPE_NVME_BIO_BASED) queue_work(kmultipathd, &m->process_queued_bios); } @@ -649,7 +706,9 @@ static void process_queued_bios(struct work_struct *work) blk_start_plug(&plug); while ((bio = bio_list_pop(&bios))) { - r = __multipath_map_bio(m, bio, get_mpio_from_bio(bio)); + struct dm_mpath_io *mpio = get_mpio_from_bio(bio); + dm_bio_restore(get_bio_details_from_mpio(mpio), bio); + r = __multipath_map_bio(m, bio, mpio); switch (r) { case DM_MAPIO_KILL: bio->bi_status = BLK_STS_IOERR; @@ -752,34 +811,11 @@ static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg, return 0; } -static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps, - struct dm_target *ti) +static int setup_scsi_dh(struct block_device *bdev, struct multipath *m, char **error) { - int r; - struct pgpath *p; - struct multipath *m = ti->private; - struct request_queue *q = NULL; + struct request_queue *q = bdev_get_queue(bdev); const char *attached_handler_name; - - /* we need at least a path arg */ - if (as->argc < 1) { - ti->error = "no device given"; - return ERR_PTR(-EINVAL); - } - - p = alloc_pgpath(); - if (!p) - return ERR_PTR(-ENOMEM); - - r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), - &p->path.dev); - if (r) { - ti->error = "error getting device"; - goto bad; - } - - if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) || m->hw_handler_name) - q = bdev_get_queue(p->path.dev->bdev); + int r; if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, 
&m->flags)) { retain: @@ -811,26 +847,59 @@ retain: char b[BDEVNAME_SIZE]; printk(KERN_INFO "dm-mpath: retaining handler on device %s\n", - bdevname(p->path.dev->bdev, b)); + bdevname(bdev, b)); goto retain; } if (r < 0) { - ti->error = "error attaching hardware handler"; - dm_put_device(ti, p->path.dev); - goto bad; + *error = "error attaching hardware handler"; + return r; } if (m->hw_handler_params) { r = scsi_dh_set_params(q, m->hw_handler_params); if (r < 0) { - ti->error = "unable to set hardware " - "handler parameters"; - dm_put_device(ti, p->path.dev); - goto bad; + *error = "unable to set hardware handler parameters"; + return r; } } } + return 0; +} + +static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps, + struct dm_target *ti) +{ + int r; + struct pgpath *p; + struct multipath *m = ti->private; + + /* we need at least a path arg */ + if (as->argc < 1) { + ti->error = "no device given"; + return ERR_PTR(-EINVAL); + } + + p = alloc_pgpath(); + if (!p) + return ERR_PTR(-ENOMEM); + + r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), + &p->path.dev); + if (r) { + ti->error = "error getting device"; + goto bad; + } + + if (m->queue_mode != DM_TYPE_NVME_BIO_BASED) { + INIT_DELAYED_WORK(&p->activate_path, activate_path_work); + r = setup_scsi_dh(p->path.dev->bdev, m, &ti->error); + if (r) { + dm_put_device(ti, p->path.dev); + goto bad; + } + } + r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error); if (r) { dm_put_device(ti, p->path.dev); @@ -838,7 +907,6 @@ retain: } return p; - bad: free_pgpath(p); return ERR_PTR(r); @@ -933,7 +1001,8 @@ static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m) if (!hw_argc) return 0; - if (m->queue_mode == DM_TYPE_BIO_BASED) { + if (m->queue_mode == DM_TYPE_BIO_BASED || + m->queue_mode == DM_TYPE_NVME_BIO_BASED) { dm_consume_args(as, hw_argc); DMERR("bio-based multipath doesn't allow hardware handler args"); return 0; @@ -1022,6 +1091,8 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) if (!strcasecmp(queue_mode_name, "bio")) m->queue_mode = DM_TYPE_BIO_BASED; + else if (!strcasecmp(queue_mode_name, "nvme")) + m->queue_mode = DM_TYPE_NVME_BIO_BASED; else if (!strcasecmp(queue_mode_name, "rq")) m->queue_mode = DM_TYPE_REQUEST_BASED; else if (!strcasecmp(queue_mode_name, "mq")) @@ -1122,7 +1193,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->num_discard_bios = 1; ti->num_write_same_bios = 1; ti->num_write_zeroes_bios = 1; - if (m->queue_mode == DM_TYPE_BIO_BASED) + if (m->queue_mode == DM_TYPE_BIO_BASED || m->queue_mode == DM_TYPE_NVME_BIO_BASED) ti->per_io_data_size = multipath_per_bio_data_size(); else ti->per_io_data_size = sizeof(struct dm_mpath_io); @@ -1151,16 +1222,19 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m) static void flush_multipath_work(struct multipath *m) { - set_bit(MPATHF_PG_INIT_DISABLED, &m->flags); - smp_mb__after_atomic(); + if (m->hw_handler_name) { + set_bit(MPATHF_PG_INIT_DISABLED, &m->flags); + smp_mb__after_atomic(); + + flush_workqueue(kmpath_handlerd); + multipath_wait_for_pg_init_completion(m); + + clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags); + smp_mb__after_atomic(); + } - flush_workqueue(kmpath_handlerd); - multipath_wait_for_pg_init_completion(m); flush_workqueue(kmultipathd); flush_work(&m->trigger_event); - - clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags); - smp_mb__after_atomic(); } static void multipath_dtr(struct dm_target *ti) @@ -1475,21 +1549,6 
@@ static void activate_path_work(struct work_struct *work) activate_or_offline_path(pgpath); } -static int noretry_error(blk_status_t error) -{ - switch (error) { - case BLK_STS_NOTSUPP: - case BLK_STS_NOSPC: - case BLK_STS_TARGET: - case BLK_STS_NEXUS: - case BLK_STS_MEDIUM: - return 1; - } - - /* Anything else could be a path failure, so should be retried */ - return 0; -} - static int multipath_end_io(struct dm_target *ti, struct request *clone, blk_status_t error, union map_info *map_context) { @@ -1508,10 +1567,13 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, * request into dm core, which will remake a clone request and * clone bios for it and resubmit it later. */ - if (error && !noretry_error(error)) { + if (error && blk_path_error(error)) { struct multipath *m = ti->private; - r = DM_ENDIO_REQUEUE; + if (error == BLK_STS_RESOURCE) + r = DM_ENDIO_DELAY_REQUEUE; + else + r = DM_ENDIO_REQUEUE; if (pgpath) fail_path(pgpath); @@ -1536,7 +1598,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, } static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, - blk_status_t *error) + blk_status_t *error) { struct multipath *m = ti->private; struct dm_mpath_io *mpio = get_mpio_from_bio(clone); @@ -1544,7 +1606,7 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, unsigned long flags; int r = DM_ENDIO_DONE; - if (!*error || noretry_error(*error)) + if (!*error || !blk_path_error(*error)) goto done; if (pgpath) @@ -1561,9 +1623,6 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, goto done; } - /* Queue for the daemon to resubmit */ - dm_bio_restore(get_bio_details_from_bio(clone), clone); - spin_lock_irqsave(&m->lock, flags); bio_list_add(&m->queued_bios, clone); spin_unlock_irqrestore(&m->lock, flags); @@ -1671,6 +1730,9 @@ static void multipath_status(struct dm_target *ti, status_type_t type, case DM_TYPE_BIO_BASED: DMEMIT("queue_mode bio "); break; + case DM_TYPE_NVME_BIO_BASED: + DMEMIT("queue_mode nvme "); + break; case DM_TYPE_MQ_REQUEST_BASED: DMEMIT("queue_mode mq "); break; diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c index 23f178641794..969c4f1a3633 100644 --- a/drivers/md/dm-queue-length.c +++ b/drivers/md/dm-queue-length.c @@ -195,9 +195,6 @@ static struct dm_path *ql_select_path(struct path_selector *ps, size_t nr_bytes) if (list_empty(&s->valid_paths)) goto out; - /* Change preferred (first in list) path to evenly balance. */ - list_move_tail(s->valid_paths.next, &s->valid_paths); - list_for_each_entry(pi, &s->valid_paths, list) { if (!best || (atomic_read(&pi->qlen) < atomic_read(&best->qlen))) @@ -210,6 +207,9 @@ static struct dm_path *ql_select_path(struct path_selector *ps, size_t nr_bytes) if (!best) goto out; + /* Move most recently used to least preferred to evenly balance. 
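The ql_select_path() change above keeps the scan over all valid paths but now rotates the path that was actually chosen to the tail, so paths with equal queue lengths are used round-robin instead of always favouring the list head. A small stand-alone illustration of that policy, array-based with made-up path names rather than the driver's list_head handling:

#include <stdio.h>

#define NPATHS 3

struct path_ex { const char *name; int qlen; };

int main(void)
{
	struct path_ex paths[NPATHS] = { {"sda", 2}, {"sdb", 0}, {"sdc", 0} };

	for (int round = 0; round < 4; round++) {
		int best = 0;

		for (int i = 1; i < NPATHS; i++)
			if (paths[i].qlen < paths[best].qlen)
				best = i;

		printf("round %d -> %s\n", round, paths[best].name);

		/* move the chosen path to the tail, as ql_select_path() now does */
		struct path_ex chosen = paths[best];
		for (int i = best; i < NPATHS - 1; i++)
			paths[i] = paths[i + 1];
		paths[NPATHS - 1] = chosen;
	}
	return 0;
}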
*/ + list_move_tail(&best->list, &s->valid_paths); + ret = best->path; out: spin_unlock_irqrestore(&s->lock, flags); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 6319d846e0ad..7ef469e902c6 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -29,6 +29,9 @@ */ #define MIN_RAID456_JOURNAL_SPACE (4*2048) +/* Global list of all raid sets */ +static LIST_HEAD(raid_sets); + static bool devices_handle_discard_safely = false; /* @@ -105,8 +108,6 @@ struct raid_dev { #define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV) #define CTR_FLAG_JOURNAL_MODE (1 << __CTR_FLAG_JOURNAL_MODE) -#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET) - /* * Definitions of various constructor flags to * be used in checks of valid / invalid flags @@ -209,6 +210,8 @@ struct raid_dev { #define RT_FLAG_UPDATE_SBS 3 #define RT_FLAG_RESHAPE_RS 4 #define RT_FLAG_RS_SUSPENDED 5 +#define RT_FLAG_RS_IN_SYNC 6 +#define RT_FLAG_RS_RESYNCING 7 /* Array elements of 64 bit needed for rebuild/failed disk bits */ #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) @@ -224,8 +227,8 @@ struct rs_layout { struct raid_set { struct dm_target *ti; + struct list_head list; - uint32_t bitmap_loaded; uint32_t stripe_cache_entries; unsigned long ctr_flags; unsigned long runtime_flags; @@ -270,6 +273,19 @@ static void rs_config_restore(struct raid_set *rs, struct rs_layout *l) mddev->new_chunk_sectors = l->new_chunk_sectors; } +/* Find any raid_set in active slot for @rs on global list */ +static struct raid_set *rs_find_active(struct raid_set *rs) +{ + struct raid_set *r; + struct mapped_device *md = dm_table_get_md(rs->ti->table); + + list_for_each_entry(r, &raid_sets, list) + if (r != rs && dm_table_get_md(r->ti->table) == md) + return r; + + return NULL; +} + /* raid10 algorithms (i.e. 
formats) */ #define ALGORITHM_RAID10_DEFAULT 0 #define ALGORITHM_RAID10_NEAR 1 @@ -572,7 +588,7 @@ static const char *raid10_md_layout_to_format(int layout) } /* Return md raid10 algorithm for @name */ -static int raid10_name_to_format(const char *name) +static const int raid10_name_to_format(const char *name) { if (!strcasecmp(name, "near")) return ALGORITHM_RAID10_NEAR; @@ -675,15 +691,11 @@ static struct raid_type *get_raid_type_by_ll(const int level, const int layout) return NULL; } -/* - * Conditionally change bdev capacity of @rs - * in case of a disk add/remove reshape - */ -static void rs_set_capacity(struct raid_set *rs) +/* Adjust rdev sectors */ +static void rs_set_rdev_sectors(struct raid_set *rs) { struct mddev *mddev = &rs->md; struct md_rdev *rdev; - struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table)); /* * raid10 sets rdev->sector to the device size, which @@ -692,8 +704,16 @@ static void rs_set_capacity(struct raid_set *rs) rdev_for_each(rdev, mddev) if (!test_bit(Journal, &rdev->flags)) rdev->sectors = mddev->dev_sectors; +} - set_capacity(gendisk, mddev->array_sectors); +/* + * Change bdev capacity of @rs in case of a disk add/remove reshape + */ +static void rs_set_capacity(struct raid_set *rs) +{ + struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table)); + + set_capacity(gendisk, rs->md.array_sectors); revalidate_disk(gendisk); } @@ -744,6 +764,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r mddev_init(&rs->md); + INIT_LIST_HEAD(&rs->list); rs->raid_disks = raid_devs; rs->delta_disks = 0; @@ -761,6 +782,9 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r for (i = 0; i < raid_devs; i++) md_rdev_init(&rs->dev[i].rdev); + /* Add @rs to global list. */ + list_add(&rs->list, &raid_sets); + /* * Remaining items to be initialized by further RAID params: * rs->md.persistent @@ -773,6 +797,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r return rs; } +/* Free all @rs allocations and remove it from global list. */ static void raid_set_free(struct raid_set *rs) { int i; @@ -790,6 +815,8 @@ static void raid_set_free(struct raid_set *rs) dm_put_device(rs->ti, rs->dev[i].data_dev); } + list_del(&rs->list); + kfree(rs); } @@ -1002,7 +1029,7 @@ static int validate_raid_redundancy(struct raid_set *rs) !rs->dev[i].rdev.sb_page) rebuild_cnt++; - switch (rs->raid_type->level) { + switch (rs->md.level) { case 0: break; case 1: @@ -1017,6 +1044,11 @@ static int validate_raid_redundancy(struct raid_set *rs) break; case 10: copies = raid10_md_layout_to_copies(rs->md.new_layout); + if (copies < 2) { + DMERR("Bogus raid10 data copies < 2!"); + return -EINVAL; + } + if (rebuild_cnt < copies) break; @@ -1576,6 +1608,24 @@ static sector_t __rdev_sectors(struct raid_set *rs) return 0; } +/* Check that calculated dev_sectors fits all component devices. 
*/ +static int _check_data_dev_sectors(struct raid_set *rs) +{ + sector_t ds = ~0; + struct md_rdev *rdev; + + rdev_for_each(rdev, &rs->md) + if (!test_bit(Journal, &rdev->flags) && rdev->bdev) { + ds = min(ds, to_sector(i_size_read(rdev->bdev->bd_inode))); + if (ds < rs->md.dev_sectors) { + rs->ti->error = "Component device(s) too small"; + return -EINVAL; + } + } + + return 0; +} + /* Calculate the sectors per device and per array used for @rs */ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) { @@ -1625,7 +1675,7 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) mddev->array_sectors = array_sectors; mddev->dev_sectors = dev_sectors; - return 0; + return _check_data_dev_sectors(rs); bad: rs->ti->error = "Target length not divisible by number of data devices"; return -EINVAL; @@ -1674,8 +1724,11 @@ static void do_table_event(struct work_struct *ws) struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); smp_rmb(); /* Make sure we access most actual mddev properties */ - if (!rs_is_reshaping(rs)) + if (!rs_is_reshaping(rs)) { + if (rs_is_raid10(rs)) + rs_set_rdev_sectors(rs); rs_set_capacity(rs); + } dm_table_event(rs->ti->table); } @@ -1860,7 +1913,7 @@ static bool rs_reshape_requested(struct raid_set *rs) if (rs_takeover_requested(rs)) return false; - if (!mddev->level) + if (rs_is_raid0(rs)) return false; change = mddev->new_layout != mddev->layout || @@ -1868,7 +1921,7 @@ static bool rs_reshape_requested(struct raid_set *rs) rs->delta_disks; /* Historical case to support raid1 reshape without delta disks */ - if (mddev->level == 1) { + if (rs_is_raid1(rs)) { if (rs->delta_disks) return !!rs->delta_disks; @@ -1876,7 +1929,7 @@ static bool rs_reshape_requested(struct raid_set *rs) mddev->raid_disks != rs->raid_disks; } - if (mddev->level == 10) + if (rs_is_raid10(rs)) return change && !__is_raid10_far(mddev->new_layout) && rs->delta_disks >= 0; @@ -2340,7 +2393,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) DMERR("new device%s provided without 'rebuild'", new_devs > 1 ? "s" : ""); return -EINVAL; - } else if (rs_is_recovering(rs)) { + } else if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) && rs_is_recovering(rs)) { DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)", (unsigned long long) mddev->recovery_cp); return -EINVAL; @@ -2640,12 +2693,19 @@ static int rs_adjust_data_offsets(struct raid_set *rs) * Make sure we got a minimum amount of free sectors per device */ if (rs->data_offset && - to_sector(i_size_read(rdev->bdev->bd_inode)) - rdev->sectors < MIN_FREE_RESHAPE_SPACE) { + to_sector(i_size_read(rdev->bdev->bd_inode)) - rs->md.dev_sectors < MIN_FREE_RESHAPE_SPACE) { rs->ti->error = data_offset ? "No space for forward reshape" : "No space for backward reshape"; return -ENOSPC; } out: + /* + * Raise recovery_cp in case data_offset != 0 to + * avoid false recovery positives in the constructor. + */ + if (rs->md.recovery_cp < rs->md.dev_sectors) + rs->md.recovery_cp += rs->dev[0].rdev.data_offset; + /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */ rdev_for_each(rdev, &rs->md) { if (!test_bit(Journal, &rdev->flags)) { @@ -2682,14 +2742,14 @@ static int rs_setup_takeover(struct raid_set *rs) sector_t new_data_offset = rs->dev[0].rdev.data_offset ? 
0 : rs->data_offset; if (rt_is_raid10(rs->raid_type)) { - if (mddev->level == 0) { + if (rs_is_raid0(rs)) { /* Userpace reordered disks -> adjust raid_disk indexes */ __reorder_raid_disk_indexes(rs); /* raid0 -> raid10_far layout */ mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_FAR, rs->raid10_copies); - } else if (mddev->level == 1) + } else if (rs_is_raid1(rs)) /* raid1 -> raid10_near layout */ mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR, rs->raid_disks); @@ -2777,6 +2837,23 @@ static int rs_prepare_reshape(struct raid_set *rs) return 0; } +/* Get reshape sectors from data_offsets or raid set */ +static sector_t _get_reshape_sectors(struct raid_set *rs) +{ + struct md_rdev *rdev; + sector_t reshape_sectors = 0; + + rdev_for_each(rdev, &rs->md) + if (!test_bit(Journal, &rdev->flags)) { + reshape_sectors = (rdev->data_offset > rdev->new_data_offset) ? + rdev->data_offset - rdev->new_data_offset : + rdev->new_data_offset - rdev->data_offset; + break; + } + + return max(reshape_sectors, (sector_t) rs->data_offset); +} + /* * * - change raid layout @@ -2788,6 +2865,7 @@ static int rs_setup_reshape(struct raid_set *rs) { int r = 0; unsigned int cur_raid_devs, d; + sector_t reshape_sectors = _get_reshape_sectors(rs); struct mddev *mddev = &rs->md; struct md_rdev *rdev; @@ -2804,13 +2882,13 @@ static int rs_setup_reshape(struct raid_set *rs) /* * Adjust array size: * - * - in case of adding disks, array size has + * - in case of adding disk(s), array size has * to grow after the disk adding reshape, * which'll hapen in the event handler; * reshape will happen forward, so space has to * be available at the beginning of each disk * - * - in case of removing disks, array size + * - in case of removing disk(s), array size * has to shrink before starting the reshape, * which'll happen here; * reshape will happen backward, so space has to @@ -2841,7 +2919,7 @@ static int rs_setup_reshape(struct raid_set *rs) rdev->recovery_offset = rs_is_raid1(rs) ? 0 : MaxSector; } - mddev->reshape_backwards = 0; /* adding disks -> forward reshape */ + mddev->reshape_backwards = 0; /* adding disk(s) -> forward reshape */ /* Remove disk(s) */ } else if (rs->delta_disks < 0) { @@ -2874,6 +2952,15 @@ static int rs_setup_reshape(struct raid_set *rs) mddev->reshape_backwards = rs->dev[0].rdev.data_offset ? 0 : 1; } + /* + * Adjust device size for forward reshape + * because md_finish_reshape() reduces it. + */ + if (!mddev->reshape_backwards) + rdev_for_each(rdev, &rs->md) + if (!test_bit(Journal, &rdev->flags)) + rdev->sectors += reshape_sectors; + return r; } @@ -2890,7 +2977,7 @@ static void configure_discard_support(struct raid_set *rs) /* * XXX: RAID level 4,5,6 require zeroing for safety. */ - raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6); + raid456 = rs_is_raid456(rs); for (i = 0; i < rs->raid_disks; i++) { struct request_queue *q; @@ -2915,7 +3002,7 @@ static void configure_discard_support(struct raid_set *rs) * RAID1 and RAID10 personalities require bio splitting, * RAID0/4/5/6 don't and process large discard bios properly. 
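_get_reshape_sectors() above derives the out-of-place reshape headroom from the distance between the old and new data offsets of any data rdev, bounded below by the data offset requested on the table line. A stand-alone restatement of that arithmetic with hypothetical numbers (plain integers instead of the driver's sector_t):

#include <assert.h>

static unsigned long long reshape_sectors(unsigned long long data_offset,
					  unsigned long long new_data_offset,
					  unsigned long long requested)
{
	unsigned long long d = data_offset > new_data_offset ?
			       data_offset - new_data_offset :
			       new_data_offset - data_offset;

	return d > requested ? d : requested;
}

int main(void)
{
	/* old and new data offsets differ by 8192 sectors */
	assert(reshape_sectors(0, 8192, 0) == 8192);
	assert(reshape_sectors(8192, 0, 0) == 8192);
	/* a larger requested offset wins */
	assert(reshape_sectors(0, 8192, 16384) == 16384);
	return 0;
}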
*/ - ti->split_discard_bios = !!(rs->md.level == 1 || rs->md.level == 10); + ti->split_discard_bios = !!(rs_is_raid1(rs) || rs_is_raid10(rs)); ti->num_discard_bios = 1; } @@ -2935,10 +3022,10 @@ static void configure_discard_support(struct raid_set *rs) static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) { int r; - bool resize; + bool resize = false; struct raid_type *rt; unsigned int num_raid_params, num_raid_devs; - sector_t calculated_dev_sectors, rdev_sectors; + sector_t calculated_dev_sectors, rdev_sectors, reshape_sectors; struct raid_set *rs = NULL; const char *arg; struct rs_layout rs_layout; @@ -3021,7 +3108,10 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - resize = calculated_dev_sectors != rdev_sectors; + + reshape_sectors = _get_reshape_sectors(rs); + if (calculated_dev_sectors != rdev_sectors) + resize = calculated_dev_sectors != (reshape_sectors ? rdev_sectors - reshape_sectors : rdev_sectors); INIT_WORK(&rs->md.event_work, do_table_event); ti->private = rs; @@ -3105,19 +3195,22 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - /* - * We can only prepare for a reshape here, because the - * raid set needs to run to provide the repective reshape - * check functions via its MD personality instance. - * - * So do the reshape check after md_run() succeeded. - */ - r = rs_prepare_reshape(rs); - if (r) - return r; + /* Out-of-place space has to be available to allow for a reshape unless raid1! */ + if (reshape_sectors || rs_is_raid1(rs)) { + /* + * We can only prepare for a reshape here, because the + * raid set needs to run to provide the repective reshape + * check functions via its MD personality instance. + * + * So do the reshape check after md_run() succeeded. 
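The resize test in raid_ctr() above subtracts the reshape headroom from the rdev size read out of the superblock before comparing it with the freshly calculated per-device size; without that, an in-flight reshape would be misread as a resize request. A worked example with hypothetical sector counts:

#include <assert.h>

int main(void)
{
	unsigned long long calculated_dev_sectors = 2097152;	/* from the new table */
	unsigned long long rdev_sectors = 2105344;		/* from the superblock */
	unsigned long long reshape_sectors = 8192;		/* out-of-place headroom */
	int resize;

	resize = calculated_dev_sectors !=
		 (reshape_sectors ? rdev_sectors - reshape_sectors : rdev_sectors);
	assert(!resize);	/* same size once the headroom is ignored -> no resize */
	return 0;
}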
+ */ + r = rs_prepare_reshape(rs); + if (r) + return r; - /* Reshaping ain't recovery, so disable recovery */ - rs_setup_recovery(rs, MaxSector); + /* Reshaping ain't recovery, so disable recovery */ + rs_setup_recovery(rs, MaxSector); + } rs_set_cur(rs); } else { /* May not set recovery when a device rebuild is requested */ @@ -3144,13 +3237,20 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) mddev_lock_nointr(&rs->md); r = md_run(&rs->md); rs->md.in_sync = 0; /* Assume already marked dirty */ - if (r) { ti->error = "Failed to run raid array"; mddev_unlock(&rs->md); goto bad; } + r = md_start(&rs->md); + + if (r) { + ti->error = "Failed to start raid array"; + mddev_unlock(&rs->md); + goto bad_md_start; + } + rs->callbacks.congested_fn = raid_is_congested; dm_table_add_target_callbacks(ti->table, &rs->callbacks); @@ -3198,6 +3298,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) mddev_unlock(&rs->md); return 0; +bad_md_start: bad_journal_mode_set: bad_stripe_cache: bad_check_reshape: @@ -3239,25 +3340,27 @@ static int raid_map(struct dm_target *ti, struct bio *bio) } /* Return string describing the current sync action of @mddev */ -static const char *decipher_sync_action(struct mddev *mddev) +static const char *decipher_sync_action(struct mddev *mddev, unsigned long recovery) { - if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + if (test_bit(MD_RECOVERY_FROZEN, &recovery)) return "frozen"; - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + /* The MD sync thread can be done with io but still be running */ + if (!test_bit(MD_RECOVERY_DONE, &recovery) && + (test_bit(MD_RECOVERY_RUNNING, &recovery) || + (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery)))) { + if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) return "reshape"; - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + if (test_bit(MD_RECOVERY_SYNC, &recovery)) { + if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) return "resync"; - else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + else if (test_bit(MD_RECOVERY_CHECK, &recovery)) return "check"; return "repair"; } - if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) + if (test_bit(MD_RECOVERY_RECOVER, &recovery)) return "recover"; } @@ -3274,7 +3377,7 @@ static const char *decipher_sync_action(struct mddev *mddev) * 'A' = Alive and in-sync raid set component _or_ alive raid4/5/6 'write_through' journal device * '-' = Non-existing device (i.e. uspace passed '- -' into the ctr) */ -static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, bool array_in_sync) +static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev) { if (!rdev->bdev) return "-"; @@ -3282,85 +3385,108 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, return "D"; else if (test_bit(Journal, &rdev->flags)) return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? 
"A" : "a"; - else if (!array_in_sync || !test_bit(In_sync, &rdev->flags)) + else if (test_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags) || + (!test_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags) && + !test_bit(In_sync, &rdev->flags))) return "a"; else return "A"; } -/* Helper to return resync/reshape progress for @rs and @array_in_sync */ -static sector_t rs_get_progress(struct raid_set *rs, - sector_t resync_max_sectors, bool *array_in_sync) +/* Helper to return resync/reshape progress for @rs and runtime flags for raid set in sync / resynching */ +static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, + sector_t resync_max_sectors) { - sector_t r, curr_resync_completed; + sector_t r; struct mddev *mddev = &rs->md; - curr_resync_completed = mddev->curr_resync_completed ?: mddev->recovery_cp; - *array_in_sync = false; + clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + clear_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); if (rs_is_raid0(rs)) { r = resync_max_sectors; - *array_in_sync = true; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); } else { - r = mddev->reshape_position; - - /* Reshape is relative to the array size */ - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || - r != MaxSector) { - if (r == MaxSector) { - *array_in_sync = true; - r = resync_max_sectors; - } else { - /* Got to reverse on backward reshape */ - if (mddev->reshape_backwards) - r = mddev->array_sectors - r; - - /* Devide by # of data stripes */ - sector_div(r, mddev_data_stripes(rs)); - } - - /* Sync is relative to the component device size */ - } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - r = curr_resync_completed; + if (test_bit(MD_RECOVERY_NEEDED, &recovery) || + test_bit(MD_RECOVERY_RESHAPE, &recovery) || + test_bit(MD_RECOVERY_RUNNING, &recovery)) + r = mddev->curr_resync_completed; else r = mddev->recovery_cp; - if ((r == MaxSector) || - (test_bit(MD_RECOVERY_DONE, &mddev->recovery) && - (mddev->curr_resync_completed == resync_max_sectors))) { + if (r >= resync_max_sectors && + (!test_bit(MD_RECOVERY_REQUESTED, &recovery) || + (!test_bit(MD_RECOVERY_FROZEN, &recovery) && + !test_bit(MD_RECOVERY_NEEDED, &recovery) && + !test_bit(MD_RECOVERY_RUNNING, &recovery)))) { /* * Sync complete. */ - *array_in_sync = true; - r = resync_max_sectors; - } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + /* In case we have finished recovering, the array is in sync. */ + if (test_bit(MD_RECOVERY_RECOVER, &recovery)) + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + + } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) { + /* + * In case we are recovering, the array is not in sync + * and health chars should show the recovering legs. + */ + ; + + } else if (test_bit(MD_RECOVERY_SYNC, &recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &recovery)) { + /* + * If "resync" is occurring, the raid set + * is or may be out of sync hence the health + * characters shall be 'a'. + */ + set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + + } else if (test_bit(MD_RECOVERY_RESHAPE, &recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &recovery)) { + /* + * If "reshape" is occurring, the raid set + * is or may be out of sync hence the health + * characters shall be 'a'. + */ + set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + + } else if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) { /* * If "check" or "repair" is occurring, the raid set has * undergone an initial sync and the health characters * should not be 'a' anymore. 
*/ - *array_in_sync = true; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + } else { struct md_rdev *rdev; /* + * We are idle and recovery is needed, prevent 'A' chars race + * caused by components still set to in-sync by constrcuctor. + */ + if (test_bit(MD_RECOVERY_NEEDED, &recovery)) + set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + + /* * The raid set may be doing an initial sync, or it may * be rebuilding individual components. If all the * devices are In_sync, then it is the raid set that is * being initialized. */ + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); rdev_for_each(rdev, mddev) if (!test_bit(Journal, &rdev->flags) && - !test_bit(In_sync, &rdev->flags)) - *array_in_sync = true; -#if 0 - r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */ -#endif + !test_bit(In_sync, &rdev->flags)) { + clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + break; + } } } - return r; + return min(r, resync_max_sectors); } /* Helper to return @dev name or "-" if !@dev */ @@ -3376,7 +3502,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, struct mddev *mddev = &rs->md; struct r5conf *conf = mddev->private; int i, max_nr_stripes = conf ? conf->max_nr_stripes : 0; - bool array_in_sync; + unsigned long recovery; unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */ unsigned int sz = 0; unsigned int rebuild_disks; @@ -3396,17 +3522,18 @@ static void raid_status(struct dm_target *ti, status_type_t type, /* Access most recent mddev properties for status output */ smp_rmb(); + recovery = rs->md.recovery; /* Get sensible max sectors even if raid set not yet started */ resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ? mddev->resync_max_sectors : mddev->dev_sectors; - progress = rs_get_progress(rs, resync_max_sectors, &array_in_sync); + progress = rs_get_progress(rs, recovery, resync_max_sectors); resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ? atomic64_read(&mddev->resync_mismatches) : 0; - sync_action = decipher_sync_action(&rs->md); + sync_action = decipher_sync_action(&rs->md, recovery); /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */ for (i = 0; i < rs->raid_disks; i++) - DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev, array_in_sync)); + DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev)); /* * In-sync/Reshape ratio: @@ -3457,7 +3584,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, * v1.10.0+: */ DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? - __raid_dev_status(rs, &rs->journal_dev.rdev, 0) : "-"); + __raid_dev_status(rs, &rs->journal_dev.rdev) : "-"); break; case STATUSTYPE_TABLE: @@ -3613,24 +3740,19 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_opt(limits, chunk_size * mddev_data_stripes(rs)); } -static void raid_presuspend(struct dm_target *ti) -{ - struct raid_set *rs = ti->private; - - md_stop_writes(&rs->md); -} - static void raid_postsuspend(struct dm_target *ti) { struct raid_set *rs = ti->private; if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + /* Writes have to be stopped before suspending to avoid deadlocks. 
*/ + if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery)) + md_stop_writes(&rs->md); + mddev_lock_nointr(&rs->md); mddev_suspend(&rs->md); mddev_unlock(&rs->md); } - - rs->md.ro = 1; } static void attempt_restore_of_faulty_devices(struct raid_set *rs) @@ -3807,10 +3929,33 @@ static int raid_preresume(struct dm_target *ti) struct raid_set *rs = ti->private; struct mddev *mddev = &rs->md; - /* This is a resume after a suspend of the set -> it's already started */ + /* This is a resume after a suspend of the set -> it's already started. */ if (test_and_set_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags)) return 0; + if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) { + struct raid_set *rs_active = rs_find_active(rs); + + if (rs_active) { + /* + * In case no rebuilds have been requested + * and an active table slot exists, copy + * current resynchonization completed and + * reshape position pointers across from + * suspended raid set in the active slot. + * + * This resumes the new mapping at current + * offsets to continue recover/reshape without + * necessarily redoing a raid set partially or + * causing data corruption in case of a reshape. + */ + if (rs_active->md.curr_resync_completed != MaxSector) + mddev->curr_resync_completed = rs_active->md.curr_resync_completed; + if (rs_active->md.reshape_position != MaxSector) + mddev->reshape_position = rs_active->md.reshape_position; + } + } + /* * The superblocks need to be updated on disk if the * array is new or new devices got added (thus zeroed @@ -3842,11 +3987,10 @@ static int raid_preresume(struct dm_target *ti) mddev->resync_min = mddev->recovery_cp; } - rs_set_capacity(rs); - /* Check for any reshape request unless new raid set */ - if (test_and_clear_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) { + if (test_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) { /* Initiate a reshape. */ + rs_set_rdev_sectors(rs); mddev_lock_nointr(mddev); r = rs_start_reshape(rs); mddev_unlock(mddev); @@ -3872,21 +4016,15 @@ static void raid_resume(struct dm_target *ti) attempt_restore_of_faulty_devices(rs); } - mddev->ro = 0; - mddev->in_sync = 0; - - /* - * Keep the RAID set frozen if reshape/rebuild flags are set. - * The RAID set is unfrozen once the next table load/resume, - * which clears the reshape/rebuild flags, occurs. - * This ensures that the constructor for the inactive table - * retrieves an up-to-date reshape_position. - */ - if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + /* Only reduce raid set size before running a disk removing reshape. 
*/ + if (mddev->delta_disks < 0) + rs_set_capacity(rs); + mddev_lock_nointr(mddev); + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + mddev->ro = 0; + mddev->in_sync = 0; mddev_resume(mddev); mddev_unlock(mddev); } @@ -3894,7 +4032,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 13, 0}, + .version = {1, 13, 2}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, @@ -3903,7 +4041,6 @@ static struct target_type raid_target = { .message = raid_message, .iterate_devices = raid_iterate_devices, .io_hints = raid_io_hints, - .presuspend = raid_presuspend, .postsuspend = raid_postsuspend, .preresume = raid_preresume, .resume = raid_resume, diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 9d32f25489c2..bf0b840645cc 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -315,6 +315,10 @@ static void dm_done(struct request *clone, blk_status_t error, bool mapped) /* The target wants to requeue the I/O */ dm_requeue_original_request(tio, false); break; + case DM_ENDIO_DELAY_REQUEUE: + /* The target wants to requeue the I/O after a delay */ + dm_requeue_original_request(tio, true); + break; default: DMWARN("unimplemented target endio return value: %d", r); BUG(); @@ -395,7 +399,7 @@ static void end_clone_request(struct request *clone, blk_status_t error) dm_complete_request(tio->orig, error); } -static void dm_dispatch_clone_request(struct request *clone, struct request *rq) +static blk_status_t dm_dispatch_clone_request(struct request *clone, struct request *rq) { blk_status_t r; @@ -404,9 +408,10 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq) clone->start_time = jiffies; r = blk_insert_cloned_request(clone->q, clone); - if (r) + if (r != BLK_STS_OK && r != BLK_STS_RESOURCE && r != BLK_STS_DEV_RESOURCE) /* must complete clone in terms of original request */ dm_complete_request(rq, r); + return r; } static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, @@ -476,8 +481,10 @@ static int map_request(struct dm_rq_target_io *tio) struct mapped_device *md = tio->md; struct request *rq = tio->orig; struct request *clone = NULL; + blk_status_t ret; r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); +check_again: switch (r) { case DM_MAPIO_SUBMITTED: /* The target has taken the I/O to submit by itself later */ @@ -492,7 +499,17 @@ static int map_request(struct dm_rq_target_io *tio) /* The target has remapped the I/O so dispatch it */ trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), blk_rq_pos(rq)); - dm_dispatch_clone_request(clone, rq); + ret = dm_dispatch_clone_request(clone, rq); + if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { + blk_rq_unprep_clone(clone); + tio->ti->type->release_clone_rq(clone); + tio->clone = NULL; + if (!rq->q->mq_ops) + r = DM_MAPIO_DELAY_REQUEUE; + else + r = DM_MAPIO_REQUEUE; + goto check_again; + } break; case DM_MAPIO_REQUEUE: /* The target wants to requeue the I/O */ @@ -700,7 +717,6 @@ int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t) /* disable dm_old_request_fn's merge heuristic by default */ md->seq_rq_merge_deadline_usecs = 0; - dm_init_normal_md_queue(md); blk_queue_softirq_done(md->queue, dm_softirq_done); /* Initialize the request-based DM worker thread */ @@ -713,8 +729,6 @@ int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t) return error; } - elv_register_queue(md->queue); - return 0; } @@ -758,7 +772,6 @@ static blk_status_t 
dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, /* Undo dm_start_request() before requeuing */ rq_end_stats(md, rq); rq_completed(md, rq_data_dir(rq), false); - blk_mq_delay_run_hw_queue(hctx, 100/*ms*/); return BLK_STS_RESOURCE; } @@ -810,17 +823,9 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) err = PTR_ERR(q); goto out_tag_set; } - dm_init_md_queue(md); - - /* backfill 'mq' sysfs registration normally done in blk_register_queue */ - err = blk_mq_register_dev(disk_to_dev(md->disk), q); - if (err) - goto out_cleanup_queue; return 0; -out_cleanup_queue: - blk_cleanup_queue(q); out_tag_set: blk_mq_free_tag_set(md->tag_set); out_kfree_tag_set: diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c index 7b8642045c55..f006a9005593 100644 --- a/drivers/md/dm-service-time.c +++ b/drivers/md/dm-service-time.c @@ -282,9 +282,6 @@ static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes) if (list_empty(&s->valid_paths)) goto out; - /* Change preferred (first in list) path to evenly balance. */ - list_move_tail(s->valid_paths.next, &s->valid_paths); - list_for_each_entry(pi, &s->valid_paths, list) if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) best = pi; @@ -292,6 +289,9 @@ static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes) if (!best) goto out; + /* Move most recently used to least preferred to evenly balance. */ + list_move_tail(&best->list, &s->valid_paths); + ret = best->path; out: spin_unlock_irqrestore(&s->lock, flags); diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index a0613bd8ed00..216035be5661 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -47,7 +47,7 @@ struct dm_exception_table { }; struct dm_snapshot { - struct rw_semaphore lock; + struct mutex lock; struct dm_dev *origin; struct dm_dev *cow; @@ -439,9 +439,9 @@ static int __find_snapshots_sharing_cow(struct dm_snapshot *snap, if (!bdev_equal(s->cow->bdev, snap->cow->bdev)) continue; - down_read(&s->lock); + mutex_lock(&s->lock); active = s->active; - up_read(&s->lock); + mutex_unlock(&s->lock); if (active) { if (snap_src) @@ -909,7 +909,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s) int r; chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1; - down_write(&s->lock); + mutex_lock(&s->lock); /* * Process chunks (and associated exceptions) in reverse order @@ -924,7 +924,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s) b = __release_queued_bios_after_merge(s); out: - up_write(&s->lock); + mutex_unlock(&s->lock); if (b) flush_bios(b); @@ -983,9 +983,9 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s) if (linear_chunks < 0) { DMERR("Read error in exception store: " "shutting down merge"); - down_write(&s->lock); + mutex_lock(&s->lock); s->merge_failed = 1; - up_write(&s->lock); + mutex_unlock(&s->lock); } goto shut; } @@ -1026,10 +1026,10 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s) previous_count = read_pending_exceptions_done_count(); } - down_write(&s->lock); + mutex_lock(&s->lock); s->first_merging_chunk = old_chunk; s->num_merging_chunks = linear_chunks; - up_write(&s->lock); + mutex_unlock(&s->lock); /* Wait until writes to all 'linear_chunks' drain */ for (i = 0; i < linear_chunks; i++) @@ -1071,10 +1071,10 @@ static void merge_callback(int read_err, unsigned long write_err, void *context) return; shut: - down_write(&s->lock); + mutex_lock(&s->lock); s->merge_failed = 1; b = 
__release_queued_bios_after_merge(s); - up_write(&s->lock); + mutex_unlock(&s->lock); error_bios(b); merge_shutdown(s); @@ -1173,7 +1173,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->exception_start_sequence = 0; s->exception_complete_sequence = 0; INIT_LIST_HEAD(&s->out_of_order_list); - init_rwsem(&s->lock); + mutex_init(&s->lock); INIT_LIST_HEAD(&s->list); spin_lock_init(&s->pe_lock); s->state_bits = 0; @@ -1338,9 +1338,9 @@ static void snapshot_dtr(struct dm_target *ti) /* Check whether exception handover must be cancelled */ (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); if (snap_src && snap_dest && (s == snap_src)) { - down_write(&snap_dest->lock); + mutex_lock(&snap_dest->lock); snap_dest->valid = 0; - up_write(&snap_dest->lock); + mutex_unlock(&snap_dest->lock); DMERR("Cancelling snapshot handover."); } up_read(&_origins_lock); @@ -1371,6 +1371,8 @@ static void snapshot_dtr(struct dm_target *ti) dm_exception_store_destroy(s->store); + mutex_destroy(&s->lock); + dm_put_device(ti, s->cow); dm_put_device(ti, s->origin); @@ -1458,7 +1460,7 @@ static void pending_complete(void *context, int success) if (!success) { /* Read/write error - snapshot is unusable */ - down_write(&s->lock); + mutex_lock(&s->lock); __invalidate_snapshot(s, -EIO); error = 1; goto out; @@ -1466,14 +1468,14 @@ static void pending_complete(void *context, int success) e = alloc_completed_exception(GFP_NOIO); if (!e) { - down_write(&s->lock); + mutex_lock(&s->lock); __invalidate_snapshot(s, -ENOMEM); error = 1; goto out; } *e = pe->e; - down_write(&s->lock); + mutex_lock(&s->lock); if (!s->valid) { free_completed_exception(e); error = 1; @@ -1498,7 +1500,7 @@ out: full_bio->bi_end_io = pe->full_bio_end_io; increment_pending_exceptions_done_count(); - up_write(&s->lock); + mutex_unlock(&s->lock); /* Submit any pending write bios */ if (error) { @@ -1694,7 +1696,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) /* FIXME: should only take write lock if we need * to copy an exception */ - down_write(&s->lock); + mutex_lock(&s->lock); if (!s->valid || (unlikely(s->snapshot_overflowed) && bio_data_dir(bio) == WRITE)) { @@ -1717,9 +1719,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (bio_data_dir(bio) == WRITE) { pe = __lookup_pending_exception(s, chunk); if (!pe) { - up_write(&s->lock); + mutex_unlock(&s->lock); pe = alloc_pending_exception(s); - down_write(&s->lock); + mutex_lock(&s->lock); if (!s->valid || s->snapshot_overflowed) { free_pending_exception(pe); @@ -1754,7 +1756,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) bio->bi_iter.bi_size == (s->store->chunk_size << SECTOR_SHIFT)) { pe->started = 1; - up_write(&s->lock); + mutex_unlock(&s->lock); start_full_bio(pe, bio); goto out; } @@ -1764,7 +1766,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (!pe->started) { /* this is protected by snap->lock */ pe->started = 1; - up_write(&s->lock); + mutex_unlock(&s->lock); start_copy(pe); goto out; } @@ -1774,7 +1776,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) } out_unlock: - up_write(&s->lock); + mutex_unlock(&s->lock); out: return r; } @@ -1810,7 +1812,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); - down_write(&s->lock); + mutex_lock(&s->lock); /* Full merging snapshots are redirected to the origin */ if (!s->valid) @@ -1841,12 +1843,12 @@ redirect_to_origin: 
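In snapshot_map() above the snapshot lock is now a mutex, and the existing drop-lock-around-allocation dance is kept: alloc_pending_exception() may sleep, so the lock is released for the allocation and the snapshot state is re-checked once it is re-taken. A user-space sketch of that pattern with stand-in names (need_new_exception(), alloc_pending() and valid are placeholders, not driver symbols):

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool valid = true;				/* stands in for s->valid */

static bool need_new_exception(void) { return true; }
static void *alloc_pending(void) { return malloc(64); }	/* may block */

int main(void)
{
	void *pe = NULL;

	pthread_mutex_lock(&lock);
	if (need_new_exception()) {
		pthread_mutex_unlock(&lock);	/* the allocation may sleep */
		pe = alloc_pending();
		pthread_mutex_lock(&lock);
		if (!valid) {			/* re-check: state may have changed */
			free(pe);
			pe = NULL;
		}
	}
	pthread_mutex_unlock(&lock);
	free(pe);
	return 0;
}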
bio_set_dev(bio, s->origin->bdev); if (bio_data_dir(bio) == WRITE) { - up_write(&s->lock); + mutex_unlock(&s->lock); return do_origin(s->origin, bio); } out_unlock: - up_write(&s->lock); + mutex_unlock(&s->lock); return r; } @@ -1878,7 +1880,7 @@ static int snapshot_preresume(struct dm_target *ti) down_read(&_origins_lock); (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); if (snap_src && snap_dest) { - down_read(&snap_src->lock); + mutex_lock(&snap_src->lock); if (s == snap_src) { DMERR("Unable to resume snapshot source until " "handover completes."); @@ -1888,7 +1890,7 @@ static int snapshot_preresume(struct dm_target *ti) "source is suspended."); r = -EINVAL; } - up_read(&snap_src->lock); + mutex_unlock(&snap_src->lock); } up_read(&_origins_lock); @@ -1934,11 +1936,11 @@ static void snapshot_resume(struct dm_target *ti) (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); if (snap_src && snap_dest) { - down_write(&snap_src->lock); - down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); + mutex_lock(&snap_src->lock); + mutex_lock_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); __handover_exceptions(snap_src, snap_dest); - up_write(&snap_dest->lock); - up_write(&snap_src->lock); + mutex_unlock(&snap_dest->lock); + mutex_unlock(&snap_src->lock); } up_read(&_origins_lock); @@ -1953,9 +1955,9 @@ static void snapshot_resume(struct dm_target *ti) /* Now we have correct chunk size, reregister */ reregister_snapshot(s); - down_write(&s->lock); + mutex_lock(&s->lock); s->active = 1; - up_write(&s->lock); + mutex_unlock(&s->lock); } static uint32_t get_origin_minimum_chunksize(struct block_device *bdev) @@ -1995,7 +1997,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type, switch (type) { case STATUSTYPE_INFO: - down_write(&snap->lock); + mutex_lock(&snap->lock); if (!snap->valid) DMEMIT("Invalid"); @@ -2020,7 +2022,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type, DMEMIT("Unknown"); } - up_write(&snap->lock); + mutex_unlock(&snap->lock); break; @@ -2086,7 +2088,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, if (dm_target_is_snapshot_merge(snap->ti)) continue; - down_write(&snap->lock); + mutex_lock(&snap->lock); /* Only deal with valid and active snapshots */ if (!snap->valid || !snap->active) @@ -2113,9 +2115,9 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, pe = __lookup_pending_exception(snap, chunk); if (!pe) { - up_write(&snap->lock); + mutex_unlock(&snap->lock); pe = alloc_pending_exception(snap); - down_write(&snap->lock); + mutex_lock(&snap->lock); if (!snap->valid) { free_pending_exception(pe); @@ -2158,7 +2160,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, } next_snapshot: - up_write(&snap->lock); + mutex_unlock(&snap->lock); if (pe_to_start_now) { start_copy(pe_to_start_now); diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 29bc51084c82..56059fb56e2d 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -228,6 +228,7 @@ void dm_stats_cleanup(struct dm_stats *stats) dm_stat_free(&s->rcu_head); } free_percpu(stats->last); + mutex_destroy(&stats->mutex); } static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index aaffd0c0ee9a..5fe7ec356c33 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -866,7 +866,8 @@ EXPORT_SYMBOL(dm_consume_args); static bool __table_type_bio_based(enum dm_queue_mode 
table_type) { return (table_type == DM_TYPE_BIO_BASED || - table_type == DM_TYPE_DAX_BIO_BASED); + table_type == DM_TYPE_DAX_BIO_BASED || + table_type == DM_TYPE_NVME_BIO_BASED); } static bool __table_type_request_based(enum dm_queue_mode table_type) @@ -909,13 +910,33 @@ static bool dm_table_supports_dax(struct dm_table *t) return true; } +static bool dm_table_does_not_support_partial_completion(struct dm_table *t); + +struct verify_rq_based_data { + unsigned sq_count; + unsigned mq_count; +}; + +static int device_is_rq_based(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + struct verify_rq_based_data *v = data; + + if (q->mq_ops) + v->mq_count++; + else + v->sq_count++; + + return queue_is_rq_based(q); +} + static int dm_table_determine_type(struct dm_table *t) { unsigned i; unsigned bio_based = 0, request_based = 0, hybrid = 0; - unsigned sq_count = 0, mq_count = 0; + struct verify_rq_based_data v = {.sq_count = 0, .mq_count = 0}; struct dm_target *tgt; - struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); enum dm_queue_mode live_md_type = dm_get_md_type(t->md); @@ -923,6 +944,14 @@ static int dm_table_determine_type(struct dm_table *t) /* target already set the table's type */ if (t->type == DM_TYPE_BIO_BASED) return 0; + else if (t->type == DM_TYPE_NVME_BIO_BASED) { + if (!dm_table_does_not_support_partial_completion(t)) { + DMERR("nvme bio-based is only possible with devices" + " that don't support partial completion"); + return -EINVAL; + } + /* Fallthru, also verify all devices are blk-mq */ + } BUG_ON(t->type == DM_TYPE_DAX_BIO_BASED); goto verify_rq_based; } @@ -937,8 +966,8 @@ static int dm_table_determine_type(struct dm_table *t) bio_based = 1; if (bio_based && request_based) { - DMWARN("Inconsistent table: different target types" - " can't be mixed up"); + DMERR("Inconsistent table: different target types" + " can't be mixed up"); return -EINVAL; } } @@ -959,8 +988,18 @@ static int dm_table_determine_type(struct dm_table *t) /* We must use this table as bio-based */ t->type = DM_TYPE_BIO_BASED; if (dm_table_supports_dax(t) || - (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) + (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) { t->type = DM_TYPE_DAX_BIO_BASED; + } else { + /* Check if upgrading to NVMe bio-based is valid or required */ + tgt = dm_table_get_immutable_target(t); + if (tgt && !tgt->max_io_len && dm_table_does_not_support_partial_completion(t)) { + t->type = DM_TYPE_NVME_BIO_BASED; + goto verify_rq_based; /* must be stacked directly on NVMe (blk-mq) */ + } else if (list_empty(devices) && live_md_type == DM_TYPE_NVME_BIO_BASED) { + t->type = DM_TYPE_NVME_BIO_BASED; + } + } return 0; } @@ -980,7 +1019,8 @@ verify_rq_based: * (e.g. request completion process for partial completion.) */ if (t->num_targets > 1) { - DMWARN("Request-based dm doesn't support multiple targets yet"); + DMERR("%s DM doesn't support multiple targets", + t->type == DM_TYPE_NVME_BIO_BASED ? 
"nvme bio-based" : "request-based"); return -EINVAL; } @@ -997,28 +1037,29 @@ verify_rq_based: return 0; } - /* Non-request-stackable devices can't be used for request-based dm */ - list_for_each_entry(dd, devices, list) { - struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); - - if (!queue_is_rq_based(q)) { - DMERR("table load rejected: including" - " non-request-stackable devices"); - return -EINVAL; - } + tgt = dm_table_get_immutable_target(t); + if (!tgt) { + DMERR("table load rejected: immutable target is required"); + return -EINVAL; + } else if (tgt->max_io_len) { + DMERR("table load rejected: immutable target that splits IO is not supported"); + return -EINVAL; + } - if (q->mq_ops) - mq_count++; - else - sq_count++; + /* Non-request-stackable devices can't be used for request-based dm */ + if (!tgt->type->iterate_devices || + !tgt->type->iterate_devices(tgt, device_is_rq_based, &v)) { + DMERR("table load rejected: including non-request-stackable devices"); + return -EINVAL; } - if (sq_count && mq_count) { + if (v.sq_count && v.mq_count) { DMERR("table load rejected: not all devices are blk-mq request-stackable"); return -EINVAL; } - t->all_blk_mq = mq_count > 0; + t->all_blk_mq = v.mq_count > 0; - if (t->type == DM_TYPE_MQ_REQUEST_BASED && !t->all_blk_mq) { + if (!t->all_blk_mq && + (t->type == DM_TYPE_MQ_REQUEST_BASED || t->type == DM_TYPE_NVME_BIO_BASED)) { DMERR("table load rejected: all devices are not blk-mq request-stackable"); return -EINVAL; } @@ -1079,7 +1120,8 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * { enum dm_queue_mode type = dm_table_get_type(t); unsigned per_io_data_size = 0; - struct dm_target *tgt; + unsigned min_pool_size = 0; + struct dm_target *ti; unsigned i; if (unlikely(type == DM_TYPE_NONE)) { @@ -1089,11 +1131,13 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * if (__table_type_bio_based(type)) for (i = 0; i < t->num_targets; i++) { - tgt = t->targets + i; - per_io_data_size = max(per_io_data_size, tgt->per_io_data_size); + ti = t->targets + i; + per_io_data_size = max(per_io_data_size, ti->per_io_data_size); + min_pool_size = max(min_pool_size, ti->num_flush_bios); } - t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_io_data_size); + t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, + per_io_data_size, min_pool_size); if (!t->mempools) return -ENOMEM; @@ -1705,6 +1749,20 @@ static bool dm_table_all_devices_attribute(struct dm_table *t, return true; } +static int device_no_partial_completion(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + char b[BDEVNAME_SIZE]; + + /* For now, NVMe devices are the only devices of this class */ + return (strncmp(bdevname(dev->bdev, b), "nvme", 3) == 0); +} + +static bool dm_table_does_not_support_partial_completion(struct dm_table *t) +{ + return dm_table_all_devices_attribute(t, device_no_partial_completion); +} + static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1820,6 +1878,8 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } blk_queue_write_cache(q, wc, fua); + if (dm_table_supports_dax(t)) + queue_flag_set_unlocked(QUEUE_FLAG_DAX, q); if (dm_table_supports_dax_write_cache(t)) dax_write_cache(t->md->dax_dev, true); diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index d31d18d9727c..36ef284ad086 100644 --- 
a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -80,10 +80,14 @@ #define SECTOR_TO_BLOCK_SHIFT 3 /* + * For btree insert: * 3 for btree insert + + * 2 for btree lookup used within space map + * For btree remove: + * 2 for shadow spine + + * 4 for rebalance 3 child node */ -#define THIN_MAX_CONCURRENT_LOCKS 5 +#define THIN_MAX_CONCURRENT_LOCKS 6 /* This should be plenty */ #define SPACE_MAP_ROOT_SIZE 128 diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index f91d771fff4b..629c555890c1 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -492,6 +492,11 @@ static void pool_table_init(void) INIT_LIST_HEAD(&dm_thin_pool_table.pools); } +static void pool_table_exit(void) +{ + mutex_destroy(&dm_thin_pool_table.mutex); +} + static void __pool_table_insert(struct pool *pool) { BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); @@ -1717,7 +1722,7 @@ static void __remap_and_issue_shared_cell(void *context, bio_op(bio) == REQ_OP_DISCARD) bio_list_add(&info->defer_bios, bio); else { - struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));; + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds); inc_all_io_entry(info->tc->pool, bio); @@ -4387,6 +4392,8 @@ static void dm_thin_exit(void) dm_unregister_target(&pool_target); kmem_cache_destroy(_new_mapping_cache); + + pool_table_exit(); } module_init(dm_thin_init); diff --git a/drivers/md/dm-unstripe.c b/drivers/md/dm-unstripe.c new file mode 100644 index 000000000000..65f838fa2e99 --- /dev/null +++ b/drivers/md/dm-unstripe.c @@ -0,0 +1,219 @@ +/* + * Copyright (C) 2017 Intel Corporation. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/bio.h> +#include <linux/slab.h> +#include <linux/bitops.h> +#include <linux/device-mapper.h> + +struct unstripe_c { + struct dm_dev *dev; + sector_t physical_start; + + uint32_t stripes; + + uint32_t unstripe; + sector_t unstripe_width; + sector_t unstripe_offset; + + uint32_t chunk_size; + u8 chunk_shift; +}; + +#define DM_MSG_PREFIX "unstriped" + +static void cleanup_unstripe(struct unstripe_c *uc, struct dm_target *ti) +{ + if (uc->dev) + dm_put_device(ti, uc->dev); + kfree(uc); +} + +/* + * Construct an unstriped mapping.
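The constructor parameters on the next line feed a purely arithmetic remapping (see map_to_core() further down): each chunk owned by the selected stripe is pulled out of the striped device and presented as one contiguous region. Below is a minimal user-space sketch of that arithmetic, using assumed example values (4 stripes, 128-sector chunks, extracting stripe #1) rather than anything taken from the patch; the real target also adds the <offset> argument (uc->physical_start) in unstripe_map().

/*
 * Stand-alone model of the dm-unstriped remapping; it mirrors the arithmetic
 * of unstripe_ctr()/map_to_core() below but is not kernel code, and all
 * values are made-up examples.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t stripes = 4;		/* <number of stripes> */
	uint32_t chunk_size = 128;	/* <chunk size> in sectors, power of 2 */
	uint32_t unstripe = 1;		/* <stripe #> being extracted */

	uint64_t unstripe_offset = (uint64_t)unstripe * chunk_size;
	uint64_t unstripe_width = (uint64_t)(stripes - 1) * chunk_size;
	unsigned chunk_shift = 0;	/* will equal fls(chunk_size) - 1, i.e. log2 */

	while ((1u << chunk_shift) < chunk_size)
		chunk_shift++;

	/* walk the first few logical sectors of the unstriped view */
	for (uint64_t sector = 0; sector < 4ull * chunk_size; sector += chunk_size) {
		uint64_t mapped = sector;

		/* shift up to the right "row" of chunks on the striped device */
		mapped += unstripe_width * (sector >> chunk_shift);
		/* then step over to the chunk owned by this stripe */
		mapped += unstripe_offset;

		printf("logical %6llu -> striped device %6llu\n",
		       (unsigned long long)sector, (unsigned long long)mapped);
	}
	return 0;
}

With those example values, logical sector 0 maps to sector 128 and logical sector 128 maps to sector 640, i.e. chunk 1 owned by stripe #1 on the 4-way stripe.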
+ * <number of stripes> <chunk size> <stripe #> <dev_path> <offset> + */ +static int unstripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct unstripe_c *uc; + sector_t tmp_len; + unsigned long long start; + char dummy; + + if (argc != 5) { + ti->error = "Invalid number of arguments"; + return -EINVAL; + } + + uc = kzalloc(sizeof(*uc), GFP_KERNEL); + if (!uc) { + ti->error = "Memory allocation for unstriped context failed"; + return -ENOMEM; + } + + if (kstrtouint(argv[0], 10, &uc->stripes) || !uc->stripes) { + ti->error = "Invalid stripe count"; + goto err; + } + + if (kstrtouint(argv[1], 10, &uc->chunk_size) || !uc->chunk_size) { + ti->error = "Invalid chunk_size"; + goto err; + } + + // FIXME: must support non power of 2 chunk_size, dm-stripe.c does + if (!is_power_of_2(uc->chunk_size)) { + ti->error = "Non power of 2 chunk_size is not supported yet"; + goto err; + } + + if (kstrtouint(argv[2], 10, &uc->unstripe)) { + ti->error = "Invalid stripe number"; + goto err; + } + + if (uc->unstripe > uc->stripes && uc->stripes > 1) { + ti->error = "Please provide stripe between [0, # of stripes]"; + goto err; + } + + if (dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &uc->dev)) { + ti->error = "Couldn't get striped device"; + goto err; + } + + if (sscanf(argv[4], "%llu%c", &start, &dummy) != 1) { + ti->error = "Invalid striped device offset"; + goto err; + } + uc->physical_start = start; + + uc->unstripe_offset = uc->unstripe * uc->chunk_size; + uc->unstripe_width = (uc->stripes - 1) * uc->chunk_size; + uc->chunk_shift = fls(uc->chunk_size) - 1; + + tmp_len = ti->len; + if (sector_div(tmp_len, uc->chunk_size)) { + ti->error = "Target length not divisible by chunk size"; + goto err; + } + + if (dm_set_target_max_io_len(ti, uc->chunk_size)) { + ti->error = "Failed to set max io len"; + goto err; + } + + ti->private = uc; + return 0; +err: + cleanup_unstripe(uc, ti); + return -EINVAL; +} + +static void unstripe_dtr(struct dm_target *ti) +{ + struct unstripe_c *uc = ti->private; + + cleanup_unstripe(uc, ti); +} + +static sector_t map_to_core(struct dm_target *ti, struct bio *bio) +{ + struct unstripe_c *uc = ti->private; + sector_t sector = bio->bi_iter.bi_sector; + + /* Shift us up to the right "row" on the stripe */ + sector += uc->unstripe_width * (sector >> uc->chunk_shift); + + /* Account for what stripe we're operating on */ + sector += uc->unstripe_offset; + + return sector; +} + +static int unstripe_map(struct dm_target *ti, struct bio *bio) +{ + struct unstripe_c *uc = ti->private; + + bio_set_dev(bio, uc->dev->bdev); + bio->bi_iter.bi_sector = map_to_core(ti, bio) + uc->physical_start; + + return DM_MAPIO_REMAPPED; +} + +static void unstripe_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + struct unstripe_c *uc = ti->private; + unsigned int sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + break; + + case STATUSTYPE_TABLE: + DMEMIT("%d %llu %d %s %llu", + uc->stripes, (unsigned long long)uc->chunk_size, uc->unstripe, + uc->dev->name, (unsigned long long)uc->physical_start); + break; + } +} + +static int unstripe_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct unstripe_c *uc = ti->private; + + return fn(ti, uc->dev, uc->physical_start, ti->len, data); +} + +static void unstripe_io_hints(struct dm_target *ti, + struct queue_limits *limits) +{ + struct unstripe_c *uc = ti->private; + + limits->chunk_sectors = uc->chunk_size; +} + +static 
struct target_type unstripe_target = { + .name = "unstriped", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = unstripe_ctr, + .dtr = unstripe_dtr, + .map = unstripe_map, + .status = unstripe_status, + .iterate_devices = unstripe_iterate_devices, + .io_hints = unstripe_io_hints, +}; + +static int __init dm_unstripe_init(void) +{ + int r; + + r = dm_register_target(&unstripe_target); + if (r < 0) + DMERR("target registration failed"); + + return r; +} + +static void __exit dm_unstripe_exit(void) +{ + dm_unregister_target(&unstripe_target); +} + +module_init(dm_unstripe_init); +module_exit(dm_unstripe_exit); + +MODULE_DESCRIPTION(DM_NAME " unstriped target"); +MODULE_AUTHOR("Scott Bauer <scott.bauer@intel.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 70485de37b66..969954915566 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -2333,6 +2333,9 @@ static void dmz_cleanup_metadata(struct dmz_metadata *zmd) /* Free the zone descriptors */ dmz_drop_zones(zmd); + + mutex_destroy(&zmd->mblk_flush_lock); + mutex_destroy(&zmd->map_lock); } /* diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 6d7bda6f8190..caff02caf083 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -827,6 +827,7 @@ err_fwq: err_cwq: destroy_workqueue(dmz->chunk_wq); err_bio: + mutex_destroy(&dmz->chunk_lock); bioset_free(dmz->bio_set); err_meta: dmz_dtr_metadata(dmz->metadata); @@ -861,6 +862,8 @@ static void dmz_dtr(struct dm_target *ti) dmz_put_zoned_device(ti); + mutex_destroy(&dmz->chunk_lock); + kfree(dmz); } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index de17b7193299..d6de00f367ef 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -60,18 +60,73 @@ void dm_issue_global_event(void) } /* - * One of these is allocated per bio. + * One of these is allocated (on-stack) per original bio. */ +struct clone_info { + struct dm_table *map; + struct bio *bio; + struct dm_io *io; + sector_t sector; + unsigned sector_count; +}; + +/* + * One of these is allocated per clone bio. + */ +#define DM_TIO_MAGIC 7282014 +struct dm_target_io { + unsigned magic; + struct dm_io *io; + struct dm_target *ti; + unsigned target_bio_nr; + unsigned *len_ptr; + bool inside_dm_io; + struct bio clone; +}; + +/* + * One of these is allocated per original bio. + * It contains the first clone used for that original. 
+ */ +#define DM_IO_MAGIC 5191977 struct dm_io { + unsigned magic; struct mapped_device *md; blk_status_t status; atomic_t io_count; - struct bio *bio; + struct bio *orig_bio; unsigned long start_time; spinlock_t endio_lock; struct dm_stats_aux stats_aux; + /* last member of dm_target_io is 'struct bio' */ + struct dm_target_io tio; }; +void *dm_per_bio_data(struct bio *bio, size_t data_size) +{ + struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); + if (!tio->inside_dm_io) + return (char *)bio - offsetof(struct dm_target_io, clone) - data_size; + return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size; +} +EXPORT_SYMBOL_GPL(dm_per_bio_data); + +struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size) +{ + struct dm_io *io = (struct dm_io *)((char *)data + data_size); + if (io->magic == DM_IO_MAGIC) + return (struct bio *)((char *)io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone)); + BUG_ON(io->magic != DM_TIO_MAGIC); + return (struct bio *)((char *)io + offsetof(struct dm_target_io, clone)); +} +EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data); + +unsigned dm_bio_get_target_bio_nr(const struct bio *bio) +{ + return container_of(bio, struct dm_target_io, clone)->target_bio_nr; +} +EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr); + #define MINOR_ALLOCED ((void *)-1) /* @@ -93,8 +148,8 @@ static int dm_numa_node = DM_NUMA_NODE; * For mempools pre-allocation at the table loading time. */ struct dm_md_mempools { - mempool_t *io_pool; struct bio_set *bs; + struct bio_set *io_bs; }; struct table_device { @@ -103,7 +158,6 @@ struct table_device { struct dm_dev dm_dev; }; -static struct kmem_cache *_io_cache; static struct kmem_cache *_rq_tio_cache; static struct kmem_cache *_rq_cache; @@ -170,14 +224,9 @@ static int __init local_init(void) { int r = -ENOMEM; - /* allocate a slab for the dm_ios */ - _io_cache = KMEM_CACHE(dm_io, 0); - if (!_io_cache) - return r; - _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); if (!_rq_tio_cache) - goto out_free_io_cache; + return r; _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request), __alignof__(struct request), 0, NULL); @@ -212,8 +261,6 @@ out_free_rq_cache: kmem_cache_destroy(_rq_cache); out_free_rq_tio_cache: kmem_cache_destroy(_rq_tio_cache); -out_free_io_cache: - kmem_cache_destroy(_io_cache); return r; } @@ -225,7 +272,6 @@ static void local_exit(void) kmem_cache_destroy(_rq_cache); kmem_cache_destroy(_rq_tio_cache); - kmem_cache_destroy(_io_cache); unregister_blkdev(_major, _name); dm_uevent_exit(); @@ -486,18 +532,69 @@ out: return r; } -static struct dm_io *alloc_io(struct mapped_device *md) +static void start_io_acct(struct dm_io *io); + +static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) { - return mempool_alloc(md->io_pool, GFP_NOIO); + struct dm_io *io; + struct dm_target_io *tio; + struct bio *clone; + + clone = bio_alloc_bioset(GFP_NOIO, 0, md->io_bs); + if (!clone) + return NULL; + + tio = container_of(clone, struct dm_target_io, clone); + tio->inside_dm_io = true; + tio->io = NULL; + + io = container_of(tio, struct dm_io, tio); + io->magic = DM_IO_MAGIC; + io->status = 0; + atomic_set(&io->io_count, 1); + io->orig_bio = bio; + io->md = md; + spin_lock_init(&io->endio_lock); + + start_io_acct(io); + + return io; } static void free_io(struct mapped_device *md, struct dm_io *io) { - mempool_free(io, md->io_pool); + bio_put(&io->tio.clone); +} + +static struct dm_target_io *alloc_tio(struct clone_info *ci, struct 
dm_target *ti, + unsigned target_bio_nr, gfp_t gfp_mask) +{ + struct dm_target_io *tio; + + if (!ci->io->tio.io) { + /* the dm_target_io embedded in ci->io is available */ + tio = &ci->io->tio; + } else { + struct bio *clone = bio_alloc_bioset(gfp_mask, 0, ci->io->md->bs); + if (!clone) + return NULL; + + tio = container_of(clone, struct dm_target_io, clone); + tio->inside_dm_io = false; + } + + tio->magic = DM_TIO_MAGIC; + tio->io = ci->io; + tio->ti = ti; + tio->target_bio_nr = target_bio_nr; + + return tio; } static void free_tio(struct dm_target_io *tio) { + if (tio->inside_dm_io) + return; bio_put(&tio->clone); } @@ -510,17 +607,15 @@ int md_in_flight(struct mapped_device *md) static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; - struct bio *bio = io->bio; - int cpu; + struct bio *bio = io->orig_bio; int rw = bio_data_dir(bio); io->start_time = jiffies; - cpu = part_stat_lock(); - part_round_stats(md->queue, cpu, &dm_disk(md)->part0); - part_stat_unlock(); + generic_start_io_acct(md->queue, rw, bio_sectors(bio), &dm_disk(md)->part0); + atomic_set(&dm_disk(md)->part0.in_flight[rw], - atomic_inc_return(&md->pending[rw])); + atomic_inc_return(&md->pending[rw])); if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), @@ -531,7 +626,7 @@ static void start_io_acct(struct dm_io *io) static void end_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; - struct bio *bio = io->bio; + struct bio *bio = io->orig_bio; unsigned long duration = jiffies - io->start_time; int pending; int rw = bio_data_dir(bio); @@ -752,15 +847,6 @@ int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) return 0; } -/*----------------------------------------------------------------- - * CRUD START: - * A more elegant soln is in the works that uses the queue - * merge fn, unfortunately there are a couple of changes to - * the block layer that I want to make for this. So in the - * interests of getting something for people to use I give - * you this clearly demarcated crap. - *---------------------------------------------------------------*/ - static int __noflush_suspending(struct mapped_device *md) { return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); @@ -780,8 +866,7 @@ static void dec_pending(struct dm_io *io, blk_status_t error) /* Push-back supersedes any I/O errors */ if (unlikely(error)) { spin_lock_irqsave(&io->endio_lock, flags); - if (!(io->status == BLK_STS_DM_REQUEUE && - __noflush_suspending(md))) + if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md))) io->status = error; spin_unlock_irqrestore(&io->endio_lock, flags); } @@ -793,7 +878,8 @@ static void dec_pending(struct dm_io *io, blk_status_t error) */ spin_lock_irqsave(&md->deferred_lock, flags); if (__noflush_suspending(md)) - bio_list_add_head(&md->deferred, io->bio); + /* NOTE early return due to BLK_STS_DM_REQUEUE below */ + bio_list_add_head(&md->deferred, io->orig_bio); else /* noflush suspend was interrupted. 
*/ io->status = BLK_STS_IOERR; @@ -801,7 +887,7 @@ static void dec_pending(struct dm_io *io, blk_status_t error) } io_error = io->status; - bio = io->bio; + bio = io->orig_bio; end_io_acct(io); free_io(md, io); @@ -847,7 +933,7 @@ static void clone_endio(struct bio *bio) struct mapped_device *md = tio->io->md; dm_endio_fn endio = tio->ti->type->end_io; - if (unlikely(error == BLK_STS_TARGET)) { + if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) { if (bio_op(bio) == REQ_OP_WRITE_SAME && !bio->bi_disk->queue->limits.max_write_same_sectors) disable_write_same(md); @@ -920,7 +1006,15 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) return -EINVAL; } - ti->max_io_len = (uint32_t) len; + /* + * BIO based queue uses its own splitting. When multipage bvecs + * is switched on, size of the incoming bio may be too big to + * be handled in some targets, such as crypt. + * + * When these targets are ready for the big bio, we can remove + * the limit. + */ + ti->max_io_len = min_t(uint32_t, len, BIO_MAX_PAGES * PAGE_SIZE); return 0; } @@ -997,7 +1091,7 @@ static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, /* * A target may call dm_accept_partial_bio only from the map routine. It is - * allowed for all bio types except REQ_PREFLUSH. + * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET. * * dm_accept_partial_bio informs the dm that the target only wants to process * additional n_sectors sectors of the bio and the rest of the data should be @@ -1047,7 +1141,7 @@ void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start) { #ifdef CONFIG_BLK_DEV_ZONED struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); - struct bio *report_bio = tio->io->bio; + struct bio *report_bio = tio->io->orig_bio; struct blk_zone_report_hdr *hdr = NULL; struct blk_zone *zone; unsigned int nr_rep = 0; @@ -1114,67 +1208,15 @@ void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start) } EXPORT_SYMBOL_GPL(dm_remap_zone_report); -/* - * Flush current->bio_list when the target map method blocks. - * This fixes deadlocks in snapshot and possibly in other targets. 
- */ -struct dm_offload { - struct blk_plug plug; - struct blk_plug_cb cb; -}; - -static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule) -{ - struct dm_offload *o = container_of(cb, struct dm_offload, cb); - struct bio_list list; - struct bio *bio; - int i; - - INIT_LIST_HEAD(&o->cb.list); - - if (unlikely(!current->bio_list)) - return; - - for (i = 0; i < 2; i++) { - list = current->bio_list[i]; - bio_list_init(¤t->bio_list[i]); - - while ((bio = bio_list_pop(&list))) { - struct bio_set *bs = bio->bi_pool; - if (unlikely(!bs) || bs == fs_bio_set || - !bs->rescue_workqueue) { - bio_list_add(¤t->bio_list[i], bio); - continue; - } - - spin_lock(&bs->rescue_lock); - bio_list_add(&bs->rescue_list, bio); - queue_work(bs->rescue_workqueue, &bs->rescue_work); - spin_unlock(&bs->rescue_lock); - } - } -} - -static void dm_offload_start(struct dm_offload *o) -{ - blk_start_plug(&o->plug); - o->cb.callback = flush_current_bio_list; - list_add(&o->cb.list, ¤t->plug->cb_list); -} - -static void dm_offload_end(struct dm_offload *o) -{ - list_del(&o->cb.list); - blk_finish_plug(&o->plug); -} - -static void __map_bio(struct dm_target_io *tio) +static blk_qc_t __map_bio(struct dm_target_io *tio) { int r; sector_t sector; - struct dm_offload o; struct bio *clone = &tio->clone; + struct dm_io *io = tio->io; + struct mapped_device *md = io->md; struct dm_target *ti = tio->ti; + blk_qc_t ret = BLK_QC_T_NONE; clone->bi_end_io = clone_endio; @@ -1183,44 +1225,37 @@ static void __map_bio(struct dm_target_io *tio) * anything, the target has assumed ownership of * this io. */ - atomic_inc(&tio->io->io_count); + atomic_inc(&io->io_count); sector = clone->bi_iter.bi_sector; - dm_offload_start(&o); r = ti->type->map(ti, clone); - dm_offload_end(&o); - switch (r) { case DM_MAPIO_SUBMITTED: break; case DM_MAPIO_REMAPPED: /* the bio has been remapped so dispatch it */ trace_block_bio_remap(clone->bi_disk->queue, clone, - bio_dev(tio->io->bio), sector); - generic_make_request(clone); + bio_dev(io->orig_bio), sector); + if (md->type == DM_TYPE_NVME_BIO_BASED) + ret = direct_make_request(clone); + else + ret = generic_make_request(clone); break; case DM_MAPIO_KILL: - dec_pending(tio->io, BLK_STS_IOERR); free_tio(tio); + dec_pending(io, BLK_STS_IOERR); break; case DM_MAPIO_REQUEUE: - dec_pending(tio->io, BLK_STS_DM_REQUEUE); free_tio(tio); + dec_pending(io, BLK_STS_DM_REQUEUE); break; default: DMWARN("unimplemented target map return value: %d", r); BUG(); } -} -struct clone_info { - struct mapped_device *md; - struct dm_table *map; - struct bio *bio; - struct dm_io *io; - sector_t sector; - unsigned sector_count; -}; + return ret; +} static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) { @@ -1264,28 +1299,49 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio, return 0; } -static struct dm_target_io *alloc_tio(struct clone_info *ci, - struct dm_target *ti, - unsigned target_bio_nr) +static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, + struct dm_target *ti, unsigned num_bios) { struct dm_target_io *tio; - struct bio *clone; + int try; - clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); - tio = container_of(clone, struct dm_target_io, clone); + if (!num_bios) + return; - tio->io = ci->io; - tio->ti = ti; - tio->target_bio_nr = target_bio_nr; + if (num_bios == 1) { + tio = alloc_tio(ci, ti, 0, GFP_NOIO); + bio_list_add(blist, &tio->clone); + return; + } - return tio; + for (try = 0; try < 2; try++) { + int bio_nr; + struct bio *bio; + + 
if (try) + mutex_lock(&ci->io->md->table_devices_lock); + for (bio_nr = 0; bio_nr < num_bios; bio_nr++) { + tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT); + if (!tio) + break; + + bio_list_add(blist, &tio->clone); + } + if (try) + mutex_unlock(&ci->io->md->table_devices_lock); + if (bio_nr == num_bios) + return; + + while ((bio = bio_list_pop(blist))) { + tio = container_of(bio, struct dm_target_io, clone); + free_tio(tio); + } + } } -static void __clone_and_map_simple_bio(struct clone_info *ci, - struct dm_target *ti, - unsigned target_bio_nr, unsigned *len) +static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci, + struct dm_target_io *tio, unsigned *len) { - struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr); struct bio *clone = &tio->clone; tio->len_ptr = len; @@ -1294,16 +1350,22 @@ static void __clone_and_map_simple_bio(struct clone_info *ci, if (len) bio_setup_sector(clone, ci->sector, *len); - __map_bio(tio); + return __map_bio(tio); } static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, unsigned num_bios, unsigned *len) { - unsigned target_bio_nr; + struct bio_list blist = BIO_EMPTY_LIST; + struct bio *bio; + struct dm_target_io *tio; + + alloc_multiple_bios(&blist, ci, ti, num_bios); - for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++) - __clone_and_map_simple_bio(ci, ti, target_bio_nr, len); + while ((bio = bio_list_pop(&blist))) { + tio = container_of(bio, struct dm_target_io, clone); + (void) __clone_and_map_simple_bio(ci, tio, len); + } } static int __send_empty_flush(struct clone_info *ci) @@ -1319,32 +1381,22 @@ static int __send_empty_flush(struct clone_info *ci) } static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, - sector_t sector, unsigned *len) + sector_t sector, unsigned *len) { struct bio *bio = ci->bio; struct dm_target_io *tio; - unsigned target_bio_nr; - unsigned num_target_bios = 1; - int r = 0; + int r; - /* - * Does the target want to receive duplicate copies of the bio? - */ - if (bio_data_dir(bio) == WRITE && ti->num_write_bios) - num_target_bios = ti->num_write_bios(ti, bio); - - for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { - tio = alloc_tio(ci, ti, target_bio_nr); - tio->len_ptr = len; - r = clone_bio(tio, bio, sector, *len); - if (r < 0) { - free_tio(tio); - break; - } - __map_bio(tio); + tio = alloc_tio(ci, ti, 0, GFP_NOIO); + tio->len_ptr = len; + r = clone_bio(tio, bio, sector, *len); + if (r < 0) { + free_tio(tio); + return r; } + (void) __map_bio(tio); - return r; + return 0; } typedef unsigned (*get_num_bios_fn)(struct dm_target *ti); @@ -1371,56 +1423,50 @@ static bool is_split_required_for_discard(struct dm_target *ti) return ti->split_discard_bios; } -static int __send_changing_extent_only(struct clone_info *ci, +static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti, get_num_bios_fn get_num_bios, is_split_required_fn is_split_required) { - struct dm_target *ti; unsigned len; unsigned num_bios; - do { - ti = dm_table_find_target(ci->map, ci->sector); - if (!dm_target_is_valid(ti)) - return -EIO; - - /* - * Even though the device advertised support for this type of - * request, that does not mean every target supports it, and - * reconfiguration might also have changed that since the - * check was performed. - */ - num_bios = get_num_bios ? 
get_num_bios(ti) : 0; - if (!num_bios) - return -EOPNOTSUPP; + /* + * Even though the device advertised support for this type of + * request, that does not mean every target supports it, and + * reconfiguration might also have changed that since the + * check was performed. + */ + num_bios = get_num_bios ? get_num_bios(ti) : 0; + if (!num_bios) + return -EOPNOTSUPP; - if (is_split_required && !is_split_required(ti)) - len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); - else - len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); + if (is_split_required && !is_split_required(ti)) + len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); + else + len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); - __send_duplicate_bios(ci, ti, num_bios, &len); + __send_duplicate_bios(ci, ti, num_bios, &len); - ci->sector += len; - } while (ci->sector_count -= len); + ci->sector += len; + ci->sector_count -= len; return 0; } -static int __send_discard(struct clone_info *ci) +static int __send_discard(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, get_num_discard_bios, + return __send_changing_extent_only(ci, ti, get_num_discard_bios, is_split_required_for_discard); } -static int __send_write_same(struct clone_info *ci) +static int __send_write_same(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, get_num_write_same_bios, NULL); + return __send_changing_extent_only(ci, ti, get_num_write_same_bios, NULL); } -static int __send_write_zeroes(struct clone_info *ci) +static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, get_num_write_zeroes_bios, NULL); + return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios, NULL); } /* @@ -1433,17 +1479,17 @@ static int __split_and_process_non_flush(struct clone_info *ci) unsigned len; int r; - if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) - return __send_discard(ci); - else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) - return __send_write_same(ci); - else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES)) - return __send_write_zeroes(ci); - ti = dm_table_find_target(ci->map, ci->sector); if (!dm_target_is_valid(ti)) return -EIO; + if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) + return __send_discard(ci, ti); + else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) + return __send_write_same(ci, ti); + else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES)) + return __send_write_zeroes(ci, ti); + if (bio_op(bio) == REQ_OP_ZONE_REPORT) len = ci->sector_count; else @@ -1460,34 +1506,33 @@ static int __split_and_process_non_flush(struct clone_info *ci) return 0; } +static void init_clone_info(struct clone_info *ci, struct mapped_device *md, + struct dm_table *map, struct bio *bio) +{ + ci->map = map; + ci->io = alloc_io(md, bio); + ci->sector = bio->bi_iter.bi_sector; +} + /* * Entry point to split a bio into clones and submit them to the targets. 
*/ -static void __split_and_process_bio(struct mapped_device *md, - struct dm_table *map, struct bio *bio) +static blk_qc_t __split_and_process_bio(struct mapped_device *md, + struct dm_table *map, struct bio *bio) { struct clone_info ci; + blk_qc_t ret = BLK_QC_T_NONE; int error = 0; if (unlikely(!map)) { bio_io_error(bio); - return; + return ret; } - ci.map = map; - ci.md = md; - ci.io = alloc_io(md); - ci.io->status = 0; - atomic_set(&ci.io->io_count, 1); - ci.io->bio = bio; - ci.io->md = md; - spin_lock_init(&ci.io->endio_lock); - ci.sector = bio->bi_iter.bi_sector; - - start_io_acct(ci.io); + init_clone_info(&ci, md, map, bio); if (bio->bi_opf & REQ_PREFLUSH) { - ci.bio = &ci.md->flush_bio; + ci.bio = &ci.io->md->flush_bio; ci.sector_count = 0; error = __send_empty_flush(&ci); /* dec_pending submits any data associated with flush */ @@ -1498,32 +1543,95 @@ static void __split_and_process_bio(struct mapped_device *md, } else { ci.bio = bio; ci.sector_count = bio_sectors(bio); - while (ci.sector_count && !error) + while (ci.sector_count && !error) { error = __split_and_process_non_flush(&ci); + if (current->bio_list && ci.sector_count && !error) { + /* + * Remainder must be passed to generic_make_request() + * so that it gets handled *after* bios already submitted + * have been completely processed. + * We take a clone of the original to store in + * ci.io->orig_bio to be used by end_io_acct() and + * for dec_pending to use for completion handling. + * As this path is not used for REQ_OP_ZONE_REPORT, + * the usage of io->orig_bio in dm_remap_zone_report() + * won't be affected by this reassignment. + */ + struct bio *b = bio_clone_bioset(bio, GFP_NOIO, + md->queue->bio_split); + ci.io->orig_bio = b; + bio_advance(bio, (bio_sectors(bio) - ci.sector_count) << 9); + bio_chain(b, bio); + ret = generic_make_request(bio); + break; + } + } } /* drop the extra reference count */ dec_pending(ci.io, errno_to_blk_status(error)); + return ret; } -/*----------------------------------------------------------------- - * CRUD END - *---------------------------------------------------------------*/ /* - * The request function that just remaps the bio built up by - * dm_merge_bvec. + * Optimized variant of __split_and_process_bio that leverages the + * fact that targets that use it do _not_ have a need to split bios. 
*/ -static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t __process_bio(struct mapped_device *md, + struct dm_table *map, struct bio *bio) +{ + struct clone_info ci; + blk_qc_t ret = BLK_QC_T_NONE; + int error = 0; + + if (unlikely(!map)) { + bio_io_error(bio); + return ret; + } + + init_clone_info(&ci, md, map, bio); + + if (bio->bi_opf & REQ_PREFLUSH) { + ci.bio = &ci.io->md->flush_bio; + ci.sector_count = 0; + error = __send_empty_flush(&ci); + /* dec_pending submits any data associated with flush */ + } else { + struct dm_target *ti = md->immutable_target; + struct dm_target_io *tio; + + /* + * Defend against IO still getting in during teardown + * - as was seen for a time with nvme-fcloop + */ + if (unlikely(WARN_ON_ONCE(!ti || !dm_target_is_valid(ti)))) { + error = -EIO; + goto out; + } + + tio = alloc_tio(&ci, ti, 0, GFP_NOIO); + ci.bio = bio; + ci.sector_count = bio_sectors(bio); + ret = __clone_and_map_simple_bio(&ci, tio, NULL); + } +out: + /* drop the extra reference count */ + dec_pending(ci.io, errno_to_blk_status(error)); + return ret; +} + +typedef blk_qc_t (process_bio_fn)(struct mapped_device *, struct dm_table *, struct bio *); + +static blk_qc_t __dm_make_request(struct request_queue *q, struct bio *bio, + process_bio_fn process_bio) { - int rw = bio_data_dir(bio); struct mapped_device *md = q->queuedata; + blk_qc_t ret = BLK_QC_T_NONE; int srcu_idx; struct dm_table *map; map = dm_get_live_table(md, &srcu_idx); - generic_start_io_acct(q, rw, bio_sectors(bio), &dm_disk(md)->part0); - /* if we're suspended, we have to queue this io for later */ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { dm_put_live_table(md, srcu_idx); @@ -1532,12 +1640,27 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) queue_io(md, bio); else bio_io_error(bio); - return BLK_QC_T_NONE; + return ret; } - __split_and_process_bio(md, map, bio); + ret = process_bio(md, map, bio); + dm_put_live_table(md, srcu_idx); - return BLK_QC_T_NONE; + return ret; +} + +/* + * The request function that remaps the bio to one target and + * splits off any remainder. 
+ */ +static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) +{ + return __dm_make_request(q, bio, __split_and_process_bio); +} + +static blk_qc_t dm_make_request_nvme(struct request_queue *q, struct bio *bio) +{ + return __dm_make_request(q, bio, __process_bio); } static int dm_any_congested(void *congested_data, int bdi_bits) @@ -1618,20 +1741,9 @@ static const struct dax_operations dm_dax_ops; static void dm_wq_work(struct work_struct *work); -void dm_init_md_queue(struct mapped_device *md) -{ - /* - * Initialize data that will only be used by a non-blk-mq DM queue - * - must do so here (in alloc_dev callchain) before queue is used - */ - md->queue->queuedata = md; - md->queue->backing_dev_info->congested_data = md; -} - -void dm_init_normal_md_queue(struct mapped_device *md) +static void dm_init_normal_md_queue(struct mapped_device *md) { md->use_blk_mq = false; - dm_init_md_queue(md); /* * Initialize aspects of queue that aren't relevant for blk-mq @@ -1645,9 +1757,10 @@ static void cleanup_mapped_device(struct mapped_device *md) destroy_workqueue(md->wq); if (md->kworker_task) kthread_stop(md->kworker_task); - mempool_destroy(md->io_pool); if (md->bs) bioset_free(md->bs); + if (md->io_bs) + bioset_free(md->io_bs); if (md->dax_dev) { kill_dax(md->dax_dev); @@ -1673,6 +1786,10 @@ static void cleanup_mapped_device(struct mapped_device *md) md->bdev = NULL; } + mutex_destroy(&md->suspend_lock); + mutex_destroy(&md->type_lock); + mutex_destroy(&md->table_devices_lock); + dm_mq_cleanup_mapped_device(md); } @@ -1726,10 +1843,10 @@ static struct mapped_device *alloc_dev(int minor) md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id); if (!md->queue) goto bad; + md->queue->queuedata = md; + md->queue->backing_dev_info->congested_data = md; - dm_init_md_queue(md); - - md->disk = alloc_disk_node(1, numa_node_id); + md->disk = alloc_disk_node(1, md->numa_node_id); if (!md->disk) goto bad; @@ -1753,7 +1870,7 @@ static struct mapped_device *alloc_dev(int minor) goto bad; md->dax_dev = dax_dev; - add_disk(md->disk); + add_disk_no_queue_reg(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); @@ -1812,17 +1929,22 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t) { struct dm_md_mempools *p = dm_table_get_md_mempools(t); - if (md->bs) { - /* The md already has necessary mempools. */ - if (dm_table_bio_based(t)) { - /* - * Reload bioset because front_pad may have changed - * because a different table was loaded. - */ + if (dm_table_bio_based(t)) { + /* + * The md may already have mempools that need changing. + * If so, reload bioset because front_pad may have changed + * because a different table was loaded. + */ + if (md->bs) { bioset_free(md->bs); - md->bs = p->bs; - p->bs = NULL; + md->bs = NULL; } + if (md->io_bs) { + bioset_free(md->io_bs); + md->io_bs = NULL; + } + + } else if (md->bs) { /* * There's no need to reload with request-based dm * because the size of front_pad doesn't change. 
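The context line above is the crux of __bind_mempools(): for bio-based tables the per-bio data is carved out of the bio_set front padding, ahead of the embedded dm_target_io (and, for md->io_bs, ahead of the enclosing dm_io that carries the first clone), so a table whose targets want a different per_io_data_size needs freshly sized bio_sets, while request-based tables can keep theirs. Below is a minimal user-space sketch of that layout and of the pointer walk the new dm_per_bio_data() performs; the structs are trimmed stand-ins and the 24-byte payload is an assumed example, only the offset arithmetic follows the patch.

/*
 * User-space sketch of the front-pad layout behind the new dm_per_bio_data()
 * and the md->bs/md->io_bs pair.  The structs are trimmed stand-ins, not the
 * kernel definitions; only the offset arithmetic mirrors the patch.
 */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct bio { int placeholder; };		/* stand-in for struct bio */

struct dm_target_io {				/* trimmed: one per clone bio */
	unsigned magic;
	int inside_dm_io;
	struct bio clone;			/* must remain the last member */
};

struct dm_io {					/* trimmed: one per original bio */
	unsigned magic;
	struct dm_target_io tio;		/* embedded first clone */
};

/* Same pointer walk as the new dm_per_bio_data() in the patch. */
static void *per_bio_data(struct bio *bio, size_t data_size)
{
	struct dm_target_io *tio =
		(struct dm_target_io *)((char *)bio - offsetof(struct dm_target_io, clone));

	if (!tio->inside_dm_io)
		return (char *)bio - offsetof(struct dm_target_io, clone) - data_size;
	return (char *)bio - offsetof(struct dm_target_io, clone)
			   - offsetof(struct dm_io, tio) - data_size;
}

static size_t round_up(size_t n, size_t align)	/* align must be a power of two */
{
	return (n + align - 1) & ~(align - 1);
}

int main(void)
{
	/* example value; in dm it comes from the largest ti->per_io_data_size */
	size_t per_io_data_size = 24;

	/* front_pad of md->bs: per-bio data, then a dm_target_io up to .clone */
	size_t front_pad = round_up(per_io_data_size, _Alignof(struct dm_target_io))
			   + offsetof(struct dm_target_io, clone);
	/* io_front_pad of md->io_bs additionally wraps a dm_io around that tio */
	size_t io_front_pad = round_up(front_pad, _Alignof(struct dm_io))
			      + offsetof(struct dm_io, tio);

	/* simplified model of one md->io_bs allocation: data, then the dm_io */
	struct {
		char data[24];
		struct dm_io io;
	} blob = { .io.tio.inside_dm_io = 1 };

	struct bio *clone = &blob.io.tio.clone;

	/* walking back from the clone bio lands on the reserved per-bio data */
	assert(per_bio_data(clone, sizeof(blob.data)) == (void *)blob.data);

	printf("front_pad=%zu io_front_pad=%zu\n", front_pad, io_front_pad);
	return 0;
}

A larger per_io_data_size grows both front_pad and io_front_pad, which is why the bio-based branch above frees md->bs and md->io_bs on a table reload instead of reusing them.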
@@ -1834,13 +1956,12 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t) goto out; } - BUG_ON(!p || md->io_pool || md->bs); + BUG_ON(!p || md->bs || md->io_bs); - md->io_pool = p->io_pool; - p->io_pool = NULL; md->bs = p->bs; p->bs = NULL; - + md->io_bs = p->io_bs; + p->io_bs = NULL; out: /* mempool bind completed, no longer need any mempools in the table */ dm_table_free_md_mempools(t); @@ -1886,6 +2007,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, { struct dm_table *old_map; struct request_queue *q = md->queue; + bool request_based = dm_table_request_based(t); sector_t size; lockdep_assert_held(&md->suspend_lock); @@ -1909,12 +2031,15 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, * This must be done before setting the queue restrictions, * because request-based dm may be run just after the setting. */ - if (dm_table_request_based(t)) { + if (request_based) dm_stop_queue(q); + + if (request_based || md->type == DM_TYPE_NVME_BIO_BASED) { /* - * Leverage the fact that request-based DM targets are - * immutable singletons and establish md->immutable_target - * - used to optimize both dm_request_fn and dm_mq_queue_rq + * Leverage the fact that request-based DM targets and + * NVMe bio based targets are immutable singletons + * - used to optimize both dm_request_fn and dm_mq_queue_rq; + * and __process_bio. */ md->immutable_target = dm_table_get_immutable_target(t); } @@ -1954,13 +2079,18 @@ static struct dm_table *__unbind(struct mapped_device *md) */ int dm_create(int minor, struct mapped_device **result) { + int r; struct mapped_device *md; md = alloc_dev(minor); if (!md) return -ENXIO; - dm_sysfs_init(md); + r = dm_sysfs_init(md); + if (r) { + free_dev(md); + return r; + } *result = md; return 0; @@ -2013,10 +2143,12 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits); int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) { int r; + struct queue_limits limits; enum dm_queue_mode type = dm_get_md_type(md); switch (type) { case DM_TYPE_REQUEST_BASED: + dm_init_normal_md_queue(md); r = dm_old_init_request_queue(md, t); if (r) { DMERR("Cannot initialize queue for request-based mapped device"); @@ -2034,21 +2166,24 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) case DM_TYPE_DAX_BIO_BASED: dm_init_normal_md_queue(md); blk_queue_make_request(md->queue, dm_make_request); - /* - * DM handles splitting bios as needed. Free the bio_split bioset - * since it won't be used (saves 1 process per bio-based DM device). 
- */ - bioset_free(md->queue->bio_split); - md->queue->bio_split = NULL; - - if (type == DM_TYPE_DAX_BIO_BASED) - queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue); + break; + case DM_TYPE_NVME_BIO_BASED: + dm_init_normal_md_queue(md); + blk_queue_make_request(md->queue, dm_make_request_nvme); break; case DM_TYPE_NONE: WARN_ON_ONCE(true); break; } + r = dm_calculate_queue_limits(t, &limits); + if (r) { + DMERR("Cannot calculate initial queue limits"); + return r; + } + dm_table_set_restrictions(t, md->queue, &limits); + blk_register_queue(md->disk); + return 0; } @@ -2113,7 +2248,6 @@ EXPORT_SYMBOL_GPL(dm_device_name); static void __dm_destroy(struct mapped_device *md, bool wait) { - struct request_queue *q = dm_get_md_queue(md); struct dm_table *map; int srcu_idx; @@ -2124,7 +2258,7 @@ static void __dm_destroy(struct mapped_device *md, bool wait) set_bit(DMF_FREEING, &md->flags); spin_unlock(&_minor_lock); - blk_set_queue_dying(q); + blk_set_queue_dying(md->queue); if (dm_request_based(md) && md->kworker_task) kthread_flush_worker(&md->kworker); @@ -2735,11 +2869,12 @@ int dm_noflush_suspending(struct dm_target *ti) EXPORT_SYMBOL_GPL(dm_noflush_suspending); struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, - unsigned integrity, unsigned per_io_data_size) + unsigned integrity, unsigned per_io_data_size, + unsigned min_pool_size) { struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); unsigned int pool_size = 0; - unsigned int front_pad; + unsigned int front_pad, io_front_pad; if (!pools) return NULL; @@ -2747,16 +2882,19 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu switch (type) { case DM_TYPE_BIO_BASED: case DM_TYPE_DAX_BIO_BASED: - pool_size = dm_get_reserved_bio_based_ios(); + case DM_TYPE_NVME_BIO_BASED: + pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); - - pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache); - if (!pools->io_pool) + io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio); + pools->io_bs = bioset_create(pool_size, io_front_pad, 0); + if (!pools->io_bs) + goto out; + if (integrity && bioset_integrity_create(pools->io_bs, pool_size)) goto out; break; case DM_TYPE_REQUEST_BASED: case DM_TYPE_MQ_REQUEST_BASED: - pool_size = dm_get_reserved_rq_based_ios(); + pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size); front_pad = offsetof(struct dm_rq_clone_bio_info, clone); /* per_io_data_size is used for blk-mq pdu at queue allocation */ break; @@ -2764,7 +2902,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu BUG(); } - pools->bs = bioset_create(pool_size, front_pad, BIOSET_NEED_RESCUER); + pools->bs = bioset_create(pool_size, front_pad, 0); if (!pools->bs) goto out; @@ -2784,10 +2922,10 @@ void dm_free_md_mempools(struct dm_md_mempools *pools) if (!pools) return; - mempool_destroy(pools->io_pool); - if (pools->bs) bioset_free(pools->bs); + if (pools->io_bs) + bioset_free(pools->io_bs); kfree(pools); } diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 36399bb875dd..114a81b27c37 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -49,7 +49,6 @@ struct dm_md_mempools; /*----------------------------------------------------------------- * Internal table functions. 
*---------------------------------------------------------------*/ -void dm_table_destroy(struct dm_table *t); void dm_table_event_callback(struct dm_table *t, void (*fn)(void *), void *context); struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); @@ -206,7 +205,8 @@ void dm_kcopyd_exit(void); * Mempool operations */ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, - unsigned integrity, unsigned per_bio_data_size); + unsigned integrity, unsigned per_bio_data_size, + unsigned min_pool_size); void dm_free_md_mempools(struct dm_md_mempools *pools); /* diff --git a/drivers/md/md.c b/drivers/md/md.c index 4e4dee0ec2de..bc67ab6844f0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -711,7 +711,7 @@ static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) return NULL; } -static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev) +struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) { struct md_rdev *rdev; @@ -721,6 +721,7 @@ static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev) return NULL; } +EXPORT_SYMBOL_GPL(md_find_rdev_rcu); static struct md_personality *find_pers(int level, char *clevel) { @@ -5560,11 +5561,6 @@ int md_run(struct mddev *mddev) if (start_readonly && mddev->ro == 0) mddev->ro = 2; /* read-only, but switch on first write */ - /* - * NOTE: some pers->run(), for example r5l_recovery_log(), wakes - * up mddev->thread. It is important to initialize critical - * resources for mddev->thread BEFORE calling pers->run(). - */ err = pers->run(mddev); if (err) pr_warn("md: pers->run() failed ...\n"); @@ -5678,6 +5674,9 @@ static int do_md_run(struct mddev *mddev) if (mddev_is_clustered(mddev)) md_allow_write(mddev); + /* run start up tasks that require md_thread */ + md_start(mddev); + md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ @@ -5689,6 +5688,21 @@ out: return err; } +int md_start(struct mddev *mddev) +{ + int ret = 0; + + if (mddev->pers->start) { + set_bit(MD_RECOVERY_WAIT, &mddev->recovery); + md_wakeup_thread(mddev->thread); + ret = mddev->pers->start(mddev); + clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); + md_wakeup_thread(mddev->sync_thread); + } + return ret; +} +EXPORT_SYMBOL_GPL(md_start); + static int restart_array(struct mddev *mddev) { struct gendisk *disk = mddev->gendisk; @@ -6997,7 +7011,7 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev) return -ENODEV; rcu_read_lock(); - rdev = find_rdev_rcu(mddev, dev); + rdev = md_find_rdev_rcu(mddev, dev); if (!rdev) err = -ENODEV; else { @@ -7871,20 +7885,20 @@ static int md_seq_open(struct inode *inode, struct file *file) } static int md_unloading; -static unsigned int mdstat_poll(struct file *filp, poll_table *wait) +static __poll_t mdstat_poll(struct file *filp, poll_table *wait) { struct seq_file *seq = filp->private_data; - int mask; + __poll_t mask; if (md_unloading) - return POLLIN|POLLRDNORM|POLLERR|POLLPRI; + return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; poll_wait(filp, &md_event_waiters, wait); /* always allow read */ - mask = POLLIN | POLLRDNORM; + mask = EPOLLIN | EPOLLRDNORM; if (seq->poll_event != atomic_read(&md_event_count)) - mask |= POLLERR | POLLPRI; + mask |= EPOLLERR | EPOLLPRI; return mask; } @@ -8169,7 +8183,8 @@ void md_do_sync(struct md_thread *thread) int ret; /* just incase thread restarts... 
*/ - if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) + if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) || + test_bit(MD_RECOVERY_WAIT, &mddev->recovery)) return; if (mddev->ro) {/* never try to sync a read-only array */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); diff --git a/drivers/md/md.h b/drivers/md/md.h index 7d6bcf0eba0c..58cd20a5e85e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -485,6 +485,7 @@ enum recovery_flags { MD_RECOVERY_RESHAPE, /* A reshape is happening */ MD_RECOVERY_FROZEN, /* User request to abort, and not restart, any action */ MD_RECOVERY_ERROR, /* sync-action interrupted because io-error */ + MD_RECOVERY_WAIT, /* waiting for pers->start() to finish */ }; static inline int __must_check mddev_lock(struct mddev *mddev) @@ -523,7 +524,13 @@ struct md_personality struct list_head list; struct module *owner; bool (*make_request)(struct mddev *mddev, struct bio *bio); + /* + * start up works that do NOT require md_thread. tasks that + * requires md_thread should go into start() + */ int (*run)(struct mddev *mddev); + /* start up works that require md threads */ + int (*start)(struct mddev *mddev); void (*free)(struct mddev *mddev, void *priv); void (*status)(struct seq_file *seq, struct mddev *mddev); /* error_handler must set ->faulty and clear ->in_sync @@ -687,6 +694,7 @@ extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern void mddev_init(struct mddev *mddev); extern int md_run(struct mddev *mddev); +extern int md_start(struct mddev *mddev); extern void md_stop(struct mddev *mddev); extern void md_stop_writes(struct mddev *mddev); extern int md_rdev_init(struct md_rdev *rdev); @@ -702,6 +710,7 @@ extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); +struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) { diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index f21ce6a3d4cf..58b319757b1e 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -683,23 +683,8 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key) pn->keys[1] = rn->keys[0]; memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64)); - /* - * rejig the spine. This is ugly, since it knows too - * much about the spine - */ - if (s->nodes[0] != new_parent) { - unlock_block(s->info, s->nodes[0]); - s->nodes[0] = new_parent; - } - if (key < le64_to_cpu(rn->keys[0])) { - unlock_block(s->info, right); - s->nodes[1] = left; - } else { - unlock_block(s->info, left); - s->nodes[1] = right; - } - s->count = 2; - + unlock_block(s->info, left); + unlock_block(s->info, right); return 0; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 6df398e3a008..b2eae332e1a2 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -815,6 +815,17 @@ static void flush_pending_writes(struct r1conf *conf) bio = bio_list_get(&conf->pending_bio_list); conf->pending_count = 0; spin_unlock_irq(&conf->device_lock); + + /* + * As this is called in a wait_event() loop (see freeze_array), + * current->state might be TASK_UNINTERRUPTIBLE which will + * cause a warning when we prepare to wait again. 
As it is + * rare that this path is taken, it is perfectly safe to force + * us to go around the wait_event() loop again, so the warning + * is a false-positive. Silence the warning by resetting + * thread state + */ + __set_current_state(TASK_RUNNING); blk_start_plug(&plug); flush_bio_list(conf, bio); blk_finish_plug(&plug); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c131835cf008..99c9207899a7 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -900,6 +900,18 @@ static void flush_pending_writes(struct r10conf *conf) bio = bio_list_get(&conf->pending_bio_list); conf->pending_count = 0; spin_unlock_irq(&conf->device_lock); + + /* + * As this is called in a wait_event() loop (see freeze_array), + * current->state might be TASK_UNINTERRUPTIBLE which will + * cause a warning when we prepare to wait again. As it is + * rare that this path is taken, it is perfectly safe to force + * us to go around the wait_event() loop again, so the warning + * is a false-positive. Silence the warning by resetting + * thread state + */ + __set_current_state(TASK_RUNNING); + blk_start_plug(&plug); /* flush any pending bitmap writes to disk * before proceeding w/ I/O */ diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 39f31f07ffe9..3c65f52b68f5 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1111,9 +1111,6 @@ void r5l_write_stripe_run(struct r5l_log *log) int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) { - if (!log) - return -ENODEV; - if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { /* * in write through (journal only) @@ -1592,8 +1589,6 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space) void r5l_quiesce(struct r5l_log *log, int quiesce) { struct mddev *mddev; - if (!log) - return; if (quiesce) { /* make sure r5l_write_super_and_discard_space exits */ @@ -2448,7 +2443,6 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, raid5_release_stripe(sh); } - md_wakeup_thread(conf->mddev->thread); /* reuse conf->wait_for_quiescent in recovery */ wait_event(conf->wait_for_quiescent, atomic_read(&conf->active_stripes) == 0); @@ -2491,10 +2485,10 @@ static int r5l_recovery_log(struct r5l_log *log) ctx->seq += 10000; if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0)) - pr_debug("md/raid:%s: starting from clean shutdown\n", + pr_info("md/raid:%s: starting from clean shutdown\n", mdname(mddev)); else - pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", + pr_info("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", mdname(mddev), ctx->data_only_stripes, ctx->data_parity_stripes); @@ -3036,6 +3030,23 @@ ioerr: return ret; } +int r5l_start(struct r5l_log *log) +{ + int ret; + + if (!log) + return 0; + + ret = r5l_load_log(log); + if (ret) { + struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + + r5l_exit_log(conf); + } + return ret; +} + void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev) { struct r5conf *conf = mddev->private; @@ -3138,13 +3149,9 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) rcu_assign_pointer(conf->log, log); - if (r5l_load_log(log)) - goto error; - set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); return 0; -error: rcu_assign_pointer(conf->log, NULL); md_unregister_thread(&log->reclaim_thread); reclaim_thread: diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h index 284578b0a349..0c76bcedfc1c 100644 --- 
a/drivers/md/raid5-log.h +++ b/drivers/md/raid5-log.h @@ -32,6 +32,7 @@ extern struct md_sysfs_entry r5c_journal_mode; extern void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev); extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); +extern int r5l_start(struct r5l_log *log); extern struct dma_async_tx_descriptor * ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu, @@ -42,6 +43,7 @@ extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh); extern void ppl_write_stripe_run(struct r5conf *conf); extern void ppl_stripe_write_finished(struct stripe_head *sh); extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add); +extern void ppl_quiesce(struct r5conf *conf, int quiesce); static inline bool raid5_has_ppl(struct r5conf *conf) { @@ -87,6 +89,34 @@ static inline void log_write_stripe_run(struct r5conf *conf) ppl_write_stripe_run(conf); } +static inline void log_flush_stripe_to_raid(struct r5conf *conf) +{ + if (conf->log) + r5l_flush_stripe_to_raid(conf->log); + else if (raid5_has_ppl(conf)) + ppl_write_stripe_run(conf); +} + +static inline int log_handle_flush_request(struct r5conf *conf, struct bio *bio) +{ + int ret = -ENODEV; + + if (conf->log) + ret = r5l_handle_flush_request(conf->log, bio); + else if (raid5_has_ppl(conf)) + ret = 0; + + return ret; +} + +static inline void log_quiesce(struct r5conf *conf, int quiesce) +{ + if (conf->log) + r5l_quiesce(conf->log, quiesce); + else if (raid5_has_ppl(conf)) + ppl_quiesce(conf, quiesce); +} + static inline void log_exit(struct r5conf *conf) { if (conf->log) diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 628c0bf7b9fd..2764c2290062 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -85,6 +85,9 @@ * (for a single member disk). New io_units are added to the end of the list * and the first io_unit is submitted, if it is not submitted already. * The current io_unit accepting new stripes is always at the end of the list. + * + * If write-back cache is enabled for any of the disks in the array, its data + * must be flushed before next io_unit is submitted. 
*/ #define PPL_SPACE_SIZE (128 * 1024) @@ -104,6 +107,7 @@ struct ppl_conf { struct kmem_cache *io_kc; mempool_t *io_pool; struct bio_set *bs; + struct bio_set *flush_bs; /* used only for recovery */ int recovered_entries; @@ -128,6 +132,8 @@ struct ppl_log { sector_t next_io_sector; unsigned int entry_space; bool use_multippl; + bool wb_cache_on; + unsigned long disk_flush_bitmap; }; #define PPL_IO_INLINE_BVECS 32 @@ -145,6 +151,7 @@ struct ppl_io_unit { struct list_head stripe_list; /* stripes added to the io_unit */ atomic_t pending_stripes; /* how many stripes not written to raid */ + atomic_t pending_flushes; /* how many disk flushes are in progress */ bool submitted; /* true if write to log started */ @@ -249,6 +256,7 @@ static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log, INIT_LIST_HEAD(&io->log_sibling); INIT_LIST_HEAD(&io->stripe_list); atomic_set(&io->pending_stripes, 0); + atomic_set(&io->pending_flushes, 0); bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS); pplhdr = page_address(io->header_page); @@ -475,7 +483,18 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) if (log->use_multippl) log->next_io_sector += (PPL_HEADER_SIZE + io->pp_size) >> 9; + WARN_ON(log->disk_flush_bitmap != 0); + list_for_each_entry(sh, &io->stripe_list, log_list) { + for (i = 0; i < sh->disks; i++) { + struct r5dev *dev = &sh->dev[i]; + + if ((ppl_conf->child_logs[i].wb_cache_on) && + (test_bit(R5_Wantwrite, &dev->flags))) { + set_bit(i, &log->disk_flush_bitmap); + } + } + /* entries for full stripe writes have no partial parity */ if (test_bit(STRIPE_FULL_WRITE, &sh->state)) continue; @@ -540,6 +559,7 @@ static void ppl_io_unit_finished(struct ppl_io_unit *io) { struct ppl_log *log = io->log; struct ppl_conf *ppl_conf = log->ppl_conf; + struct r5conf *conf = ppl_conf->mddev->private; unsigned long flags; pr_debug("%s: seq: %llu\n", __func__, io->seq); @@ -565,6 +585,112 @@ static void ppl_io_unit_finished(struct ppl_io_unit *io) spin_unlock(&ppl_conf->no_mem_stripes_lock); local_irq_restore(flags); + + wake_up(&conf->wait_for_quiescent); +} + +static void ppl_flush_endio(struct bio *bio) +{ + struct ppl_io_unit *io = bio->bi_private; + struct ppl_log *log = io->log; + struct ppl_conf *ppl_conf = log->ppl_conf; + struct r5conf *conf = ppl_conf->mddev->private; + char b[BDEVNAME_SIZE]; + + pr_debug("%s: dev: %s\n", __func__, bio_devname(bio, b)); + + if (bio->bi_status) { + struct md_rdev *rdev; + + rcu_read_lock(); + rdev = md_find_rdev_rcu(conf->mddev, bio_dev(bio)); + if (rdev) + md_error(rdev->mddev, rdev); + rcu_read_unlock(); + } + + bio_put(bio); + + if (atomic_dec_and_test(&io->pending_flushes)) { + ppl_io_unit_finished(io); + md_wakeup_thread(conf->mddev->thread); + } +} + +static void ppl_do_flush(struct ppl_io_unit *io) +{ + struct ppl_log *log = io->log; + struct ppl_conf *ppl_conf = log->ppl_conf; + struct r5conf *conf = ppl_conf->mddev->private; + int raid_disks = conf->raid_disks; + int flushed_disks = 0; + int i; + + atomic_set(&io->pending_flushes, raid_disks); + + for_each_set_bit(i, &log->disk_flush_bitmap, raid_disks) { + struct md_rdev *rdev; + struct block_device *bdev = NULL; + + rcu_read_lock(); + rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) + bdev = rdev->bdev; + rcu_read_unlock(); + + if (bdev) { + struct bio *bio; + char b[BDEVNAME_SIZE]; + + bio = bio_alloc_bioset(GFP_NOIO, 0, ppl_conf->flush_bs); + bio_set_dev(bio, bdev); + bio->bi_private = io; + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + bio->bi_end_io = 
ppl_flush_endio; + + pr_debug("%s: dev: %s\n", __func__, + bio_devname(bio, b)); + + submit_bio(bio); + flushed_disks++; + } + } + + log->disk_flush_bitmap = 0; + + for (i = flushed_disks ; i < raid_disks; i++) { + if (atomic_dec_and_test(&io->pending_flushes)) + ppl_io_unit_finished(io); + } +} + +static inline bool ppl_no_io_unit_submitted(struct r5conf *conf, + struct ppl_log *log) +{ + struct ppl_io_unit *io; + + io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit, + log_sibling); + + return !io || !io->submitted; +} + +void ppl_quiesce(struct r5conf *conf, int quiesce) +{ + struct ppl_conf *ppl_conf = conf->log_private; + int i; + + if (quiesce) { + for (i = 0; i < ppl_conf->count; i++) { + struct ppl_log *log = &ppl_conf->child_logs[i]; + + spin_lock_irq(&log->io_list_lock); + wait_event_lock_irq(conf->wait_for_quiescent, + ppl_no_io_unit_submitted(conf, log), + log->io_list_lock); + spin_unlock_irq(&log->io_list_lock); + } + } } void ppl_stripe_write_finished(struct stripe_head *sh) @@ -574,8 +700,12 @@ void ppl_stripe_write_finished(struct stripe_head *sh) io = sh->ppl_io; sh->ppl_io = NULL; - if (io && atomic_dec_and_test(&io->pending_stripes)) - ppl_io_unit_finished(io); + if (io && atomic_dec_and_test(&io->pending_stripes)) { + if (io->log->disk_flush_bitmap) + ppl_do_flush(io); + else + ppl_io_unit_finished(io); + } } static void ppl_xor(int size, struct page *page1, struct page *page2) @@ -1108,6 +1238,8 @@ static void __ppl_exit_log(struct ppl_conf *ppl_conf) if (ppl_conf->bs) bioset_free(ppl_conf->bs); + if (ppl_conf->flush_bs) + bioset_free(ppl_conf->flush_bs); mempool_destroy(ppl_conf->io_pool); kmem_cache_destroy(ppl_conf->io_kc); @@ -1173,6 +1305,8 @@ static int ppl_validate_rdev(struct md_rdev *rdev) static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev) { + struct request_queue *q; + if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE + PPL_HEADER_SIZE) * 2) { log->use_multippl = true; @@ -1185,6 +1319,10 @@ static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev) PPL_HEADER_SIZE; } log->next_io_sector = rdev->ppl.sector; + + q = bdev_get_queue(rdev->bdev); + if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + log->wb_cache_on = true; } int ppl_init_log(struct r5conf *conf) @@ -1192,8 +1330,8 @@ int ppl_init_log(struct r5conf *conf) struct ppl_conf *ppl_conf; struct mddev *mddev = conf->mddev; int ret = 0; + int max_disks; int i; - bool need_cache_flush = false; pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n", mdname(conf->mddev)); @@ -1219,6 +1357,14 @@ int ppl_init_log(struct r5conf *conf) return -EINVAL; } + max_disks = FIELD_SIZEOF(struct ppl_log, disk_flush_bitmap) * + BITS_PER_BYTE; + if (conf->raid_disks > max_disks) { + pr_warn("md/raid:%s PPL doesn't support over %d disks in the array\n", + mdname(mddev), max_disks); + return -EINVAL; + } + ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL); if (!ppl_conf) return -ENOMEM; @@ -1244,6 +1390,12 @@ int ppl_init_log(struct r5conf *conf) goto err; } + ppl_conf->flush_bs = bioset_create(conf->raid_disks, 0, 0); + if (!ppl_conf->flush_bs) { + ret = -ENOMEM; + goto err; + } + ppl_conf->count = conf->raid_disks; ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log), GFP_KERNEL); @@ -1275,23 +1427,14 @@ int ppl_init_log(struct r5conf *conf) log->rdev = rdev; if (rdev) { - struct request_queue *q; - ret = ppl_validate_rdev(rdev); if (ret) goto err; - q = bdev_get_queue(rdev->bdev); - if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) - need_cache_flush 
= true; ppl_init_child_log(log, rdev); } } - if (need_cache_flush) - pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n", - mdname(mddev)); - /* load and possibly recover the logs from the member disks */ ret = ppl_load(ppl_conf); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 98ce4272ace9..50d01144b805 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5563,7 +5563,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) bool do_flush = false; if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { - int ret = r5l_handle_flush_request(conf->log, bi); + int ret = log_handle_flush_request(conf, bi); if (ret == 0) return true; @@ -6168,7 +6168,7 @@ static int handle_active_stripes(struct r5conf *conf, int group, break; if (i == NR_STRIPE_HASH_LOCKS) { spin_unlock_irq(&conf->device_lock); - r5l_flush_stripe_to_raid(conf->log); + log_flush_stripe_to_raid(conf); spin_lock_irq(&conf->device_lock); return batch_size; } @@ -8060,7 +8060,7 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce) wake_up(&conf->wait_for_overlap); unlock_all_device_hash_locks_irq(conf); } - r5l_quiesce(conf->log, quiesce); + log_quiesce(conf, quiesce); } static void *raid45_takeover_raid0(struct mddev *mddev, int level) @@ -8364,6 +8364,13 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) return err; } +static int raid5_start(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + + return r5l_start(conf->log); +} + static struct md_personality raid6_personality = { .name = "raid6", @@ -8371,6 +8378,7 @@ static struct md_personality raid6_personality = .owner = THIS_MODULE, .make_request = raid5_make_request, .run = raid5_run, + .start = raid5_start, .free = raid5_free, .status = raid5_status, .error_handler = raid5_error, @@ -8395,6 +8403,7 @@ static struct md_personality raid5_personality = .owner = THIS_MODULE, .make_request = raid5_make_request, .run = raid5_run, + .start = raid5_start, .free = raid5_free, .status = raid5_status, .error_handler = raid5_error, @@ -8420,6 +8429,7 @@ static struct md_personality raid4_personality = .owner = THIS_MODULE, .make_request = raid5_make_request, .run = raid5_run, + .start = raid5_start, .free = raid5_free, .status = raid5_status, .error_handler = raid5_error, |
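
The flush_pending_writes() hunks in raid1.c and raid10.c above both add __set_current_state(TASK_RUNNING) before starting the blk plug. What follows is a minimal userspace model of the reasoning in the added comment, not kernel code: the two-state enum and might_sleep_check() are stand-ins I introduced for the kernel's task-state machinery. It shows why resetting the state inside the wait loop's helper is safe; the next loop iteration simply sets the sleeping state again.

#include <assert.h>
#include <stdio.h>

enum task_state { TASK_RUNNING, TASK_UNINTERRUPTIBLE };
static enum task_state current_state = TASK_RUNNING;

/* Stand-in for the kernel's "do not block while not TASK_RUNNING" debug check. */
static void might_sleep_check(void)
{
        assert(current_state == TASK_RUNNING);
}

static int pending = 3;                 /* models conf->pending_bio_list */

/* Models flush_pending_writes(): it may issue blocking I/O. */
static void flush_pending_writes(void)
{
        if (!pending)
                return;
        /*
         * Called from inside the wait loop below, after the loop has already
         * set the task state to TASK_UNINTERRUPTIBLE.  Reset it before doing
         * anything that can block, as the patch does with
         * __set_current_state(TASK_RUNNING); the loop re-arms the state on
         * its next iteration, so at worst we go around once more.
         */
        current_state = TASK_RUNNING;
        might_sleep_check();            /* would trip without the reset */
        printf("flushed %d pending writes\n", pending);
        pending = 0;
}

int main(void)
{
        /* Skeleton of a wait_event()-style loop (e.g. freeze_array()). */
        for (int i = 0; i < 2; i++) {
                current_state = TASK_UNINTERRUPTIBLE;   /* prepare to wait */
                flush_pending_writes();                 /* loop side effect */
                current_state = TASK_RUNNING;           /* finish wait */
        }
        return 0;
}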
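
The raid5-log.h hunk above introduces log_flush_stripe_to_raid(), log_handle_flush_request() and log_quiesce() so raid5.c no longer calls r5l_*() directly, which is also why the !log early returns could be dropped from raid5-cache.c. Below is a simplified userspace sketch of that dispatch pattern under assumed, trimmed-down types (struct r5conf here carries only the two fields the sketch needs, and the flush bio argument is omitted); it is not the kernel API itself.

#include <errno.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct r5l_log { int unused; };                 /* journal backend */

struct r5conf {                                 /* trimmed-down stand-in */
        struct r5l_log *log;                    /* non-NULL: journal in use */
        bool has_ppl;                           /* models raid5_has_ppl() */
};

/* Journal path; the real function also takes the flush bio. */
static int r5l_handle_flush_request(struct r5l_log *log)
{
        (void)log;
        return 0;
}

static inline int log_handle_flush_request(struct r5conf *conf)
{
        int ret = -ENODEV;                      /* neither journal nor PPL */

        if (conf->log)
                ret = r5l_handle_flush_request(conf->log);
        else if (conf->has_ppl)
                ret = 0;                        /* PPL: nothing extra to do */

        return ret;
}

int main(void)
{
        struct r5conf plain = { .log = NULL, .has_ppl = false };
        struct r5conf ppl   = { .log = NULL, .has_ppl = true  };

        printf("plain array: %d (expect -ENODEV)\n", log_handle_flush_request(&plain));
        printf("ppl array:   %d (expect 0)\n", log_handle_flush_request(&ppl));
        return 0;
}

Keeping the backend choice in one inline wrapper per operation means a plain array with neither journal nor PPL still gets the old -ENODEV behaviour for REQ_PREFLUSH, while a PPL array reports that it has nothing extra to do.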
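
ppl_do_flush() and ppl_flush_endio() in the raid5-ppl.c hunks above share one counter, pending_flushes, which is primed with the full raid_disks count and then drained from two sides: synchronously for disks that needed no flush, and from the bio completion for disks that did. The program below is a userspace model of just that accounting (no real bios, and the completions are simulated inline); the point it illustrates is that whoever drops the counter to zero finishes the io_unit.

#include <stdatomic.h>
#include <stdio.h>

#define RAID_DISKS 4

struct io_unit {
        atomic_int pending_flushes;
        unsigned long disk_flush_bitmap;        /* bit i set: disk i needs a flush */
};

static void io_unit_finished(struct io_unit *io)
{
        printf("io_unit finished, flush bitmap was %#lx\n", io->disk_flush_bitmap);
}

/* Models ppl_flush_endio(): runs once per completed flush bio. */
static void flush_endio(struct io_unit *io)
{
        if (atomic_fetch_sub(&io->pending_flushes, 1) == 1)
                io_unit_finished(io);
}

/* Models ppl_do_flush(): no real bios, completions are simulated below. */
static void do_flush(struct io_unit *io)
{
        int flushed_disks = 0;

        atomic_store(&io->pending_flushes, RAID_DISKS);

        for (int i = 0; i < RAID_DISKS; i++) {
                if (!(io->disk_flush_bitmap & (1UL << i)))
                        continue;
                /* a real flush bio would be submitted here */
                flushed_disks++;
        }

        /*
         * Disks that needed no flush hand their reference back right away;
         * like the patch, this drops a count, not specific disk indexes.
         */
        for (int i = flushed_disks; i < RAID_DISKS; i++)
                if (atomic_fetch_sub(&io->pending_flushes, 1) == 1)
                        io_unit_finished(io);

        /* Simulated completions for the disks that were flushed. */
        for (int i = 0; i < flushed_disks; i++)
                flush_endio(io);
}

int main(void)
{
        struct io_unit io = { .disk_flush_bitmap = 0x5 };       /* disks 0 and 2 */

        do_flush(&io);
        return 0;
}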
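
ppl_init_log() above now rejects arrays with more members than disk_flush_bitmap can track. A small userspace illustration of that arithmetic follows; struct ppl_log_layout is a stand-in holding only the bitmap field, FIELD_SIZEOF is redefined locally with the kernel's sizeof-a-field definition, and CHAR_BIT stands in for BITS_PER_BYTE.

#include <limits.h>
#include <stdio.h>

/* Local copy of the kernel macro: sizeof of a struct field. */
#define FIELD_SIZEOF(t, f) (sizeof(((t *)0)->f))

struct ppl_log_layout {                         /* stand-in for struct ppl_log */
        unsigned long disk_flush_bitmap;        /* one bit per member disk */
};

int main(void)
{
        int max_disks = FIELD_SIZEOF(struct ppl_log_layout, disk_flush_bitmap) *
                        CHAR_BIT;               /* CHAR_BIT ~ BITS_PER_BYTE */
        int raid_disks = 72;                    /* hypothetical array size */

        if (raid_disks > max_disks)
                printf("PPL doesn't support over %d disks (asked for %d)\n",
                       max_disks, raid_disks);
        return 0;
}

Because the bitmap is a single unsigned long, the limit works out to 64 member disks on 64-bit builds and 32 on 32-bit builds.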
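
Finally, the raid5.c hunks wire the new raid5_start() into the raid4/5/6 personalities, and r5l_init_log() no longer loads the journal itself; r5l_start() does that once the array is started. The sketch below models that two-phase bring-up; md_start() here is a stand-in caller I added for illustration, since the md core side of the change is not part of this diff.

#include <stdio.h>

struct mddev;

struct md_personality {
        const char *name;
        int (*run)(struct mddev *mddev);
        int (*start)(struct mddev *mddev);      /* optional, may be NULL */
};

struct mddev {
        const struct md_personality *pers;
        int journal_loaded;
};

static int raid5_run(struct mddev *mddev)
{
        (void)mddev;                            /* build conf, start threads */
        return 0;
}

static int raid5_start(struct mddev *mddev)
{
        mddev->journal_loaded = 1;              /* models r5l_start() loading the log */
        return 0;
}

static const struct md_personality raid5_personality = {
        .name  = "raid5",
        .run   = raid5_run,
        .start = raid5_start,
};

/* Stand-in for the md core: run the personality, then its optional start hook. */
static int md_start(struct mddev *mddev)
{
        int err = mddev->pers->run(mddev);

        if (!err && mddev->pers->start)
                err = mddev->pers->start(mddev);
        return err;
}

int main(void)
{
        struct mddev mddev = { .pers = &raid5_personality };
        int err = md_start(&mddev);

        printf("start: %d, journal loaded: %d\n", err, mddev.journal_loaded);
        return 0;
}

Deferring the journal load to the start hook means a failure to read the log no longer has to unwind inside r5l_init_log(); per the r5l_start() hunk above, the error path tears the log down with r5l_exit_log() instead.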