diff options
Diffstat (limited to 'drivers/md')
57 files changed, 2021 insertions, 744 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 2557f198e175..45254b3ef715 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Block device driver configuration # @@ -436,6 +437,15 @@ config DM_DELAY If unsure, say N. +config DM_DUST + tristate "Bad sector simulation target" + depends on BLK_DEV_DM + ---help--- + A target that simulates bad sector behavior. + Useful for testing. + + If unsure, say N. + config DM_INIT bool "DM \"dm-mod.create=\" parameter support" depends on BLK_DEV_DM=y diff --git a/drivers/md/Makefile b/drivers/md/Makefile index a52b703e588e..be7a6eb92abc 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_DM_BUFIO) += dm-bufio.o obj-$(CONFIG_DM_BIO_PRISON) += dm-bio-prison.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o obj-$(CONFIG_DM_DELAY) += dm-delay.o +obj-$(CONFIG_DM_DUST) += dm-dust.o obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index f6e0a8b3a61e..6dfa653d30db 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only config BCACHE tristate "Block device as cache" diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 5002838ea476..f8986effcb50 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -327,10 +327,11 @@ static int bch_allocator_thread(void *arg) * possibly issue discards to them, then we add the bucket to * the free list: */ - while (!fifo_empty(&ca->free_inc)) { + while (1) { long bucket; - fifo_pop(&ca->free_inc, bucket); + if (!fifo_pop(&ca->free_inc, bucket)) + break; if (ca->discard) { mutex_unlock(&ca->set->bucket_lock); diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 8f07fa6e1739..268f1b685084 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -887,12 +887,22 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, struct bset *i = bset_tree_last(b)->data; struct bkey *m, *prev = NULL; struct btree_iter iter; + struct bkey preceding_key_on_stack = ZERO_KEY; + struct bkey *preceding_key_p = &preceding_key_on_stack; BUG_ON(b->ops->is_extents && !KEY_SIZE(k)); - m = bch_btree_iter_init(b, &iter, b->ops->is_extents - ? PRECEDING_KEY(&START_KEY(k)) - : PRECEDING_KEY(k)); + /* + * If k has preceding key, preceding_key_p will be set to address + * of k's preceding key; otherwise preceding_key_p will be set + * to NULL inside preceding_key(). + */ + if (b->ops->is_extents) + preceding_key(&START_KEY(k), &preceding_key_p); + else + preceding_key(k, &preceding_key_p); + + m = bch_btree_iter_init(b, &iter, preceding_key_p); if (b->ops->insert_fixup(b, k, &iter, replace_key)) return status; diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index bac76aabca6d..c71365e7c1fa 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -434,20 +434,26 @@ static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) return __bch_cut_back(where, k); } -#define PRECEDING_KEY(_k) \ -({ \ - struct bkey *_ret = NULL; \ - \ - if (KEY_INODE(_k) || KEY_OFFSET(_k)) { \ - _ret = &KEY(KEY_INODE(_k), KEY_OFFSET(_k), 0); \ - \ - if (!_ret->low) \ - _ret->high--; \ - _ret->low--; \ - } \ - \ - _ret; \ -}) +/* + * Pointer '*preceding_key_p' points to a memory object to store preceding + * key of k. If the preceding key does not exist, set '*preceding_key_p' to + * NULL. So the caller of preceding_key() needs to take care of memory + * which '*preceding_key_p' pointed to before calling preceding_key(). + * Currently the only caller of preceding_key() is bch_btree_insert_key(), + * and it points to an on-stack variable, so the memory release is handled + * by stackframe itself. + */ +static inline void preceding_key(struct bkey *k, struct bkey **preceding_key_p) +{ + if (KEY_INODE(k) || KEY_OFFSET(k)) { + (**preceding_key_p) = KEY(KEY_INODE(k), KEY_OFFSET(k), 0); + if (!(*preceding_key_p)->low) + (*preceding_key_p)->high--; + (*preceding_key_p)->low--; + } else { + (*preceding_key_p) = NULL; + } +} static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k) { diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 64def336f053..773f5fdad25f 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -429,14 +429,14 @@ static void do_btree_node_write(struct btree *b) bset_sector_offset(&b->keys, i)); if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { - int j; struct bio_vec *bv; - void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); + void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); struct bvec_iter_all iter_all; - bio_for_each_segment_all(bv, b->bio, j, iter_all) - memcpy(page_address(bv->bv_page), - base + j * PAGE_SIZE, PAGE_SIZE); + bio_for_each_segment_all(bv, b->bio, iter_all) { + memcpy(page_address(bv->bv_page), addr, PAGE_SIZE); + addr += PAGE_SIZE; + } bch_submit_bbio(b->bio, b->c, &k.key, 0); @@ -1476,11 +1476,11 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, out_nocoalesce: closure_sync(&cl); - bch_keylist_free(&keylist); while ((k = bch_keylist_pop(&keylist))) if (!bkey_cmp(k, &ZERO_KEY)) atomic_dec(&b->c->prio_blocked); + bch_keylist_free(&keylist); for (i = 0; i < nodes; i++) if (!IS_ERR_OR_NULL(new_nodes[i])) { diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index b2fd412715b1..12dae9348147 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -147,7 +147,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) { #define read_bucket(b) \ ({ \ - int ret = journal_read_bucket(ca, list, b); \ + ret = journal_read_bucket(ca, list, b); \ __set_bit(b, bitmap); \ if (ret < 0) \ return ret; \ @@ -156,6 +156,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) struct cache *ca; unsigned int iter; + int ret = 0; for_each_cache(ca, c, iter) { struct journal_device *ja = &ca->journal; @@ -267,7 +268,7 @@ bsearch: struct journal_replay, list)->j.seq; - return 0; + return ret; #undef read_bucket } @@ -317,6 +318,18 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) } } +static bool is_discard_enabled(struct cache_set *s) +{ + struct cache *ca; + unsigned int i; + + for_each_cache(ca, s, i) + if (ca->discard) + return true; + + return false; +} + int bch_journal_replay(struct cache_set *s, struct list_head *list) { int ret = 0, keys = 0, entries = 0; @@ -330,9 +343,17 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) list_for_each_entry(i, list, list) { BUG_ON(i->pin && atomic_read(i->pin) != 1); - cache_set_err_on(n != i->j.seq, s, -"bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)", - n, i->j.seq - 1, start, end); + if (n != i->j.seq) { + if (n == start && is_discard_enabled(s)) + pr_info("bcache: journal entries %llu-%llu may be discarded! (replaying %llu-%llu)", + n, i->j.seq - 1, start, end); + else { + pr_err("bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)", + n, i->j.seq - 1, start, end); + ret = -EIO; + goto err; + } + } for (k = i->j.start; k < bset_bkey_last(&i->j); @@ -540,11 +561,11 @@ static void journal_reclaim(struct cache_set *c) ca->sb.nr_this_dev); } - bkey_init(k); - SET_KEY_PTRS(k, n); - - if (n) + if (n) { + bkey_init(k); + SET_KEY_PTRS(k, n); c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; + } out: if (!journal_full(&c->journal)) __closure_wake_up(&c->journal.wait); @@ -671,6 +692,9 @@ static void journal_write_unlocked(struct closure *cl) ca->journal.seq[ca->journal.cur_idx] = w->data->seq; } + /* If KEY_PTRS(k) == 0, this jset gets lost in air */ + BUG_ON(i == 0); + atomic_dec_bug(&fifo_back(&c->journal.pin)); bch_journal_next(&c->journal); journal_reclaim(c); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index f101bfe8657a..41adcd1546f1 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -329,12 +329,13 @@ void bch_data_insert(struct closure *cl) bch_data_insert_start(cl); } -/* Congested? */ - -unsigned int bch_get_congested(struct cache_set *c) +/* + * Congested? Return 0 (not congested) or the limit (in sectors) + * beyond which we should bypass the cache due to congestion. + */ +unsigned int bch_get_congested(const struct cache_set *c) { int i; - long rand; if (!c->congested_read_threshold_us && !c->congested_write_threshold_us) @@ -353,8 +354,7 @@ unsigned int bch_get_congested(struct cache_set *c) if (i > 0) i = fract_exp_two(i, 6); - rand = get_random_int(); - i -= bitmap_weight(&rand, BITS_PER_LONG); + i -= hweight32(get_random_u32()); return i > 0 ? i : 1; } @@ -376,7 +376,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) { struct cache_set *c = dc->disk.c; unsigned int mode = cache_mode(dc); - unsigned int sectors, congested = bch_get_congested(c); + unsigned int sectors, congested; struct task_struct *task = current; struct io *i; @@ -412,6 +412,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) goto rescale; } + congested = bch_get_congested(c); if (!congested && !dc->sequential_cutoff) goto rescale; @@ -706,14 +707,14 @@ static void search_free(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); - atomic_dec(&s->d->c->search_inflight); + atomic_dec(&s->iop.c->search_inflight); if (s->iop.bio) bio_put(s->iop.bio); bio_complete(s); closure_debug_destroy(cl); - mempool_free(s, &s->d->c->search); + mempool_free(s, &s->iop.c->search); } static inline struct search *search_alloc(struct bio *bio, @@ -756,13 +757,13 @@ static void cached_dev_bio_complete(struct closure *cl) struct search *s = container_of(cl, struct search, cl); struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - search_free(cl); cached_dev_put(dc); + search_free(cl); } /* Process reads */ -static void cached_dev_cache_miss_done(struct closure *cl) +static void cached_dev_read_error_done(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); @@ -800,7 +801,22 @@ static void cached_dev_read_error(struct closure *cl) closure_bio_submit(s->iop.c, bio, cl); } - continue_at(cl, cached_dev_cache_miss_done, NULL); + continue_at(cl, cached_dev_read_error_done, NULL); +} + +static void cached_dev_cache_miss_done(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct bcache_device *d = s->d; + + if (s->iop.replace_collision) + bch_mark_cache_miss_collision(s->iop.c, s->d); + + if (s->iop.bio) + bio_free_pages(s->iop.bio); + + cached_dev_bio_complete(cl); + closure_put(&d->cl); } static void cached_dev_read_done(struct closure *cl) @@ -833,6 +849,7 @@ static void cached_dev_read_done(struct closure *cl) if (verify(dc) && s->recoverable && !s->read_dirty_data) bch_data_verify(dc, s->orig_bio); + closure_get(&dc->disk.cl); bio_complete(s); if (s->iop.bio && diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 721bf336ed1a..c64dbd7a91aa 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -33,7 +33,7 @@ struct data_insert_op { BKEY_PADDED(replace_key); }; -unsigned int bch_get_congested(struct cache_set *c); +unsigned int bch_get_congested(const struct cache_set *c); void bch_data_insert(struct closure *cl); void bch_cached_dev_request_init(struct cached_dev *dc); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a697a3a923cd..1b63ac876169 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -662,6 +662,11 @@ static const struct block_device_operations bcache_ops = { void bcache_device_stop(struct bcache_device *d) { if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags)) + /* + * closure_fn set to + * - cached device: cached_dev_flush() + * - flash dev: flash_dev_flush() + */ closure_queue(&d->cl); } @@ -906,21 +911,18 @@ static int cached_dev_status_update(void *arg) void bch_cached_dev_run(struct cached_dev *dc) { struct bcache_device *d = &dc->disk; - char buf[SB_LABEL_SIZE + 1]; + char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL); char *env[] = { "DRIVER=bcache", kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid), - NULL, + kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf ? : ""), NULL, }; - memcpy(buf, dc->sb.label, SB_LABEL_SIZE); - buf[SB_LABEL_SIZE] = '\0'; - env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf); - if (atomic_xchg(&dc->running, 1)) { kfree(env[1]); kfree(env[2]); + kfree(buf); return; } @@ -944,6 +946,7 @@ void bch_cached_dev_run(struct cached_dev *dc) kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); kfree(env[1]); kfree(env[2]); + kfree(buf); if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) @@ -1174,6 +1177,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, return 0; } +/* when dc->disk.kobj released */ void bch_cached_dev_release(struct kobject *kobj) { struct cached_dev *dc = container_of(kobj, struct cached_dev, @@ -1280,7 +1284,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) /* Cached device - bcache superblock */ -static void register_bdev(struct cache_sb *sb, struct page *sb_page, +static int register_bdev(struct cache_sb *sb, struct page *sb_page, struct block_device *bdev, struct cached_dev *dc) { @@ -1318,14 +1322,16 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) bch_cached_dev_run(dc); - return; + return 0; err: pr_notice("error %s: %s", dc->backing_dev_name, err); bcache_device_stop(&dc->disk); + return -EIO; } /* Flash only volumes */ +/* When d->kobj released */ void bch_flash_dev_release(struct kobject *kobj) { struct bcache_device *d = container_of(kobj, struct bcache_device, @@ -1496,6 +1502,7 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) return true; } +/* When c->kobj released */ void bch_cache_set_release(struct kobject *kobj) { struct cache_set *c = container_of(kobj, struct cache_set, kobj); @@ -1516,6 +1523,7 @@ static void cache_set_free(struct closure *cl) bch_btree_cache_free(c); bch_journal_free(c); + mutex_lock(&bch_register_lock); for_each_cache(ca, c, i) if (ca) { ca->set = NULL; @@ -1534,7 +1542,6 @@ static void cache_set_free(struct closure *cl) mempool_exit(&c->search); kfree(c->devices); - mutex_lock(&bch_register_lock); list_del(&c->list); mutex_unlock(&bch_register_lock); @@ -1673,6 +1680,7 @@ static void __cache_set_unregister(struct closure *cl) void bch_cache_set_stop(struct cache_set *c) { if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags)) + /* closure_fn set to __cache_set_unregister() */ closure_queue(&c->caching); } @@ -1775,13 +1783,15 @@ err: return NULL; } -static void run_cache_set(struct cache_set *c) +static int run_cache_set(struct cache_set *c) { const char *err = "cannot allocate memory"; struct cached_dev *dc, *t; struct cache *ca; struct closure cl; unsigned int i; + LIST_HEAD(journal); + struct journal_replay *l; closure_init_stack(&cl); @@ -1790,7 +1800,6 @@ static void run_cache_set(struct cache_set *c) set_gc_sectors(c); if (CACHE_SYNC(&c->sb)) { - LIST_HEAD(journal); struct bkey *k; struct jset *j; @@ -1869,7 +1878,9 @@ static void run_cache_set(struct cache_set *c) if (j->version < BCACHE_JSET_VERSION_UUID) __uuid_write(c); - bch_journal_replay(c, &journal); + err = "bcache: replay journal failed"; + if (bch_journal_replay(c, &journal)) + goto err; } else { pr_notice("invalidating existing data"); @@ -1937,11 +1948,19 @@ static void run_cache_set(struct cache_set *c) flash_devs_run(c); set_bit(CACHE_SET_RUNNING, &c->flags); - return; + return 0; err: + while (!list_empty(&journal)) { + l = list_first_entry(&journal, struct journal_replay, list); + list_del(&l->list); + kfree(l); + } + closure_sync(&cl); /* XXX: test this, it's broken */ bch_cache_set_error(c, "%s", err); + + return -EIO; } static bool can_attach_cache(struct cache *ca, struct cache_set *c) @@ -2005,8 +2024,11 @@ found: ca->set->cache[ca->sb.nr_this_dev] = ca; c->cache_by_alloc[c->caches_loaded++] = ca; - if (c->caches_loaded == c->sb.nr_in_set) - run_cache_set(c); + if (c->caches_loaded == c->sb.nr_in_set) { + err = "failed to run cache set"; + if (run_cache_set(c) < 0) + goto err; + } return NULL; err: @@ -2016,6 +2038,7 @@ err: /* Cache device */ +/* When ca->kobj released */ void bch_cache_release(struct kobject *kobj) { struct cache *ca = container_of(kobj, struct cache, kobj); @@ -2179,6 +2202,12 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, ret = cache_alloc(ca); if (ret != 0) { + /* + * If we failed here, it means ca->kobj is not initialized yet, + * kobject_put() won't be called and there is no chance to + * call blkdev_put() to bdev in bch_cache_release(). So we + * explicitly call blkdev_put() here. + */ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); if (ret == -ENOMEM) err = "cache_alloc(): -ENOMEM"; @@ -2262,7 +2291,7 @@ static bool bch_is_open(struct block_device *bdev) static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *buffer, size_t size) { - ssize_t ret = size; + ssize_t ret = -EINVAL; const char *err = "cannot allocate memory"; char *path = NULL; struct cache_sb *sb = NULL; @@ -2296,7 +2325,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!IS_ERR(bdev)) bdput(bdev); if (attr == &ksysfs_register_quiet) - goto out; + goto quiet_out; } goto err; } @@ -2317,17 +2346,23 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, goto err_close; mutex_lock(&bch_register_lock); - register_bdev(sb, sb_page, bdev, dc); + ret = register_bdev(sb, sb_page, bdev, dc); mutex_unlock(&bch_register_lock); + /* blkdev_put() will be called in cached_dev_free() */ + if (ret < 0) + goto err; } else { struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); if (!ca) goto err_close; + /* blkdev_put() will be called in bch_cache_release() */ if (register_cache(sb, sb_page, bdev, ca) != 0) goto err; } +quiet_out: + ret = size; out: if (sb_page) put_page(sb_page); @@ -2340,7 +2375,6 @@ err_close: blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); err: pr_info("error %s: %s", path, err); - ret = -EINVAL; goto out; } @@ -2370,10 +2404,19 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) list_for_each_entry_safe(dc, tdc, &uncached_devices, list) bcache_device_stop(&dc->disk); + mutex_unlock(&bch_register_lock); + + /* + * Give an early chance for other kthreads and + * kworkers to stop themselves + */ + schedule(); + /* What's a condition variable? */ while (1) { - long timeout = start + 2 * HZ - jiffies; + long timeout = start + 10 * HZ - jiffies; + mutex_lock(&bch_register_lock); stopped = list_empty(&bch_cache_sets) && list_empty(&uncached_devices); @@ -2385,7 +2428,6 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) mutex_unlock(&bch_register_lock); schedule_timeout(timeout); - mutex_lock(&bch_register_lock); } finish_wait(&unregister_wait, &wait); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 17bae9c14ca0..bfb437ffb13c 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -431,8 +431,13 @@ STORE(bch_cached_dev) bch_writeback_queue(dc); } + /* + * Only set BCACHE_DEV_WB_RUNNING when cached device attached to + * a cache set, otherwise it doesn't make sense. + */ if (attr == &sysfs_writeback_percent) - if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) + if ((dc->disk.c != NULL) && + (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))) schedule_delayed_work(&dc->writeback_rate_update, dc->writeback_rate_update_seconds * HZ); @@ -996,8 +1001,6 @@ SHOW(__bch_cache) !cached[n - 1]) --n; - unused = ca->sb.nbuckets - n; - while (cached < p + n && *cached == BTREE_PRIO) cached++, n--; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 00aab6abcfe4..1fbced94e4cc 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -560,17 +560,29 @@ static inline uint64_t bch_crc64_update(uint64_t crc, return crc; } -/* Does linear interpolation between powers of two */ +/* + * A stepwise-linear pseudo-exponential. This returns 1 << (x >> + * frac_bits), with the less-significant bits filled in by linear + * interpolation. + * + * This can also be interpreted as a floating-point number format, + * where the low frac_bits are the mantissa (with implicit leading + * 1 bit), and the more significant bits are the exponent. + * The return value is 1.mantissa * 2^exponent. + * + * The way this is used, fract_bits is 6 and the largest possible + * input is CONGESTED_MAX-1 = 1023 (exponent 16, mantissa 0x1.fc), + * so the maximum output is 0x1fc00. + */ static inline unsigned int fract_exp_two(unsigned int x, unsigned int fract_bits) { - unsigned int fract = x & ~(~0 << fract_bits); - - x >>= fract_bits; - x = 1 << x; - x += (x * fract) >> fract_bits; + unsigned int mantissa = 1 << fract_bits; /* Implicit bit */ - return x; + mantissa += x & (mantissa - 1); + x >>= fract_bits; /* The exponent */ + /* Largest intermediate value 0x7f0000 */ + return mantissa << x >> fract_bits; } void bch_bio_map(struct bio *bio, void *base); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 1ecef76225a1..2a48ea3f1b30 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -150,7 +150,7 @@ struct dm_buffer { void (*end_io)(struct dm_buffer *, blk_status_t); #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING #define MAX_STACK 10 - struct stack_trace stack_trace; + unsigned int stack_len; unsigned long stack_entries[MAX_STACK]; #endif }; @@ -232,11 +232,7 @@ static DEFINE_MUTEX(dm_bufio_clients_lock); #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING static void buffer_record_stack(struct dm_buffer *b) { - b->stack_trace.nr_entries = 0; - b->stack_trace.max_entries = MAX_STACK; - b->stack_trace.entries = b->stack_entries; - b->stack_trace.skip = 2; - save_stack_trace(&b->stack_trace); + b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2); } #endif @@ -438,7 +434,7 @@ static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask) adjust_total_allocated(b->data_mode, (long)c->block_size); #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING - memset(&b->stack_trace, 0, sizeof(b->stack_trace)); + b->stack_len = 0; #endif return b; } @@ -1520,8 +1516,9 @@ static void drop_buffers(struct dm_bufio_client *c) DMERR("leaked buffer %llx, hold count %u, list %d", (unsigned long long)b->block, b->hold_count, i); #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING - print_stack_trace(&b->stack_trace, 1); - b->hold_count = 0; /* mark unclaimed to avoid BUG_ON below */ + stack_trace_print(b->stack_entries, b->stack_len, 1); + /* mark unclaimed to avoid BUG_ON below */ + b->hold_count = 0; #endif } diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 6fc93834da44..151aa95775be 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1167,11 +1167,18 @@ static int __load_discards(struct dm_cache_metadata *cmd, if (r) return r; - for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) { + for (b = 0; ; b++) { r = fn(context, cmd->discard_block_size, to_dblock(b), dm_bitset_cursor_get_value(&c)); if (r) break; + + if (b >= (from_dblock(cmd->discard_nr_blocks) - 1)) + break; + + r = dm_bitset_cursor_next(&c); + if (r) + break; } dm_bitset_cursor_end(&c); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index dd6565798778..1b16d34bb785 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -332,7 +332,6 @@ static int crypt_iv_essiv_init(struct crypt_config *cc) int err; desc->tfm = essiv->hash_tfm; - desc->flags = 0; err = crypto_shash_digest(desc, cc->key, cc->key_size, essiv->salt); shash_desc_zero(desc); @@ -606,7 +605,6 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, int i, r; desc->tfm = lmk->hash_tfm; - desc->flags = 0; r = crypto_shash_init(desc); if (r) @@ -768,7 +766,6 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc, /* calculate crc32 for every 32bit part and xor it */ desc->tfm = tcw->crc32_tfm; - desc->flags = 0; for (i = 0; i < 4; i++) { r = crypto_shash_init(desc); if (r) @@ -949,6 +946,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) { #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity *bi = blk_get_integrity(cc->dev->bdev->bd_disk); + struct mapped_device *md = dm_table_get_md(ti->table); /* From now we require underlying device with our integrity profile */ if (!bi || strcasecmp(bi->profile->name, "DM-DIF-EXT-TAG")) { @@ -968,7 +966,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) if (crypt_integrity_aead(cc)) { cc->integrity_tag_size = cc->on_disk_tag_size - cc->integrity_iv_size; - DMINFO("Integrity AEAD, tag size %u, IV size %u.", + DMDEBUG("%s: Integrity AEAD, tag size %u, IV size %u.", dm_device_name(md), cc->integrity_tag_size, cc->integrity_iv_size); if (crypto_aead_setauthsize(any_tfm_aead(cc), cc->integrity_tag_size)) { @@ -976,7 +974,7 @@ static int crypt_integrity_ctr(struct crypt_config *cc, struct dm_target *ti) return -EINVAL; } } else if (cc->integrity_iv_size) - DMINFO("Additional per-sector space %u bytes for IV.", + DMDEBUG("%s: Additional per-sector space %u bytes for IV.", dm_device_name(md), cc->integrity_iv_size); if ((cc->integrity_tag_size + cc->integrity_iv_size) != bi->tag_size) { @@ -1034,11 +1032,11 @@ static u8 *org_iv_of_dmreq(struct crypt_config *cc, return iv_of_dmreq(cc, dmreq) + cc->iv_size; } -static uint64_t *org_sector_of_dmreq(struct crypt_config *cc, +static __le64 *org_sector_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq) { u8 *ptr = iv_of_dmreq(cc, dmreq) + cc->iv_size + cc->iv_size; - return (uint64_t*) ptr; + return (__le64 *) ptr; } static unsigned int *org_tag_of_dmreq(struct crypt_config *cc, @@ -1074,7 +1072,7 @@ static int crypt_convert_block_aead(struct crypt_config *cc, struct bio_vec bv_out = bio_iter_iovec(ctx->bio_out, ctx->iter_out); struct dm_crypt_request *dmreq; u8 *iv, *org_iv, *tag_iv, *tag; - uint64_t *sector; + __le64 *sector; int r = 0; BUG_ON(cc->integrity_iv_size && cc->integrity_iv_size != cc->iv_size); @@ -1146,9 +1144,11 @@ static int crypt_convert_block_aead(struct crypt_config *cc, r = crypto_aead_decrypt(req); } - if (r == -EBADMSG) - DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu", + if (r == -EBADMSG) { + char b[BDEVNAME_SIZE]; + DMERR_LIMIT("%s: INTEGRITY AEAD ERROR, sector %llu", bio_devname(ctx->bio_in, b), (unsigned long long)le64_to_cpu(*sector)); + } if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post) r = cc->iv_gen_ops->post(cc, org_iv, dmreq); @@ -1169,7 +1169,7 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc, struct scatterlist *sg_in, *sg_out; struct dm_crypt_request *dmreq; u8 *iv, *org_iv, *tag_iv; - uint64_t *sector; + __le64 *sector; int r = 0; /* Reject unexpected unaligned bio. */ @@ -1445,11 +1445,10 @@ out: static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) { - unsigned int i; struct bio_vec *bv; struct bvec_iter_all iter_all; - bio_for_each_segment_all(bv, clone, i, iter_all) { + bio_for_each_segment_all(bv, clone, iter_all) { BUG_ON(!bv->bv_page); mempool_free(bv->bv_page, &cc->page_pool); } @@ -1792,7 +1791,8 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, error = cc->iv_gen_ops->post(cc, org_iv_of_dmreq(cc, dmreq), dmreq); if (error == -EBADMSG) { - DMERR_LIMIT("INTEGRITY AEAD ERROR, sector %llu", + char b[BDEVNAME_SIZE]; + DMERR_LIMIT("%s: INTEGRITY AEAD ERROR, sector %llu", bio_devname(ctx->bio_in, b), (unsigned long long)le64_to_cpu(*org_sector_of_dmreq(cc, dmreq))); io->error = BLK_STS_PROTECTION; } else if (error < 0) @@ -1891,7 +1891,7 @@ static int crypt_alloc_tfms_skcipher(struct crypt_config *cc, char *ciphermode) * algorithm implementation is used. Help people debug performance * problems by logging the ->cra_driver_name. */ - DMINFO("%s using implementation \"%s\"", ciphermode, + DMDEBUG_LIMIT("%s using implementation \"%s\"", ciphermode, crypto_skcipher_alg(any_tfm(cc))->base.cra_driver_name); return 0; } @@ -1911,7 +1911,7 @@ static int crypt_alloc_tfms_aead(struct crypt_config *cc, char *ciphermode) return err; } - DMINFO("%s using implementation \"%s\"", ciphermode, + DMDEBUG_LIMIT("%s using implementation \"%s\"", ciphermode, crypto_aead_alg(any_tfm_aead(cc))->base.cra_driver_name); return 0; } diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index fddffe251bf6..f496213f8b67 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -121,7 +121,8 @@ static void delay_dtr(struct dm_target *ti) { struct delay_c *dc = ti->private; - destroy_workqueue(dc->kdelayd_wq); + if (dc->kdelayd_wq) + destroy_workqueue(dc->kdelayd_wq); if (dc->read.dev) dm_put_device(ti, dc->read.dev); diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c new file mode 100644 index 000000000000..845f376a72d9 --- /dev/null +++ b/drivers/md/dm-dust.c @@ -0,0 +1,515 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Red Hat, Inc. + * + * This is a test "dust" device, which fails reads on specified + * sectors, emulating the behavior of a hard disk drive sending + * a "Read Medium Error" sense. + * + */ + +#include <linux/device-mapper.h> +#include <linux/module.h> +#include <linux/rbtree.h> + +#define DM_MSG_PREFIX "dust" + +struct badblock { + struct rb_node node; + sector_t bb; +}; + +struct dust_device { + struct dm_dev *dev; + struct rb_root badblocklist; + unsigned long long badblock_count; + spinlock_t dust_lock; + unsigned int blksz; + unsigned int sect_per_block; + sector_t start; + bool fail_read_on_bb:1; + bool quiet_mode:1; +}; + +static struct badblock *dust_rb_search(struct rb_root *root, sector_t blk) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct badblock *bblk = rb_entry(node, struct badblock, node); + + if (bblk->bb > blk) + node = node->rb_left; + else if (bblk->bb < blk) + node = node->rb_right; + else + return bblk; + } + + return NULL; +} + +static bool dust_rb_insert(struct rb_root *root, struct badblock *new) +{ + struct badblock *bblk; + struct rb_node **link = &root->rb_node, *parent = NULL; + sector_t value = new->bb; + + while (*link) { + parent = *link; + bblk = rb_entry(parent, struct badblock, node); + + if (bblk->bb > value) + link = &(*link)->rb_left; + else if (bblk->bb < value) + link = &(*link)->rb_right; + else + return false; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, root); + + return true; +} + +static int dust_remove_block(struct dust_device *dd, unsigned long long block) +{ + struct badblock *bblock; + unsigned long flags; + + spin_lock_irqsave(&dd->dust_lock, flags); + bblock = dust_rb_search(&dd->badblocklist, block * dd->sect_per_block); + + if (bblock == NULL) { + if (!dd->quiet_mode) { + DMERR("%s: block %llu not found in badblocklist", + __func__, block); + } + spin_unlock_irqrestore(&dd->dust_lock, flags); + return -EINVAL; + } + + rb_erase(&bblock->node, &dd->badblocklist); + dd->badblock_count--; + if (!dd->quiet_mode) + DMINFO("%s: badblock removed at block %llu", __func__, block); + kfree(bblock); + spin_unlock_irqrestore(&dd->dust_lock, flags); + + return 0; +} + +static int dust_add_block(struct dust_device *dd, unsigned long long block) +{ + struct badblock *bblock; + unsigned long flags; + + bblock = kmalloc(sizeof(*bblock), GFP_KERNEL); + if (bblock == NULL) { + if (!dd->quiet_mode) + DMERR("%s: badblock allocation failed", __func__); + return -ENOMEM; + } + + spin_lock_irqsave(&dd->dust_lock, flags); + bblock->bb = block * dd->sect_per_block; + if (!dust_rb_insert(&dd->badblocklist, bblock)) { + if (!dd->quiet_mode) { + DMERR("%s: block %llu already in badblocklist", + __func__, block); + } + spin_unlock_irqrestore(&dd->dust_lock, flags); + kfree(bblock); + return -EINVAL; + } + + dd->badblock_count++; + if (!dd->quiet_mode) + DMINFO("%s: badblock added at block %llu", __func__, block); + spin_unlock_irqrestore(&dd->dust_lock, flags); + + return 0; +} + +static int dust_query_block(struct dust_device *dd, unsigned long long block) +{ + struct badblock *bblock; + unsigned long flags; + + spin_lock_irqsave(&dd->dust_lock, flags); + bblock = dust_rb_search(&dd->badblocklist, block * dd->sect_per_block); + if (bblock != NULL) + DMINFO("%s: block %llu found in badblocklist", __func__, block); + else + DMINFO("%s: block %llu not found in badblocklist", __func__, block); + spin_unlock_irqrestore(&dd->dust_lock, flags); + + return 0; +} + +static int __dust_map_read(struct dust_device *dd, sector_t thisblock) +{ + struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock); + + if (bblk) + return DM_MAPIO_KILL; + + return DM_MAPIO_REMAPPED; +} + +static int dust_map_read(struct dust_device *dd, sector_t thisblock, + bool fail_read_on_bb) +{ + unsigned long flags; + int ret = DM_MAPIO_REMAPPED; + + if (fail_read_on_bb) { + spin_lock_irqsave(&dd->dust_lock, flags); + ret = __dust_map_read(dd, thisblock); + spin_unlock_irqrestore(&dd->dust_lock, flags); + } + + return ret; +} + +static void __dust_map_write(struct dust_device *dd, sector_t thisblock) +{ + struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock); + + if (bblk) { + rb_erase(&bblk->node, &dd->badblocklist); + dd->badblock_count--; + kfree(bblk); + if (!dd->quiet_mode) { + sector_div(thisblock, dd->sect_per_block); + DMINFO("block %llu removed from badblocklist by write", + (unsigned long long)thisblock); + } + } +} + +static int dust_map_write(struct dust_device *dd, sector_t thisblock, + bool fail_read_on_bb) +{ + unsigned long flags; + + if (fail_read_on_bb) { + spin_lock_irqsave(&dd->dust_lock, flags); + __dust_map_write(dd, thisblock); + spin_unlock_irqrestore(&dd->dust_lock, flags); + } + + return DM_MAPIO_REMAPPED; +} + +static int dust_map(struct dm_target *ti, struct bio *bio) +{ + struct dust_device *dd = ti->private; + int ret; + + bio_set_dev(bio, dd->dev->bdev); + bio->bi_iter.bi_sector = dd->start + dm_target_offset(ti, bio->bi_iter.bi_sector); + + if (bio_data_dir(bio) == READ) + ret = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); + else + ret = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); + + return ret; +} + +static bool __dust_clear_badblocks(struct rb_root *tree, + unsigned long long count) +{ + struct rb_node *node = NULL, *nnode = NULL; + + nnode = rb_first(tree); + if (nnode == NULL) { + BUG_ON(count != 0); + return false; + } + + while (nnode) { + node = nnode; + nnode = rb_next(node); + rb_erase(node, tree); + count--; + kfree(node); + } + BUG_ON(count != 0); + BUG_ON(tree->rb_node != NULL); + + return true; +} + +static int dust_clear_badblocks(struct dust_device *dd) +{ + unsigned long flags; + struct rb_root badblocklist; + unsigned long long badblock_count; + + spin_lock_irqsave(&dd->dust_lock, flags); + badblocklist = dd->badblocklist; + badblock_count = dd->badblock_count; + dd->badblocklist = RB_ROOT; + dd->badblock_count = 0; + spin_unlock_irqrestore(&dd->dust_lock, flags); + + if (!__dust_clear_badblocks(&badblocklist, badblock_count)) + DMINFO("%s: no badblocks found", __func__); + else + DMINFO("%s: badblocks cleared", __func__); + + return 0; +} + +/* + * Target parameters: + * + * <device_path> <offset> <blksz> + * + * device_path: path to the block device + * offset: offset to data area from start of device_path + * blksz: block size (minimum 512, maximum 1073741824, must be a power of 2) + */ +static int dust_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct dust_device *dd; + unsigned long long tmp; + char dummy; + unsigned int blksz; + unsigned int sect_per_block; + sector_t DUST_MAX_BLKSZ_SECTORS = 2097152; + sector_t max_block_sectors = min(ti->len, DUST_MAX_BLKSZ_SECTORS); + + if (argc != 3) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + if (kstrtouint(argv[2], 10, &blksz) || !blksz) { + ti->error = "Invalid block size parameter"; + return -EINVAL; + } + + if (blksz < 512) { + ti->error = "Block size must be at least 512"; + return -EINVAL; + } + + if (!is_power_of_2(blksz)) { + ti->error = "Block size must be a power of 2"; + return -EINVAL; + } + + if (to_sector(blksz) > max_block_sectors) { + ti->error = "Block size is too large"; + return -EINVAL; + } + + sect_per_block = (blksz >> SECTOR_SHIFT); + + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 || tmp != (sector_t)tmp) { + ti->error = "Invalid device offset sector"; + return -EINVAL; + } + + dd = kzalloc(sizeof(struct dust_device), GFP_KERNEL); + if (dd == NULL) { + ti->error = "Cannot allocate context"; + return -ENOMEM; + } + + if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dd->dev)) { + ti->error = "Device lookup failed"; + kfree(dd); + return -EINVAL; + } + + dd->sect_per_block = sect_per_block; + dd->blksz = blksz; + dd->start = tmp; + + /* + * Whether to fail a read on a "bad" block. + * Defaults to false; enabled later by message. + */ + dd->fail_read_on_bb = false; + + /* + * Initialize bad block list rbtree. + */ + dd->badblocklist = RB_ROOT; + dd->badblock_count = 0; + spin_lock_init(&dd->dust_lock); + + dd->quiet_mode = false; + + BUG_ON(dm_set_target_max_io_len(ti, dd->sect_per_block) != 0); + + ti->num_discard_bios = 1; + ti->num_flush_bios = 1; + ti->private = dd; + + return 0; +} + +static void dust_dtr(struct dm_target *ti) +{ + struct dust_device *dd = ti->private; + + __dust_clear_badblocks(&dd->badblocklist, dd->badblock_count); + dm_put_device(ti, dd->dev); + kfree(dd); +} + +static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, + char *result_buf, unsigned int maxlen) +{ + struct dust_device *dd = ti->private; + sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT; + bool invalid_msg = false; + int result = -EINVAL; + unsigned long long tmp, block; + unsigned long flags; + char dummy; + + if (argc == 1) { + if (!strcasecmp(argv[0], "addbadblock") || + !strcasecmp(argv[0], "removebadblock") || + !strcasecmp(argv[0], "queryblock")) { + DMERR("%s requires an additional argument", argv[0]); + } else if (!strcasecmp(argv[0], "disable")) { + DMINFO("disabling read failures on bad sectors"); + dd->fail_read_on_bb = false; + result = 0; + } else if (!strcasecmp(argv[0], "enable")) { + DMINFO("enabling read failures on bad sectors"); + dd->fail_read_on_bb = true; + result = 0; + } else if (!strcasecmp(argv[0], "countbadblocks")) { + spin_lock_irqsave(&dd->dust_lock, flags); + DMINFO("countbadblocks: %llu badblock(s) found", + dd->badblock_count); + spin_unlock_irqrestore(&dd->dust_lock, flags); + result = 0; + } else if (!strcasecmp(argv[0], "clearbadblocks")) { + result = dust_clear_badblocks(dd); + } else if (!strcasecmp(argv[0], "quiet")) { + if (!dd->quiet_mode) + dd->quiet_mode = true; + else + dd->quiet_mode = false; + result = 0; + } else { + invalid_msg = true; + } + } else if (argc == 2) { + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) + return result; + + block = tmp; + sector_div(size, dd->sect_per_block); + if (block > size) { + DMERR("selected block value out of range"); + return result; + } + + if (!strcasecmp(argv[0], "addbadblock")) + result = dust_add_block(dd, block); + else if (!strcasecmp(argv[0], "removebadblock")) + result = dust_remove_block(dd, block); + else if (!strcasecmp(argv[0], "queryblock")) + result = dust_query_block(dd, block); + else + invalid_msg = true; + + } else + DMERR("invalid number of arguments '%d'", argc); + + if (invalid_msg) + DMERR("unrecognized message '%s' received", argv[0]); + + return result; +} + +static void dust_status(struct dm_target *ti, status_type_t type, + unsigned int status_flags, char *result, unsigned int maxlen) +{ + struct dust_device *dd = ti->private; + unsigned int sz = 0; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%s %s %s", dd->dev->name, + dd->fail_read_on_bb ? "fail_read_on_bad_block" : "bypass", + dd->quiet_mode ? "quiet" : "verbose"); + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %llu %u", dd->dev->name, + (unsigned long long)dd->start, dd->blksz); + break; + } +} + +static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +{ + struct dust_device *dd = ti->private; + struct dm_dev *dev = dd->dev; + + *bdev = dev->bdev; + + /* + * Only pass ioctls through if the device sizes match exactly. + */ + if (dd->start || + ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT) + return 1; + + return 0; +} + +static int dust_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, + void *data) +{ + struct dust_device *dd = ti->private; + + return fn(ti, dd->dev, dd->start, ti->len, data); +} + +static struct target_type dust_target = { + .name = "dust", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = dust_ctr, + .dtr = dust_dtr, + .iterate_devices = dust_iterate_devices, + .map = dust_map, + .message = dust_message, + .status = dust_status, + .prepare_ioctl = dust_prepare_ioctl, +}; + +static int __init dm_dust_init(void) +{ + int result = dm_register_target(&dust_target); + + if (result < 0) + DMERR("dm_register_target failed %d", result); + + return result; +} + +static void __exit dm_dust_exit(void) +{ + dm_unregister_target(&dust_target); +} + +module_init(dm_dust_init); +module_exit(dm_dust_exit); + +MODULE_DESCRIPTION(DM_NAME " dust test target"); +MODULE_AUTHOR("Bryan Gurney <dm-devel@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 8e48920a3ffa..bdb84b8e7162 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include "dm.h" #include "persistent-data/dm-transaction-manager.h" #include "persistent-data/dm-bitset.h" diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index 12b5216c2cfe..3f4139ac1f60 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -11,6 +11,7 @@ #define _LINUX_DM_EXCEPTION_STORE #include <linux/blkdev.h> +#include <linux/list_bl.h> #include <linux/device-mapper.h> /* @@ -27,7 +28,7 @@ typedef sector_t chunk_t; * chunk within the device. */ struct dm_exception { - struct list_head hash_list; + struct hlist_bl_node hash_list; chunk_t old_chunk; chunk_t new_chunk; @@ -135,9 +136,8 @@ struct dm_dev *dm_snap_cow(struct dm_snapshot *snap); /* * Funtions to manipulate consecutive chunks */ -# if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) -# define DM_CHUNK_CONSECUTIVE_BITS 8 -# define DM_CHUNK_NUMBER_BITS 56 +#define DM_CHUNK_CONSECUTIVE_BITS 8 +#define DM_CHUNK_NUMBER_BITS 56 static inline chunk_t dm_chunk_number(chunk_t chunk) { @@ -163,29 +163,6 @@ static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) e->new_chunk -= (1ULL << DM_CHUNK_NUMBER_BITS); } -# else -# define DM_CHUNK_CONSECUTIVE_BITS 0 - -static inline chunk_t dm_chunk_number(chunk_t chunk) -{ - return chunk; -} - -static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e) -{ - return 0; -} - -static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e) -{ -} - -static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e) -{ -} - -# endif - /* * Return the number of sectors in the device. */ diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c index 4b76f84424c3..728733a514c7 100644 --- a/drivers/md/dm-init.c +++ b/drivers/md/dm-init.c @@ -140,8 +140,8 @@ static char __init *dm_parse_table_entry(struct dm_device *dev, char *str) return ERR_PTR(-EINVAL); } /* target_args */ - dev->target_args_array[n] = kstrndup(field[3], GFP_KERNEL, - DM_MAX_STR_SIZE); + dev->target_args_array[n] = kstrndup(field[3], DM_MAX_STR_SIZE, + GFP_KERNEL); if (!dev->target_args_array[n]) return ERR_PTR(-ENOMEM); @@ -160,7 +160,7 @@ static int __init dm_parse_table(struct dm_device *dev, char *str) while (table_entry) { DMDEBUG("parsing table \"%s\"", str); - if (++dev->dmi.target_count >= DM_MAX_TARGETS) { + if (++dev->dmi.target_count > DM_MAX_TARGETS) { DMERR("too many targets %u > %d", dev->dmi.target_count, DM_MAX_TARGETS); return -EINVAL; @@ -242,9 +242,9 @@ static int __init dm_parse_devices(struct list_head *devices, char *str) return -ENOMEM; list_add_tail(&dev->list, devices); - if (++ndev >= DM_MAX_DEVICES) { - DMERR("too many targets %u > %d", - dev->dmi.target_count, DM_MAX_TARGETS); + if (++ndev > DM_MAX_DEVICES) { + DMERR("too many devices %lu > %d", + ndev, DM_MAX_DEVICES); return -EINVAL; } @@ -272,10 +272,10 @@ static int __init dm_init_init(void) return 0; if (strlen(create) >= DM_MAX_STR_SIZE) { - DMERR("Argument is too big. Limit is %d\n", DM_MAX_STR_SIZE); + DMERR("Argument is too big. Limit is %d", DM_MAX_STR_SIZE); return -EINVAL; } - str = kstrndup(create, GFP_KERNEL, DM_MAX_STR_SIZE); + str = kstrndup(create, DM_MAX_STR_SIZE, GFP_KERNEL); if (!str) return -ENOMEM; @@ -283,7 +283,7 @@ static int __init dm_init_init(void) if (r) goto out; - DMINFO("waiting for all devices to be available before creating mapped devices\n"); + DMINFO("waiting for all devices to be available before creating mapped devices"); wait_for_device_probe(); list_for_each_entry(dev, &devices, list) { diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 7c678f50aaa3..44e76cda087a 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -15,6 +15,7 @@ #include <linux/rbtree.h> #include <linux/delay.h> #include <linux/random.h> +#include <linux/reboot.h> #include <crypto/hash.h> #include <crypto/skcipher.h> #include <linux/async_tx.h> @@ -24,6 +25,7 @@ #define DEFAULT_INTERLEAVE_SECTORS 32768 #define DEFAULT_JOURNAL_SIZE_FACTOR 7 +#define DEFAULT_SECTORS_PER_BITMAP_BIT 32768 #define DEFAULT_BUFFER_SECTORS 128 #define DEFAULT_JOURNAL_WATERMARK 50 #define DEFAULT_SYNC_MSEC 10000 @@ -33,6 +35,8 @@ #define METADATA_WORKQUEUE_MAX_ACTIVE 16 #define RECALC_SECTORS 8192 #define RECALC_WRITE_SUPER 16 +#define BITMAP_BLOCK_SIZE 4096 /* don't change it */ +#define BITMAP_FLUSH_INTERVAL (10 * HZ) /* * Warning - DEBUG_PRINT prints security-sensitive data to the log, @@ -48,6 +52,7 @@ #define SB_MAGIC "integrt" #define SB_VERSION_1 1 #define SB_VERSION_2 2 +#define SB_VERSION_3 3 #define SB_SECTORS 8 #define MAX_SECTORS_PER_BLOCK 8 @@ -60,12 +65,14 @@ struct superblock { __u64 provided_data_sectors; /* userspace uses this value */ __u32 flags; __u8 log2_sectors_per_block; - __u8 pad[3]; + __u8 log2_blocks_per_bitmap_bit; + __u8 pad[2]; __u64 recalc_sector; }; #define SB_FLAG_HAVE_JOURNAL_MAC 0x1 #define SB_FLAG_RECALCULATING 0x2 +#define SB_FLAG_DIRTY_BITMAP 0x4 #define JOURNAL_ENTRY_ROUNDUP 8 @@ -88,14 +95,10 @@ struct journal_entry { #if BITS_PER_LONG == 64 #define journal_entry_set_sector(je, x) do { smp_wmb(); WRITE_ONCE((je)->u.sector, cpu_to_le64(x)); } while (0) -#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) -#elif defined(CONFIG_LBDAF) -#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32((x) >> 32)); } while (0) -#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) #else -#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32(0)); } while (0) -#define journal_entry_get_sector(je) le32_to_cpu((je)->u.s.sector_lo) +#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); WRITE_ONCE((je)->u.s.sector_hi, cpu_to_le32((x) >> 32)); } while (0) #endif +#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) #define journal_entry_is_unused(je) ((je)->u.s.sector_hi == cpu_to_le32(-1)) #define journal_entry_set_unused(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-1)); } while (0) #define journal_entry_is_inprogress(je) ((je)->u.s.sector_hi == cpu_to_le32(-2)) @@ -155,9 +158,18 @@ struct dm_integrity_c { struct workqueue_struct *metadata_wq; struct superblock *sb; unsigned journal_pages; + unsigned n_bitmap_blocks; + struct page_list *journal; struct page_list *journal_io; struct page_list *journal_xor; + struct page_list *recalc_bitmap; + struct page_list *may_write_bitmap; + struct bitmap_block_status *bbs; + unsigned bitmap_flush_interval; + int synchronous_mode; + struct bio_list synchronous_bios; + struct delayed_work bitmap_flush_work; struct crypto_skcipher *journal_crypt; struct scatterlist **journal_scatterlist; @@ -184,6 +196,7 @@ struct dm_integrity_c { __s8 log2_metadata_run; __u8 log2_buffer_sectors; __u8 sectors_per_block; + __u8 log2_blocks_per_bitmap_bit; unsigned char mode; int suspending; @@ -236,17 +249,20 @@ struct dm_integrity_c { bool journal_uptodate; bool just_formatted; + bool recalculate_flag; struct alg_spec internal_hash_alg; struct alg_spec journal_crypt_alg; struct alg_spec journal_mac_alg; atomic64_t number_of_mismatches; + + struct notifier_block reboot_notifier; }; struct dm_integrity_range { sector_t logical_sector; - unsigned n_sectors; + sector_t n_sectors; bool waiting; union { struct rb_node node; @@ -292,6 +308,16 @@ struct journal_io { struct journal_completion *comp; }; +struct bitmap_block_status { + struct work_struct work; + struct dm_integrity_c *ic; + unsigned idx; + unsigned long *bitmap; + struct bio_list bio_queue; + spinlock_t bio_queue_lock; + +}; + static struct kmem_cache *journal_io_cache; #define JOURNAL_IO_MEMPOOL 32 @@ -427,7 +453,9 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr) static void sb_set_version(struct dm_integrity_c *ic) { - if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) + if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) + ic->sb->version = SB_VERSION_3; + else if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) ic->sb->version = SB_VERSION_2; else ic->sb->version = SB_VERSION_1; @@ -451,6 +479,137 @@ static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags) return dm_io(&io_req, 1, &io_loc, NULL); } +#define BITMAP_OP_TEST_ALL_SET 0 +#define BITMAP_OP_TEST_ALL_CLEAR 1 +#define BITMAP_OP_SET 2 +#define BITMAP_OP_CLEAR 3 + +static bool block_bitmap_op(struct dm_integrity_c *ic, struct page_list *bitmap, + sector_t sector, sector_t n_sectors, int mode) +{ + unsigned long bit, end_bit, this_end_bit, page, end_page; + unsigned long *data; + + if (unlikely(((sector | n_sectors) & ((1 << ic->sb->log2_sectors_per_block) - 1)) != 0)) { + DMCRIT("invalid bitmap access (%llx,%llx,%d,%d,%d)", + (unsigned long long)sector, + (unsigned long long)n_sectors, + ic->sb->log2_sectors_per_block, + ic->log2_blocks_per_bitmap_bit, + mode); + BUG(); + } + + if (unlikely(!n_sectors)) + return true; + + bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + end_bit = (sector + n_sectors - 1) >> + (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + + page = bit / (PAGE_SIZE * 8); + bit %= PAGE_SIZE * 8; + + end_page = end_bit / (PAGE_SIZE * 8); + end_bit %= PAGE_SIZE * 8; + +repeat: + if (page < end_page) { + this_end_bit = PAGE_SIZE * 8 - 1; + } else { + this_end_bit = end_bit; + } + + data = lowmem_page_address(bitmap[page].page); + + if (mode == BITMAP_OP_TEST_ALL_SET) { + while (bit <= this_end_bit) { + if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) { + do { + if (data[bit / BITS_PER_LONG] != -1) + return false; + bit += BITS_PER_LONG; + } while (this_end_bit >= bit + BITS_PER_LONG - 1); + continue; + } + if (!test_bit(bit, data)) + return false; + bit++; + } + } else if (mode == BITMAP_OP_TEST_ALL_CLEAR) { + while (bit <= this_end_bit) { + if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) { + do { + if (data[bit / BITS_PER_LONG] != 0) + return false; + bit += BITS_PER_LONG; + } while (this_end_bit >= bit + BITS_PER_LONG - 1); + continue; + } + if (test_bit(bit, data)) + return false; + bit++; + } + } else if (mode == BITMAP_OP_SET) { + while (bit <= this_end_bit) { + if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) { + do { + data[bit / BITS_PER_LONG] = -1; + bit += BITS_PER_LONG; + } while (this_end_bit >= bit + BITS_PER_LONG - 1); + continue; + } + __set_bit(bit, data); + bit++; + } + } else if (mode == BITMAP_OP_CLEAR) { + if (!bit && this_end_bit == PAGE_SIZE * 8 - 1) + clear_page(data); + else while (bit <= this_end_bit) { + if (!(bit % BITS_PER_LONG) && this_end_bit >= bit + BITS_PER_LONG - 1) { + do { + data[bit / BITS_PER_LONG] = 0; + bit += BITS_PER_LONG; + } while (this_end_bit >= bit + BITS_PER_LONG - 1); + continue; + } + __clear_bit(bit, data); + bit++; + } + } else { + BUG(); + } + + if (unlikely(page < end_page)) { + bit = 0; + page++; + goto repeat; + } + + return true; +} + +static void block_bitmap_copy(struct dm_integrity_c *ic, struct page_list *dst, struct page_list *src) +{ + unsigned n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE); + unsigned i; + + for (i = 0; i < n_bitmap_pages; i++) { + unsigned long *dst_data = lowmem_page_address(dst[i].page); + unsigned long *src_data = lowmem_page_address(src[i].page); + copy_page(dst_data, src_data); + } +} + +static struct bitmap_block_status *sector_to_bitmap_block(struct dm_integrity_c *ic, sector_t sector) +{ + unsigned bit = sector >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + unsigned bitmap_block = bit / (BITMAP_BLOCK_SIZE * 8); + + BUG_ON(bitmap_block >= ic->n_bitmap_blocks); + return &ic->bbs[bitmap_block]; +} + static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset, bool e, const char *function) { @@ -459,8 +618,8 @@ static void access_journal_check(struct dm_integrity_c *ic, unsigned section, un if (unlikely(section >= ic->journal_sections) || unlikely(offset >= limit)) { - printk(KERN_CRIT "%s: invalid access at (%u,%u), limit (%u,%u)\n", - function, section, offset, ic->journal_sections, limit); + DMCRIT("%s: invalid access at (%u,%u), limit (%u,%u)", + function, section, offset, ic->journal_sections, limit); BUG(); } #endif @@ -532,7 +691,6 @@ static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result unsigned j, size; desc->tfm = ic->journal_mac; - desc->flags = 0; r = crypto_shash_init(desc); if (unlikely(r)) { @@ -761,12 +919,12 @@ static void complete_journal_io(unsigned long error, void *context) complete_journal_op(comp); } -static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section, - unsigned n_sections, struct journal_completion *comp) +static void rw_journal_sectors(struct dm_integrity_c *ic, int op, int op_flags, + unsigned sector, unsigned n_sectors, struct journal_completion *comp) { struct dm_io_request io_req; struct dm_io_region io_loc; - unsigned sector, n_sectors, pl_index, pl_offset; + unsigned pl_index, pl_offset; int r; if (unlikely(dm_integrity_failed(ic))) { @@ -775,9 +933,6 @@ static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned return; } - sector = section * ic->journal_section_sectors; - n_sectors = n_sections * ic->journal_section_sectors; - pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); @@ -810,6 +965,17 @@ static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned } } +static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section, + unsigned n_sections, struct journal_completion *comp) +{ + unsigned sector, n_sectors; + + sector = section * ic->journal_section_sectors; + n_sectors = n_sections * ic->journal_section_sectors; + + rw_journal_sectors(ic, op, op_flags, sector, n_sectors, comp); +} + static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections) { struct journal_completion io_comp; @@ -993,6 +1159,12 @@ static void wait_and_add_new_range(struct dm_integrity_c *ic, struct dm_integrit } while (unlikely(new_range->waiting)); } +static void add_new_range_and_wait(struct dm_integrity_c *ic, struct dm_integrity_range *new_range) +{ + if (unlikely(!add_new_range(ic, new_range, true))) + wait_and_add_new_range(ic, new_range); +} + static void init_journal_node(struct journal_node *node) { RB_CLEAR_NODE(&node->node); @@ -1209,6 +1381,14 @@ static void do_endio(struct dm_integrity_c *ic, struct bio *bio) int r = dm_integrity_failed(ic); if (unlikely(r) && !bio->bi_status) bio->bi_status = errno_to_blk_status(r); + if (unlikely(ic->synchronous_mode) && bio_op(bio) == REQ_OP_WRITE) { + unsigned long flags; + spin_lock_irqsave(&ic->endio_wait.lock, flags); + bio_list_add(&ic->synchronous_bios, bio); + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0); + spin_unlock_irqrestore(&ic->endio_wait.lock, flags); + return; + } bio_endio(bio); } @@ -1276,7 +1456,6 @@ static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector unsigned digest_size; req->tfm = ic->internal_hash; - req->flags = 0; r = crypto_shash_init(req); if (unlikely(r < 0)) { @@ -1483,7 +1662,8 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio) else wanted_tag_size *= ic->tag_size; if (unlikely(wanted_tag_size != bip->bip_iter.bi_size)) { - DMERR("Invalid integrity data size %u, expected %u", bip->bip_iter.bi_size, wanted_tag_size); + DMERR("Invalid integrity data size %u, expected %u", + bip->bip_iter.bi_size, wanted_tag_size); return DM_MAPIO_KILL; } } @@ -1687,7 +1867,7 @@ retry: unsigned ws, we, range_sectors; dio->range.n_sectors = min(dio->range.n_sectors, - ic->free_sectors << ic->sb->log2_sectors_per_block); + (sector_t)ic->free_sectors << ic->sb->log2_sectors_per_block); if (unlikely(!dio->range.n_sectors)) { if (from_map) goto offload_to_thread; @@ -1770,6 +1950,20 @@ offload_to_thread: goto journal_read_write; } + if (ic->mode == 'B' && dio->write) { + if (!block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) { + struct bitmap_block_status *bbs; + + bbs = sector_to_bitmap_block(ic, dio->range.logical_sector); + spin_lock(&bbs->bio_queue_lock); + bio_list_add(&bbs->bio_queue, bio); + spin_unlock(&bbs->bio_queue_lock); + queue_work(ic->writer_wq, &bbs->work); + return; + } + } + dio->in_flight = (atomic_t)ATOMIC_INIT(2); if (need_sync_io) { @@ -1796,10 +1990,15 @@ offload_to_thread: if (need_sync_io) { wait_for_completion_io(&read_comp); - if (unlikely(ic->recalc_wq != NULL) && - ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) && + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) && dio->range.logical_sector + dio->range.n_sectors > le64_to_cpu(ic->sb->recalc_sector)) goto skip_check; + if (ic->mode == 'B') { + if (!block_bitmap_op(ic, ic->recalc_bitmap, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_TEST_ALL_CLEAR)) + goto skip_check; + } + if (likely(!bio->bi_status)) integrity_metadata(&dio->work); else @@ -1837,8 +2036,16 @@ static void pad_uncommitted(struct dm_integrity_c *ic) wraparound_section(ic, &ic->free_section); ic->n_uncommitted_sections++; } - WARN_ON(ic->journal_sections * ic->journal_section_entries != - (ic->n_uncommitted_sections + ic->n_committed_sections) * ic->journal_section_entries + ic->free_sectors); + if (WARN_ON(ic->journal_sections * ic->journal_section_entries != + (ic->n_uncommitted_sections + ic->n_committed_sections) * + ic->journal_section_entries + ic->free_sectors)) { + DMCRIT("journal_sections %u, journal_section_entries %u, " + "n_uncommitted_sections %u, n_committed_sections %u, " + "journal_section_entries %u, free_sectors %u", + ic->journal_sections, ic->journal_section_entries, + ic->n_uncommitted_sections, ic->n_committed_sections, + ic->journal_section_entries, ic->free_sectors); + } } static void integrity_commit(struct work_struct *w) @@ -1987,8 +2194,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned write_start, io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block; spin_lock_irq(&ic->endio_wait.lock); - if (unlikely(!add_new_range(ic, &io->range, true))) - wait_and_add_new_range(ic, &io->range); + add_new_range_and_wait(ic, &io->range); if (likely(!from_replay)) { struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries]; @@ -2126,11 +2332,14 @@ static void integrity_recalc(struct work_struct *w) sector_t area, offset; sector_t metadata_block; unsigned metadata_offset; + sector_t logical_sector, n_sectors; __u8 *t; unsigned i; int r; unsigned super_counter = 0; + DEBUG_print("start recalculation... (position %llx)\n", le64_to_cpu(ic->sb->recalc_sector)); + spin_lock_irq(&ic->endio_wait.lock); next_chunk: @@ -2139,21 +2348,49 @@ next_chunk: goto unlock_ret; range.logical_sector = le64_to_cpu(ic->sb->recalc_sector); - if (unlikely(range.logical_sector >= ic->provided_data_sectors)) + if (unlikely(range.logical_sector >= ic->provided_data_sectors)) { + if (ic->mode == 'B') { + DEBUG_print("queue_delayed_work: bitmap_flush_work\n"); + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0); + } goto unlock_ret; + } get_area_and_offset(ic, range.logical_sector, &area, &offset); range.n_sectors = min((sector_t)RECALC_SECTORS, ic->provided_data_sectors - range.logical_sector); if (!ic->meta_dev) - range.n_sectors = min(range.n_sectors, (1U << ic->sb->log2_interleave_sectors) - (unsigned)offset); - - if (unlikely(!add_new_range(ic, &range, true))) - wait_and_add_new_range(ic, &range); + range.n_sectors = min(range.n_sectors, ((sector_t)1U << ic->sb->log2_interleave_sectors) - (unsigned)offset); + add_new_range_and_wait(ic, &range); spin_unlock_irq(&ic->endio_wait.lock); + logical_sector = range.logical_sector; + n_sectors = range.n_sectors; + + if (ic->mode == 'B') { + if (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, n_sectors, BITMAP_OP_TEST_ALL_CLEAR)) { + goto advance_and_next; + } + while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector, + ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) { + logical_sector += ic->sectors_per_block; + n_sectors -= ic->sectors_per_block; + cond_resched(); + } + while (block_bitmap_op(ic, ic->recalc_bitmap, logical_sector + n_sectors - ic->sectors_per_block, + ic->sectors_per_block, BITMAP_OP_TEST_ALL_CLEAR)) { + n_sectors -= ic->sectors_per_block; + cond_resched(); + } + get_area_and_offset(ic, logical_sector, &area, &offset); + } + + DEBUG_print("recalculating: %lx, %lx\n", logical_sector, n_sectors); if (unlikely(++super_counter == RECALC_WRITE_SUPER)) { recalc_write_super(ic); + if (ic->mode == 'B') { + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval); + } super_counter = 0; } @@ -2168,7 +2405,7 @@ next_chunk: io_req.client = ic->io; io_loc.bdev = ic->dev->bdev; io_loc.sector = get_data_sector(ic, area, offset); - io_loc.count = range.n_sectors; + io_loc.count = n_sectors; r = dm_io(&io_req, 1, &io_loc, NULL); if (unlikely(r)) { @@ -2177,8 +2414,8 @@ next_chunk: } t = ic->recalc_tags; - for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) { - integrity_sector_checksum(ic, range.logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t); + for (i = 0; i < n_sectors; i += ic->sectors_per_block) { + integrity_sector_checksum(ic, logical_sector + i, ic->recalc_buffer + (i << SECTOR_SHIFT), t); t += ic->tag_size; } @@ -2190,6 +2427,9 @@ next_chunk: goto err; } +advance_and_next: + cond_resched(); + spin_lock_irq(&ic->endio_wait.lock); remove_range_unlocked(ic, &range); ic->sb->recalc_sector = cpu_to_le64(range.logical_sector + range.n_sectors); @@ -2205,6 +2445,103 @@ unlock_ret: recalc_write_super(ic); } +static void bitmap_block_work(struct work_struct *w) +{ + struct bitmap_block_status *bbs = container_of(w, struct bitmap_block_status, work); + struct dm_integrity_c *ic = bbs->ic; + struct bio *bio; + struct bio_list bio_queue; + struct bio_list waiting; + + bio_list_init(&waiting); + + spin_lock(&bbs->bio_queue_lock); + bio_queue = bbs->bio_queue; + bio_list_init(&bbs->bio_queue); + spin_unlock(&bbs->bio_queue_lock); + + while ((bio = bio_list_pop(&bio_queue))) { + struct dm_integrity_io *dio; + + dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); + + if (block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) { + remove_range(ic, &dio->range); + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->wait_wq, &dio->work); + } else { + block_bitmap_op(ic, ic->journal, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_SET); + bio_list_add(&waiting, bio); + } + } + + if (bio_list_empty(&waiting)) + return; + + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, + bbs->idx * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), + BITMAP_BLOCK_SIZE >> SECTOR_SHIFT, NULL); + + while ((bio = bio_list_pop(&waiting))) { + struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); + + block_bitmap_op(ic, ic->may_write_bitmap, dio->range.logical_sector, + dio->range.n_sectors, BITMAP_OP_SET); + + remove_range(ic, &dio->range); + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->wait_wq, &dio->work); + } + + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval); +} + +static void bitmap_flush_work(struct work_struct *work) +{ + struct dm_integrity_c *ic = container_of(work, struct dm_integrity_c, bitmap_flush_work.work); + struct dm_integrity_range range; + unsigned long limit; + struct bio *bio; + + dm_integrity_flush_buffers(ic); + + range.logical_sector = 0; + range.n_sectors = ic->provided_data_sectors; + + spin_lock_irq(&ic->endio_wait.lock); + add_new_range_and_wait(ic, &range); + spin_unlock_irq(&ic->endio_wait.lock); + + dm_integrity_flush_buffers(ic); + if (ic->meta_dev) + blkdev_issue_flush(ic->dev->bdev, GFP_NOIO, NULL); + + limit = ic->provided_data_sectors; + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { + limit = le64_to_cpu(ic->sb->recalc_sector) + >> (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit) + << (ic->sb->log2_sectors_per_block + ic->log2_blocks_per_bitmap_bit); + } + /*DEBUG_print("zeroing journal\n");*/ + block_bitmap_op(ic, ic->journal, 0, limit, BITMAP_OP_CLEAR); + block_bitmap_op(ic, ic->may_write_bitmap, 0, limit, BITMAP_OP_CLEAR); + + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + + spin_lock_irq(&ic->endio_wait.lock); + remove_range_unlocked(ic, &range); + while (unlikely((bio = bio_list_pop(&ic->synchronous_bios)) != NULL)) { + bio_endio(bio); + spin_unlock_irq(&ic->endio_wait.lock); + spin_lock_irq(&ic->endio_wait.lock); + } + spin_unlock_irq(&ic->endio_wait.lock); +} + + static void init_journal(struct dm_integrity_c *ic, unsigned start_section, unsigned n_sections, unsigned char commit_seq) { @@ -2401,9 +2738,37 @@ clear_journal: init_journal_node(&ic->journal_tree[i]); } +static void dm_integrity_enter_synchronous_mode(struct dm_integrity_c *ic) +{ + DEBUG_print("dm_integrity_enter_synchronous_mode\n"); + + if (ic->mode == 'B') { + ic->bitmap_flush_interval = msecs_to_jiffies(10) + 1; + ic->synchronous_mode = 1; + + cancel_delayed_work_sync(&ic->bitmap_flush_work); + queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, 0); + flush_workqueue(ic->commit_wq); + } +} + +static int dm_integrity_reboot(struct notifier_block *n, unsigned long code, void *x) +{ + struct dm_integrity_c *ic = container_of(n, struct dm_integrity_c, reboot_notifier); + + DEBUG_print("dm_integrity_reboot\n"); + + dm_integrity_enter_synchronous_mode(ic); + + return NOTIFY_DONE; +} + static void dm_integrity_postsuspend(struct dm_target *ti) { struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + int r; + + WARN_ON(unregister_reboot_notifier(&ic->reboot_notifier)); del_timer_sync(&ic->autocommit_timer); @@ -2412,6 +2777,9 @@ static void dm_integrity_postsuspend(struct dm_target *ti) if (ic->recalc_wq) drain_workqueue(ic->recalc_wq); + if (ic->mode == 'B') + cancel_delayed_work_sync(&ic->bitmap_flush_work); + queue_work(ic->commit_wq, &ic->commit_work); drain_workqueue(ic->commit_wq); @@ -2422,6 +2790,18 @@ static void dm_integrity_postsuspend(struct dm_target *ti) dm_integrity_flush_buffers(ic); } + if (ic->mode == 'B') { + dm_integrity_flush_buffers(ic); +#if 1 + /* set to 0 to test bitmap replay code */ + init_journal(ic, 0, ic->journal_sections, 0); + ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP); + r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); +#endif + } + WRITE_ONCE(ic->suspending, 0); BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); @@ -2432,11 +2812,70 @@ static void dm_integrity_postsuspend(struct dm_target *ti) static void dm_integrity_resume(struct dm_target *ti) { struct dm_integrity_c *ic = (struct dm_integrity_c *)ti->private; + int r; + DEBUG_print("resume\n"); + + if (ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) { + DEBUG_print("resume dirty_bitmap\n"); + rw_journal_sectors(ic, REQ_OP_READ, 0, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + if (ic->mode == 'B') { + if (ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit) { + block_bitmap_copy(ic, ic->recalc_bitmap, ic->journal); + block_bitmap_copy(ic, ic->may_write_bitmap, ic->journal); + if (!block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, + BITMAP_OP_TEST_ALL_CLEAR)) { + ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); + ic->sb->recalc_sector = cpu_to_le64(0); + } + } else { + DEBUG_print("non-matching blocks_per_bitmap_bit: %u, %u\n", + ic->sb->log2_blocks_per_bitmap_bit, ic->log2_blocks_per_bitmap_bit); + ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit; + block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET); + block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_SET); + block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_SET); + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); + ic->sb->recalc_sector = cpu_to_le64(0); + } + } else { + if (!(ic->sb->log2_blocks_per_bitmap_bit == ic->log2_blocks_per_bitmap_bit && + block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_TEST_ALL_CLEAR))) { + ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); + ic->sb->recalc_sector = cpu_to_le64(0); + } + init_journal(ic, 0, ic->journal_sections, 0); + replay_journal(ic); + ic->sb->flags &= ~cpu_to_le32(SB_FLAG_DIRTY_BITMAP); + } + r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); + } else { + replay_journal(ic); + if (ic->mode == 'B') { + int mode; + ic->sb->flags |= cpu_to_le32(SB_FLAG_DIRTY_BITMAP); + ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit; + r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing superblock", r); + + mode = ic->recalculate_flag ? BITMAP_OP_SET : BITMAP_OP_CLEAR; + block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, mode); + block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, mode); + block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, mode); + rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, + ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); + } + } - replay_journal(ic); - - if (ic->recalc_wq && ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { + DEBUG_print("testing recalc: %x\n", ic->sb->flags); + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { __u64 recalc_pos = le64_to_cpu(ic->sb->recalc_sector); + DEBUG_print("recalc pos: %lx / %lx\n", (long)recalc_pos, ic->provided_data_sectors); if (recalc_pos < ic->provided_data_sectors) { queue_work(ic->recalc_wq, &ic->recalc_work); } else if (recalc_pos > ic->provided_data_sectors) { @@ -2444,6 +2883,16 @@ static void dm_integrity_resume(struct dm_target *ti) recalc_write_super(ic); } } + + ic->reboot_notifier.notifier_call = dm_integrity_reboot; + ic->reboot_notifier.next = NULL; + ic->reboot_notifier.priority = INT_MAX - 1; /* be notified after md and before hardware drivers */ + WARN_ON(register_reboot_notifier(&ic->reboot_notifier)); + +#if 0 + /* set to 1 to stress test synchronous mode */ + dm_integrity_enter_synchronous_mode(ic); +#endif } static void dm_integrity_status(struct dm_target *ti, status_type_t type, @@ -2468,10 +2917,14 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, __u64 watermark_percentage = (__u64)(ic->journal_entries - ic->free_sectors_threshold) * 100; watermark_percentage += ic->journal_entries / 2; do_div(watermark_percentage, ic->journal_entries); - arg_count = 5; + arg_count = 3; arg_count += !!ic->meta_dev; arg_count += ic->sectors_per_block != 1; arg_count += !!(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)); + arg_count += ic->mode == 'J'; + arg_count += ic->mode == 'J'; + arg_count += ic->mode == 'B'; + arg_count += ic->mode == 'B'; arg_count += !!ic->internal_hash_alg.alg_string; arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; @@ -2481,13 +2934,19 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" meta_device:%s", ic->meta_dev->name); if (ic->sectors_per_block != 1) DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT); - if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) + if (ic->recalculate_flag) DMEMIT(" recalculate"); DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors); - DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage); - DMEMIT(" commit_time:%u", ic->autocommit_msec); + if (ic->mode == 'J') { + DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage); + DMEMIT(" commit_time:%u", ic->autocommit_msec); + } + if (ic->mode == 'B') { + DMEMIT(" sectors_per_bit:%llu", (unsigned long long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit); + DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval)); + } #define EMIT_ALG(a, n) \ do { \ @@ -2568,7 +3027,7 @@ static int calculate_device_limits(struct dm_integrity_c *ic) if (last_sector < ic->start || last_sector >= ic->meta_device_sectors) return -EINVAL; } else { - __u64 meta_size = ic->provided_data_sectors * ic->tag_size; + __u64 meta_size = (ic->provided_data_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size; meta_size = (meta_size + ((1U << (ic->log2_buffer_sectors + SECTOR_SHIFT)) - 1)) >> (ic->log2_buffer_sectors + SECTOR_SHIFT); meta_size <<= ic->log2_buffer_sectors; @@ -2665,37 +3124,37 @@ static void dm_integrity_set(struct dm_target *ti, struct dm_integrity_c *ic) blk_queue_max_integrity_segments(disk->queue, UINT_MAX); } -static void dm_integrity_free_page_list(struct dm_integrity_c *ic, struct page_list *pl) +static void dm_integrity_free_page_list(struct page_list *pl) { unsigned i; if (!pl) return; - for (i = 0; i < ic->journal_pages; i++) - if (pl[i].page) - __free_page(pl[i].page); + for (i = 0; pl[i].page; i++) + __free_page(pl[i].page); kvfree(pl); } -static struct page_list *dm_integrity_alloc_page_list(struct dm_integrity_c *ic) +static struct page_list *dm_integrity_alloc_page_list(unsigned n_pages) { - size_t page_list_desc_size = ic->journal_pages * sizeof(struct page_list); struct page_list *pl; unsigned i; - pl = kvmalloc(page_list_desc_size, GFP_KERNEL | __GFP_ZERO); + pl = kvmalloc_array(n_pages + 1, sizeof(struct page_list), GFP_KERNEL | __GFP_ZERO); if (!pl) return NULL; - for (i = 0; i < ic->journal_pages; i++) { + for (i = 0; i < n_pages; i++) { pl[i].page = alloc_page(GFP_KERNEL); if (!pl[i].page) { - dm_integrity_free_page_list(ic, pl); + dm_integrity_free_page_list(pl); return NULL; } if (i) pl[i - 1].next = &pl[i]; } + pl[i].page = NULL; + pl[i].next = NULL; return pl; } @@ -2708,7 +3167,8 @@ static void dm_integrity_free_journal_scatterlist(struct dm_integrity_c *ic, str kvfree(sl); } -static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, struct page_list *pl) +static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_integrity_c *ic, + struct page_list *pl) { struct scatterlist **sl; unsigned i; @@ -2727,7 +3187,8 @@ static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_int unsigned idx; page_list_location(ic, i, 0, &start_index, &start_offset); - page_list_location(ic, i, ic->journal_section_sectors - 1, &end_index, &end_offset); + page_list_location(ic, i, ic->journal_section_sectors - 1, + &end_index, &end_offset); n_pages = (end_index - start_index + 1); @@ -2848,7 +3309,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error) } ic->journal_pages = journal_pages; - ic->journal = dm_integrity_alloc_page_list(ic); + ic->journal = dm_integrity_alloc_page_list(ic->journal_pages); if (!ic->journal) { *error = "Could not allocate memory for journal"; r = -ENOMEM; @@ -2880,7 +3341,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error) DEBUG_print("cipher %s, block size %u iv size %u\n", ic->journal_crypt_alg.alg_string, blocksize, ivsize); - ic->journal_io = dm_integrity_alloc_page_list(ic); + ic->journal_io = dm_integrity_alloc_page_list(ic->journal_pages); if (!ic->journal_io) { *error = "Could not allocate memory for journal io"; r = -ENOMEM; @@ -2904,7 +3365,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error) goto bad; } - ic->journal_xor = dm_integrity_alloc_page_list(ic); + ic->journal_xor = dm_integrity_alloc_page_list(ic->journal_pages); if (!ic->journal_xor) { *error = "Could not allocate memory for journal xor"; r = -ENOMEM; @@ -2928,7 +3389,8 @@ static int create_journal(struct dm_integrity_c *ic, char **error) sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids); memset(crypt_iv, 0x00, ivsize); - skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv); + skcipher_request_set_crypt(req, sg, sg, + PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv); init_completion(&comp.comp); comp.in_flight = (atomic_t)ATOMIC_INIT(1); if (do_crypt(true, req, &comp)) @@ -3069,7 +3531,7 @@ bad: * device * offset from the start of the device * tag size - * D - direct writes, J - journal writes, R - recovery mode + * D - direct writes, J - journal writes, B - bitmap mode, R - recovery mode * number of optional arguments * optional arguments: * journal_sectors @@ -3077,10 +3539,14 @@ bad: * buffer_sectors * journal_watermark * commit_time + * meta_device + * block_size + * sectors_per_bit + * bitmap_flush_interval * internal_hash * journal_crypt * journal_mac - * block_size + * recalculate */ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) { @@ -3093,10 +3559,13 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) {0, 9, "Invalid number of feature args"}, }; unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; - bool recalculate; bool should_write_sb; __u64 threshold; unsigned long long start; + __s8 log2_sectors_per_bitmap_bit = -1; + __s8 log2_blocks_per_bitmap_bit; + __u64 bits_in_journal; + __u64 n_bitmap_bits; #define DIRECT_ARGUMENTS 4 @@ -3120,6 +3589,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) init_waitqueue_head(&ic->copy_to_journal_wait); init_completion(&ic->crypto_backoff); atomic64_set(&ic->number_of_mismatches, 0); + ic->bitmap_flush_interval = BITMAP_FLUSH_INTERVAL; r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ic->dev); if (r) { @@ -3142,10 +3612,11 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) } } - if (!strcmp(argv[3], "J") || !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) + if (!strcmp(argv[3], "J") || !strcmp(argv[3], "B") || + !strcmp(argv[3], "D") || !strcmp(argv[3], "R")) { ic->mode = argv[3][0]; - else { - ti->error = "Invalid mode (expecting J, D, R)"; + } else { + ti->error = "Invalid mode (expecting J, B, D, R)"; r = -EINVAL; goto bad; } @@ -3155,7 +3626,6 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) buffer_sectors = DEFAULT_BUFFER_SECTORS; journal_watermark = DEFAULT_JOURNAL_WATERMARK; sync_msec = DEFAULT_SYNC_MSEC; - recalculate = false; ic->sectors_per_block = 1; as.argc = argc - DIRECT_ARGUMENTS; @@ -3167,6 +3637,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) while (extra_args--) { const char *opt_string; unsigned val; + unsigned long long llval; opt_string = dm_shift_arg(&as); if (!opt_string) { r = -EINVAL; @@ -3188,7 +3659,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) dm_put_device(ti, ic->meta_dev); ic->meta_dev = NULL; } - r = dm_get_device(ti, strchr(opt_string, ':') + 1, dm_table_get_mode(ti->table), &ic->meta_dev); + r = dm_get_device(ti, strchr(opt_string, ':') + 1, + dm_table_get_mode(ti->table), &ic->meta_dev); if (r) { ti->error = "Device lookup failed"; goto bad; @@ -3202,6 +3674,14 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } ic->sectors_per_block = val >> SECTOR_SHIFT; + } else if (sscanf(opt_string, "sectors_per_bit:%llu%c", &llval, &dummy) == 1) { + log2_sectors_per_bitmap_bit = !llval ? 0 : __ilog2_u64(llval); + } else if (sscanf(opt_string, "bitmap_flush_interval:%u%c", &val, &dummy) == 1) { + if (val >= (uint64_t)UINT_MAX * 1000 / HZ) { + r = -EINVAL; + ti->error = "Invalid bitmap_flush_interval argument"; + } + ic->bitmap_flush_interval = msecs_to_jiffies(val); } else if (!strncmp(opt_string, "internal_hash:", strlen("internal_hash:"))) { r = get_alg_and_key(opt_string, &ic->internal_hash_alg, &ti->error, "Invalid internal_hash argument"); @@ -3218,7 +3698,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) if (r) goto bad; } else if (!strcmp(opt_string, "recalculate")) { - recalculate = true; + ic->recalculate_flag = true; } else { r = -EINVAL; ti->error = "Invalid argument"; @@ -3234,7 +3714,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) if (!journal_sectors) { journal_sectors = min((sector_t)DEFAULT_MAX_JOURNAL_SECTORS, - ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR); + ic->data_device_sectors >> DEFAULT_JOURNAL_SIZE_FACTOR); } if (!buffer_sectors) @@ -3269,6 +3749,12 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) else ic->log2_tag_size = -1; + if (ic->mode == 'B' && !ic->internal_hash) { + r = -EINVAL; + ti->error = "Bitmap mode can be only used with internal hash"; + goto bad; + } + ic->autocommit_jiffies = msecs_to_jiffies(sync_msec); ic->autocommit_msec = sync_msec; timer_setup(&ic->autocommit_timer, autocommit_fn, 0); @@ -3314,7 +3800,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) } INIT_WORK(&ic->commit_work, integrity_commit); - if (ic->mode == 'J') { + if (ic->mode == 'J' || ic->mode == 'B') { ic->writer_wq = alloc_workqueue("dm-integrity-writer", WQ_MEM_RECLAIM, 1); if (!ic->writer_wq) { ti->error = "Cannot allocate workqueue"; @@ -3355,7 +3841,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) should_write_sb = true; } - if (!ic->sb->version || ic->sb->version > SB_VERSION_2) { + if (!ic->sb->version || ic->sb->version > SB_VERSION_3) { r = -EINVAL; ti->error = "Unknown version"; goto bad; @@ -3415,6 +3901,27 @@ try_smaller_buffer: ti->error = "The device is too small"; goto bad; } + + if (log2_sectors_per_bitmap_bit < 0) + log2_sectors_per_bitmap_bit = __fls(DEFAULT_SECTORS_PER_BITMAP_BIT); + if (log2_sectors_per_bitmap_bit < ic->sb->log2_sectors_per_block) + log2_sectors_per_bitmap_bit = ic->sb->log2_sectors_per_block; + + bits_in_journal = ((__u64)ic->journal_section_sectors * ic->journal_sections) << (SECTOR_SHIFT + 3); + if (bits_in_journal > UINT_MAX) + bits_in_journal = UINT_MAX; + while (bits_in_journal < (ic->provided_data_sectors + ((sector_t)1 << log2_sectors_per_bitmap_bit) - 1) >> log2_sectors_per_bitmap_bit) + log2_sectors_per_bitmap_bit++; + + log2_blocks_per_bitmap_bit = log2_sectors_per_bitmap_bit - ic->sb->log2_sectors_per_block; + ic->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit; + if (should_write_sb) { + ic->sb->log2_blocks_per_bitmap_bit = log2_blocks_per_bitmap_bit; + } + n_bitmap_bits = ((ic->provided_data_sectors >> ic->sb->log2_sectors_per_block) + + (((sector_t)1 << log2_blocks_per_bitmap_bit) - 1)) >> log2_blocks_per_bitmap_bit; + ic->n_bitmap_blocks = DIV_ROUND_UP(n_bitmap_bits, BITMAP_BLOCK_SIZE * 8); + if (!ic->meta_dev) ic->log2_buffer_sectors = min(ic->log2_buffer_sectors, (__u8)__ffs(ic->metadata_run)); @@ -3439,25 +3946,21 @@ try_smaller_buffer: DEBUG_print(" journal_sections %u\n", (unsigned)le32_to_cpu(ic->sb->journal_sections)); DEBUG_print(" journal_entries %u\n", ic->journal_entries); DEBUG_print(" log2_interleave_sectors %d\n", ic->sb->log2_interleave_sectors); - DEBUG_print(" device_sectors 0x%llx\n", (unsigned long long)ic->device_sectors); + DEBUG_print(" data_device_sectors 0x%llx\n", i_size_read(ic->dev->bdev->bd_inode) >> SECTOR_SHIFT); DEBUG_print(" initial_sectors 0x%x\n", ic->initial_sectors); DEBUG_print(" metadata_run 0x%x\n", ic->metadata_run); DEBUG_print(" log2_metadata_run %d\n", ic->log2_metadata_run); DEBUG_print(" provided_data_sectors 0x%llx (%llu)\n", (unsigned long long)ic->provided_data_sectors, (unsigned long long)ic->provided_data_sectors); DEBUG_print(" log2_buffer_sectors %u\n", ic->log2_buffer_sectors); + DEBUG_print(" bits_in_journal %llu\n", (unsigned long long)bits_in_journal); - if (recalculate && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) { + if (ic->recalculate_flag && !(ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING))) { ic->sb->flags |= cpu_to_le32(SB_FLAG_RECALCULATING); ic->sb->recalc_sector = cpu_to_le64(0); } - if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { - if (!ic->internal_hash) { - r = -EINVAL; - ti->error = "Recalculate is only valid with internal hash"; - goto bad; - } + if (ic->internal_hash) { ic->recalc_wq = alloc_workqueue("dm-integrity-recalc", WQ_MEM_RECLAIM, 1); if (!ic->recalc_wq ) { ti->error = "Cannot allocate workqueue"; @@ -3494,6 +3997,45 @@ try_smaller_buffer: r = create_journal(ic, &ti->error); if (r) goto bad; + + } + + if (ic->mode == 'B') { + unsigned i; + unsigned n_bitmap_pages = DIV_ROUND_UP(ic->n_bitmap_blocks, PAGE_SIZE / BITMAP_BLOCK_SIZE); + + ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); + if (!ic->recalc_bitmap) { + r = -ENOMEM; + goto bad; + } + ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); + if (!ic->may_write_bitmap) { + r = -ENOMEM; + goto bad; + } + ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL); + if (!ic->bbs) { + r = -ENOMEM; + goto bad; + } + INIT_DELAYED_WORK(&ic->bitmap_flush_work, bitmap_flush_work); + for (i = 0; i < ic->n_bitmap_blocks; i++) { + struct bitmap_block_status *bbs = &ic->bbs[i]; + unsigned sector, pl_index, pl_offset; + + INIT_WORK(&bbs->work, bitmap_block_work); + bbs->ic = ic; + bbs->idx = i; + bio_list_init(&bbs->bio_queue); + spin_lock_init(&bbs->bio_queue_lock); + + sector = i * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT); + pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); + pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); + + bbs->bitmap = lowmem_page_address(ic->journal[pl_index].page) + pl_offset; + } } if (should_write_sb) { @@ -3518,6 +4060,17 @@ try_smaller_buffer: if (r) goto bad; } + if (ic->mode == 'B') { + unsigned max_io_len = ((sector_t)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit) * (BITMAP_BLOCK_SIZE * 8); + if (!max_io_len) + max_io_len = 1U << 31; + DEBUG_print("max_io_len: old %u, new %u\n", ti->max_io_len, max_io_len); + if (!ti->max_io_len || ti->max_io_len > max_io_len) { + r = dm_set_target_max_io_len(ti, max_io_len); + if (r) + goto bad; + } + } if (!ic->internal_hash) dm_integrity_set(ti, ic); @@ -3526,6 +4079,7 @@ try_smaller_buffer: ti->flush_supported = true; return 0; + bad: dm_integrity_dtr(ti); return r; @@ -3548,10 +4102,9 @@ static void dm_integrity_dtr(struct dm_target *ti) destroy_workqueue(ic->writer_wq); if (ic->recalc_wq) destroy_workqueue(ic->recalc_wq); - if (ic->recalc_buffer) - vfree(ic->recalc_buffer); - if (ic->recalc_tags) - kvfree(ic->recalc_tags); + vfree(ic->recalc_buffer); + kvfree(ic->recalc_tags); + kvfree(ic->bbs); if (ic->bufio) dm_bufio_client_destroy(ic->bufio); mempool_exit(&ic->journal_io_mempool); @@ -3561,9 +4114,11 @@ static void dm_integrity_dtr(struct dm_target *ti) dm_put_device(ti, ic->dev); if (ic->meta_dev) dm_put_device(ti, ic->meta_dev); - dm_integrity_free_page_list(ic, ic->journal); - dm_integrity_free_page_list(ic, ic->journal_io); - dm_integrity_free_page_list(ic, ic->journal_xor); + dm_integrity_free_page_list(ic->journal); + dm_integrity_free_page_list(ic->journal_io); + dm_integrity_free_page_list(ic->journal_xor); + dm_integrity_free_page_list(ic->recalc_bitmap); + dm_integrity_free_page_list(ic->may_write_bitmap); if (ic->journal_scatterlist) dm_integrity_free_journal_scatterlist(ic, ic->journal_scatterlist); if (ic->journal_io_scatterlist) @@ -3601,7 +4156,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index c740153b4e52..1e03bc89e20f 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -2069,7 +2069,7 @@ int __init dm_early_create(struct dm_ioctl *dmi, /* alloc table */ r = dm_table_create(&t, get_mode(dmi), dmi->target_count, md); if (r) - goto err_destroy_dm; + goto err_hash_remove; /* add targets */ for (i = 0; i < dmi->target_count; i++) { @@ -2116,6 +2116,10 @@ int __init dm_early_create(struct dm_ioctl *dmi, err_destroy_table: dm_table_destroy(t); +err_hash_remove: + (void) __hash_remove(__get_name_cell(dmi->name)); + /* release reference from __get_name_cell */ + dm_put(md); err_destroy_dm: dm_put(md); dm_destroy(md); diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 9ea2b0291f20..e549392e0ea5 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -60,6 +60,7 @@ #define WRITE_LOG_VERSION 1ULL #define WRITE_LOG_MAGIC 0x6a736677736872ULL +#define WRITE_LOG_SUPER_SECTOR 0 /* * The disk format for this is braindead simple. @@ -115,6 +116,7 @@ struct log_writes_c { struct list_head logging_blocks; wait_queue_head_t wait; struct task_struct *log_kthread; + struct completion super_done; }; struct pending_block { @@ -180,6 +182,14 @@ static void log_end_io(struct bio *bio) bio_put(bio); } +static void log_end_super(struct bio *bio) +{ + struct log_writes_c *lc = bio->bi_private; + + complete(&lc->super_done); + log_end_io(bio); +} + /* * Meant to be called if there is an error, it will free all the pages * associated with the block. @@ -215,7 +225,8 @@ static int write_metadata(struct log_writes_c *lc, void *entry, bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = sector; bio_set_dev(bio, lc->logdev->bdev); - bio->bi_end_io = log_end_io; + bio->bi_end_io = (sector == WRITE_LOG_SUPER_SECTOR) ? + log_end_super : log_end_io; bio->bi_private = lc; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -418,11 +429,18 @@ static int log_super(struct log_writes_c *lc) super.nr_entries = cpu_to_le64(lc->logged_entries); super.sectorsize = cpu_to_le32(lc->sectorsize); - if (write_metadata(lc, &super, sizeof(super), NULL, 0, 0)) { + if (write_metadata(lc, &super, sizeof(super), NULL, 0, + WRITE_LOG_SUPER_SECTOR)) { DMERR("Couldn't write super"); return -1; } + /* + * Super sector should be writen in-order, otherwise the + * nr_entries could be rewritten incorrectly by an old bio. + */ + wait_for_completion_io(&lc->super_done); + return 0; } @@ -531,6 +549,7 @@ static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv) INIT_LIST_HEAD(&lc->unflushed_blocks); INIT_LIST_HEAD(&lc->logging_blocks); init_waitqueue_head(&lc->wait); + init_completion(&lc->super_done); atomic_set(&lc->io_blocks, 0); atomic_set(&lc->pending_blocks, 0); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 2ee5e357a0a7..dbcc1e41cd57 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -544,8 +544,23 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, return DM_MAPIO_REMAPPED; } -static void multipath_release_clone(struct request *clone) +static void multipath_release_clone(struct request *clone, + union map_info *map_context) { + if (unlikely(map_context)) { + /* + * non-NULL map_context means caller is still map + * method; must undo multipath_clone_and_map() + */ + struct dm_mpath_io *mpio = get_mpio(map_context); + struct pgpath *pgpath = mpio->pgpath; + + if (pgpath && pgpath->pg->ps.type->end_io) + pgpath->pg->ps.type->end_io(&pgpath->pg->ps, + &pgpath->path, + mpio->nr_bytes); + } + blk_put_request(clone); } @@ -882,6 +897,7 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps if (attached_handler_name || m->hw_handler_name) { INIT_DELAYED_WORK(&p->activate_path, activate_path_work); r = setup_scsi_dh(p->path.dev->bdev, m, &attached_handler_name, &ti->error); + kfree(attached_handler_name); if (r) { dm_put_device(ti, p->path.dev); goto bad; @@ -896,7 +912,6 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps return p; bad: - kfree(attached_handler_name); free_pgpath(p); return ERR_PTR(r); } diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index b66745bd08bb..5f7063f05ae0 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -168,7 +168,7 @@ static void dm_end_request(struct request *clone, blk_status_t error) struct request *rq = tio->orig; blk_rq_unprep_clone(clone); - tio->ti->type->release_clone_rq(clone); + tio->ti->type->release_clone_rq(clone, NULL); rq_end_stats(md, rq); blk_mq_end_request(rq, error); @@ -201,7 +201,7 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_ rq_end_stats(md, rq); if (tio->clone) { blk_rq_unprep_clone(tio->clone); - tio->ti->type->release_clone_rq(tio->clone); + tio->ti->type->release_clone_rq(tio->clone, NULL); } dm_mq_delay_requeue_request(rq, delay_ms); @@ -398,7 +398,7 @@ static int map_request(struct dm_rq_target_io *tio) case DM_MAPIO_REMAPPED: if (setup_clone(clone, rq, tio, GFP_ATOMIC)) { /* -ENOMEM */ - ti->type->release_clone_rq(clone); + ti->type->release_clone_rq(clone, &tio->info); return DM_MAPIO_REQUEUE; } @@ -408,7 +408,7 @@ static int map_request(struct dm_rq_target_io *tio) ret = dm_dispatch_clone_request(clone, rq); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { blk_rq_unprep_clone(clone); - tio->ti->type->release_clone_rq(clone); + tio->ti->type->release_clone_rq(clone, &tio->info); tio->clone = NULL; return DM_MAPIO_REQUEUE; } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index a168963b757d..3107f2b1988b 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -13,6 +13,7 @@ #include <linux/init.h> #include <linux/kdev_t.h> #include <linux/list.h> +#include <linux/list_bl.h> #include <linux/mempool.h> #include <linux/module.h> #include <linux/slab.h> @@ -44,11 +45,11 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; struct dm_exception_table { uint32_t hash_mask; unsigned hash_shift; - struct list_head *table; + struct hlist_bl_head *table; }; struct dm_snapshot { - struct mutex lock; + struct rw_semaphore lock; struct dm_dev *origin; struct dm_dev *cow; @@ -76,7 +77,9 @@ struct dm_snapshot { atomic_t pending_exceptions_count; - /* Protected by "lock" */ + spinlock_t pe_allocation_lock; + + /* Protected by "pe_allocation_lock" */ sector_t exception_start_sequence; /* Protected by kcopyd single-threaded callback */ @@ -457,9 +460,9 @@ static int __find_snapshots_sharing_cow(struct dm_snapshot *snap, if (!bdev_equal(s->cow->bdev, snap->cow->bdev)) continue; - mutex_lock(&s->lock); + down_read(&s->lock); active = s->active; - mutex_unlock(&s->lock); + up_read(&s->lock); if (active) { if (snap_src) @@ -618,6 +621,36 @@ static void unregister_snapshot(struct dm_snapshot *s) * The lowest hash_shift bits of the chunk number are ignored, allowing * some consecutive chunks to be grouped together. */ +static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk); + +/* Lock to protect access to the completed and pending exception hash tables. */ +struct dm_exception_table_lock { + struct hlist_bl_head *complete_slot; + struct hlist_bl_head *pending_slot; +}; + +static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk, + struct dm_exception_table_lock *lock) +{ + struct dm_exception_table *complete = &s->complete; + struct dm_exception_table *pending = &s->pending; + + lock->complete_slot = &complete->table[exception_hash(complete, chunk)]; + lock->pending_slot = &pending->table[exception_hash(pending, chunk)]; +} + +static void dm_exception_table_lock(struct dm_exception_table_lock *lock) +{ + hlist_bl_lock(lock->complete_slot); + hlist_bl_lock(lock->pending_slot); +} + +static void dm_exception_table_unlock(struct dm_exception_table_lock *lock) +{ + hlist_bl_unlock(lock->pending_slot); + hlist_bl_unlock(lock->complete_slot); +} + static int dm_exception_table_init(struct dm_exception_table *et, uint32_t size, unsigned hash_shift) { @@ -625,12 +658,12 @@ static int dm_exception_table_init(struct dm_exception_table *et, et->hash_shift = hash_shift; et->hash_mask = size - 1; - et->table = dm_vcalloc(size, sizeof(struct list_head)); + et->table = dm_vcalloc(size, sizeof(struct hlist_bl_head)); if (!et->table) return -ENOMEM; for (i = 0; i < size; i++) - INIT_LIST_HEAD(et->table + i); + INIT_HLIST_BL_HEAD(et->table + i); return 0; } @@ -638,15 +671,16 @@ static int dm_exception_table_init(struct dm_exception_table *et, static void dm_exception_table_exit(struct dm_exception_table *et, struct kmem_cache *mem) { - struct list_head *slot; - struct dm_exception *ex, *next; + struct hlist_bl_head *slot; + struct dm_exception *ex; + struct hlist_bl_node *pos, *n; int i, size; size = et->hash_mask + 1; for (i = 0; i < size; i++) { slot = et->table + i; - list_for_each_entry_safe (ex, next, slot, hash_list) + hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list) kmem_cache_free(mem, ex); } @@ -660,7 +694,7 @@ static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk) static void dm_remove_exception(struct dm_exception *e) { - list_del(&e->hash_list); + hlist_bl_del(&e->hash_list); } /* @@ -670,11 +704,12 @@ static void dm_remove_exception(struct dm_exception *e) static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et, chunk_t chunk) { - struct list_head *slot; + struct hlist_bl_head *slot; + struct hlist_bl_node *pos; struct dm_exception *e; slot = &et->table[exception_hash(et, chunk)]; - list_for_each_entry (e, slot, hash_list) + hlist_bl_for_each_entry(e, pos, slot, hash_list) if (chunk >= e->old_chunk && chunk <= e->old_chunk + dm_consecutive_chunk_count(e)) return e; @@ -721,7 +756,8 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe) static void dm_insert_exception(struct dm_exception_table *eh, struct dm_exception *new_e) { - struct list_head *l; + struct hlist_bl_head *l; + struct hlist_bl_node *pos; struct dm_exception *e = NULL; l = &eh->table[exception_hash(eh, new_e->old_chunk)]; @@ -731,7 +767,7 @@ static void dm_insert_exception(struct dm_exception_table *eh, goto out; /* List is ordered by old_chunk */ - list_for_each_entry_reverse(e, l, hash_list) { + hlist_bl_for_each_entry(e, pos, l, hash_list) { /* Insert after an existing chunk? */ if (new_e->old_chunk == (e->old_chunk + dm_consecutive_chunk_count(e) + 1) && @@ -752,12 +788,24 @@ static void dm_insert_exception(struct dm_exception_table *eh, return; } - if (new_e->old_chunk > e->old_chunk) + if (new_e->old_chunk < e->old_chunk) break; } out: - list_add(&new_e->hash_list, e ? &e->hash_list : l); + if (!e) { + /* + * Either the table doesn't support consecutive chunks or slot + * l is empty. + */ + hlist_bl_add_head(&new_e->hash_list, l); + } else if (new_e->old_chunk < e->old_chunk) { + /* Add before an existing exception */ + hlist_bl_add_before(&new_e->hash_list, &e->hash_list); + } else { + /* Add to l's tail: e is the last exception in this slot */ + hlist_bl_add_behind(&new_e->hash_list, &e->hash_list); + } } /* @@ -766,6 +814,7 @@ out: */ static int dm_add_exception(void *context, chunk_t old, chunk_t new) { + struct dm_exception_table_lock lock; struct dm_snapshot *s = context; struct dm_exception *e; @@ -778,7 +827,17 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) /* Consecutive_count is implicitly initialised to zero */ e->new_chunk = new; + /* + * Although there is no need to lock access to the exception tables + * here, if we don't then hlist_bl_add_head(), called by + * dm_insert_exception(), will complain about accessing the + * corresponding list without locking it first. + */ + dm_exception_table_lock_init(s, old, &lock); + + dm_exception_table_lock(&lock); dm_insert_exception(&s->complete, e); + dm_exception_table_unlock(&lock); return 0; } @@ -807,7 +866,7 @@ static int calc_max_buckets(void) { /* use a fixed size of 2MB */ unsigned long mem = 2 * 1024 * 1024; - mem /= sizeof(struct list_head); + mem /= sizeof(struct hlist_bl_head); return mem; } @@ -927,7 +986,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s) int r; chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1; - mutex_lock(&s->lock); + down_write(&s->lock); /* * Process chunks (and associated exceptions) in reverse order @@ -942,7 +1001,7 @@ static int remove_single_exception_chunk(struct dm_snapshot *s) b = __release_queued_bios_after_merge(s); out: - mutex_unlock(&s->lock); + up_write(&s->lock); if (b) flush_bios(b); @@ -1001,9 +1060,9 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s) if (linear_chunks < 0) { DMERR("Read error in exception store: " "shutting down merge"); - mutex_lock(&s->lock); + down_write(&s->lock); s->merge_failed = 1; - mutex_unlock(&s->lock); + up_write(&s->lock); } goto shut; } @@ -1044,10 +1103,10 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s) previous_count = read_pending_exceptions_done_count(); } - mutex_lock(&s->lock); + down_write(&s->lock); s->first_merging_chunk = old_chunk; s->num_merging_chunks = linear_chunks; - mutex_unlock(&s->lock); + up_write(&s->lock); /* Wait until writes to all 'linear_chunks' drain */ for (i = 0; i < linear_chunks; i++) @@ -1089,10 +1148,10 @@ static void merge_callback(int read_err, unsigned long write_err, void *context) return; shut: - mutex_lock(&s->lock); + down_write(&s->lock); s->merge_failed = 1; b = __release_queued_bios_after_merge(s); - mutex_unlock(&s->lock); + up_write(&s->lock); error_bios(b); merge_shutdown(s); @@ -1188,10 +1247,11 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->snapshot_overflowed = 0; s->active = 0; atomic_set(&s->pending_exceptions_count, 0); + spin_lock_init(&s->pe_allocation_lock); s->exception_start_sequence = 0; s->exception_complete_sequence = 0; s->out_of_order_tree = RB_ROOT; - mutex_init(&s->lock); + init_rwsem(&s->lock); INIT_LIST_HEAD(&s->list); spin_lock_init(&s->pe_lock); s->state_bits = 0; @@ -1357,9 +1417,9 @@ static void snapshot_dtr(struct dm_target *ti) /* Check whether exception handover must be cancelled */ (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); if (snap_src && snap_dest && (s == snap_src)) { - mutex_lock(&snap_dest->lock); + down_write(&snap_dest->lock); snap_dest->valid = 0; - mutex_unlock(&snap_dest->lock); + up_write(&snap_dest->lock); DMERR("Cancelling snapshot handover."); } up_read(&_origins_lock); @@ -1390,8 +1450,6 @@ static void snapshot_dtr(struct dm_target *ti) dm_exception_store_destroy(s->store); - mutex_destroy(&s->lock); - dm_put_device(ti, s->cow); dm_put_device(ti, s->origin); @@ -1467,6 +1525,13 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err) dm_table_event(s->ti->table); } +static void invalidate_snapshot(struct dm_snapshot *s, int err) +{ + down_write(&s->lock); + __invalidate_snapshot(s, err); + up_write(&s->lock); +} + static void pending_complete(void *context, int success) { struct dm_snap_pending_exception *pe = context; @@ -1475,43 +1540,63 @@ static void pending_complete(void *context, int success) struct bio *origin_bios = NULL; struct bio *snapshot_bios = NULL; struct bio *full_bio = NULL; + struct dm_exception_table_lock lock; int error = 0; + dm_exception_table_lock_init(s, pe->e.old_chunk, &lock); + if (!success) { /* Read/write error - snapshot is unusable */ - mutex_lock(&s->lock); - __invalidate_snapshot(s, -EIO); + invalidate_snapshot(s, -EIO); error = 1; + + dm_exception_table_lock(&lock); goto out; } e = alloc_completed_exception(GFP_NOIO); if (!e) { - mutex_lock(&s->lock); - __invalidate_snapshot(s, -ENOMEM); + invalidate_snapshot(s, -ENOMEM); error = 1; + + dm_exception_table_lock(&lock); goto out; } *e = pe->e; - mutex_lock(&s->lock); + down_read(&s->lock); + dm_exception_table_lock(&lock); if (!s->valid) { + up_read(&s->lock); free_completed_exception(e); error = 1; + goto out; } - /* Check for conflicting reads */ - __check_for_conflicting_io(s, pe->e.old_chunk); - /* - * Add a proper exception, and remove the - * in-flight exception from the list. + * Add a proper exception. After inserting the completed exception all + * subsequent snapshot reads to this chunk will be redirected to the + * COW device. This ensures that we do not starve. Moreover, as long + * as the pending exception exists, neither origin writes nor snapshot + * merging can overwrite the chunk in origin. */ dm_insert_exception(&s->complete, e); + up_read(&s->lock); + + /* Wait for conflicting reads to drain */ + if (__chunk_is_tracked(s, pe->e.old_chunk)) { + dm_exception_table_unlock(&lock); + __check_for_conflicting_io(s, pe->e.old_chunk); + dm_exception_table_lock(&lock); + } out: + /* Remove the in-flight exception from the list */ dm_remove_exception(&pe->e); + + dm_exception_table_unlock(&lock); + snapshot_bios = bio_list_get(&pe->snapshot_bios); origin_bios = bio_list_get(&pe->origin_bios); full_bio = pe->full_bio; @@ -1519,8 +1604,6 @@ out: full_bio->bi_end_io = pe->full_bio_end_io; increment_pending_exceptions_done_count(); - mutex_unlock(&s->lock); - /* Submit any pending write bios */ if (error) { if (full_bio) @@ -1660,43 +1743,59 @@ __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk) } /* - * Looks to see if this snapshot already has a pending exception - * for this chunk, otherwise it allocates a new one and inserts - * it into the pending table. + * Inserts a pending exception into the pending table. * - * NOTE: a write lock must be held on snap->lock before calling - * this. + * NOTE: a write lock must be held on the chunk's pending exception table slot + * before calling this. */ static struct dm_snap_pending_exception * -__find_pending_exception(struct dm_snapshot *s, - struct dm_snap_pending_exception *pe, chunk_t chunk) +__insert_pending_exception(struct dm_snapshot *s, + struct dm_snap_pending_exception *pe, chunk_t chunk) { - struct dm_snap_pending_exception *pe2; - - pe2 = __lookup_pending_exception(s, chunk); - if (pe2) { - free_pending_exception(pe); - return pe2; - } - pe->e.old_chunk = chunk; bio_list_init(&pe->origin_bios); bio_list_init(&pe->snapshot_bios); pe->started = 0; pe->full_bio = NULL; + spin_lock(&s->pe_allocation_lock); if (s->store->type->prepare_exception(s->store, &pe->e)) { + spin_unlock(&s->pe_allocation_lock); free_pending_exception(pe); return NULL; } pe->exception_sequence = s->exception_start_sequence++; + spin_unlock(&s->pe_allocation_lock); dm_insert_exception(&s->pending, &pe->e); return pe; } +/* + * Looks to see if this snapshot already has a pending exception + * for this chunk, otherwise it allocates a new one and inserts + * it into the pending table. + * + * NOTE: a write lock must be held on the chunk's pending exception table slot + * before calling this. + */ +static struct dm_snap_pending_exception * +__find_pending_exception(struct dm_snapshot *s, + struct dm_snap_pending_exception *pe, chunk_t chunk) +{ + struct dm_snap_pending_exception *pe2; + + pe2 = __lookup_pending_exception(s, chunk); + if (pe2) { + free_pending_exception(pe); + return pe2; + } + + return __insert_pending_exception(s, pe, chunk); +} + static void remap_exception(struct dm_snapshot *s, struct dm_exception *e, struct bio *bio, chunk_t chunk) { @@ -1714,6 +1813,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) int r = DM_MAPIO_REMAPPED; chunk_t chunk; struct dm_snap_pending_exception *pe = NULL; + struct dm_exception_table_lock lock; init_tracked_chunk(bio); @@ -1723,13 +1823,15 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) } chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); + dm_exception_table_lock_init(s, chunk, &lock); /* Full snapshots are not usable */ /* To get here the table must be live so s->active is always set. */ if (!s->valid) return DM_MAPIO_KILL; - mutex_lock(&s->lock); + down_read(&s->lock); + dm_exception_table_lock(&lock); if (!s->valid || (unlikely(s->snapshot_overflowed) && bio_data_dir(bio) == WRITE)) { @@ -1752,15 +1854,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) if (bio_data_dir(bio) == WRITE) { pe = __lookup_pending_exception(s, chunk); if (!pe) { - mutex_unlock(&s->lock); + dm_exception_table_unlock(&lock); pe = alloc_pending_exception(s); - mutex_lock(&s->lock); - - if (!s->valid || s->snapshot_overflowed) { - free_pending_exception(pe); - r = DM_MAPIO_KILL; - goto out_unlock; - } + dm_exception_table_lock(&lock); e = dm_lookup_exception(&s->complete, chunk); if (e) { @@ -1771,13 +1867,22 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) pe = __find_pending_exception(s, pe, chunk); if (!pe) { + dm_exception_table_unlock(&lock); + up_read(&s->lock); + + down_write(&s->lock); + if (s->store->userspace_supports_overflow) { - s->snapshot_overflowed = 1; - DMERR("Snapshot overflowed: Unable to allocate exception."); + if (s->valid && !s->snapshot_overflowed) { + s->snapshot_overflowed = 1; + DMERR("Snapshot overflowed: Unable to allocate exception."); + } } else __invalidate_snapshot(s, -ENOMEM); + up_write(&s->lock); + r = DM_MAPIO_KILL; - goto out_unlock; + goto out; } } @@ -1789,7 +1894,10 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) bio->bi_iter.bi_size == (s->store->chunk_size << SECTOR_SHIFT)) { pe->started = 1; - mutex_unlock(&s->lock); + + dm_exception_table_unlock(&lock); + up_read(&s->lock); + start_full_bio(pe, bio); goto out; } @@ -1797,9 +1905,12 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) bio_list_add(&pe->snapshot_bios, bio); if (!pe->started) { - /* this is protected by snap->lock */ + /* this is protected by the exception table lock */ pe->started = 1; - mutex_unlock(&s->lock); + + dm_exception_table_unlock(&lock); + up_read(&s->lock); + start_copy(pe); goto out; } @@ -1809,7 +1920,8 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) } out_unlock: - mutex_unlock(&s->lock); + dm_exception_table_unlock(&lock); + up_read(&s->lock); out: return r; } @@ -1845,7 +1957,7 @@ static int snapshot_merge_map(struct dm_target *ti, struct bio *bio) chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector); - mutex_lock(&s->lock); + down_write(&s->lock); /* Full merging snapshots are redirected to the origin */ if (!s->valid) @@ -1876,12 +1988,12 @@ redirect_to_origin: bio_set_dev(bio, s->origin->bdev); if (bio_data_dir(bio) == WRITE) { - mutex_unlock(&s->lock); + up_write(&s->lock); return do_origin(s->origin, bio); } out_unlock: - mutex_unlock(&s->lock); + up_write(&s->lock); return r; } @@ -1913,7 +2025,7 @@ static int snapshot_preresume(struct dm_target *ti) down_read(&_origins_lock); (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); if (snap_src && snap_dest) { - mutex_lock(&snap_src->lock); + down_read(&snap_src->lock); if (s == snap_src) { DMERR("Unable to resume snapshot source until " "handover completes."); @@ -1923,7 +2035,7 @@ static int snapshot_preresume(struct dm_target *ti) "source is suspended."); r = -EINVAL; } - mutex_unlock(&snap_src->lock); + up_read(&snap_src->lock); } up_read(&_origins_lock); @@ -1969,11 +2081,11 @@ static void snapshot_resume(struct dm_target *ti) (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL); if (snap_src && snap_dest) { - mutex_lock(&snap_src->lock); - mutex_lock_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); + down_write(&snap_src->lock); + down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING); __handover_exceptions(snap_src, snap_dest); - mutex_unlock(&snap_dest->lock); - mutex_unlock(&snap_src->lock); + up_write(&snap_dest->lock); + up_write(&snap_src->lock); } up_read(&_origins_lock); @@ -1988,9 +2100,9 @@ static void snapshot_resume(struct dm_target *ti) /* Now we have correct chunk size, reregister */ reregister_snapshot(s); - mutex_lock(&s->lock); + down_write(&s->lock); s->active = 1; - mutex_unlock(&s->lock); + up_write(&s->lock); } static uint32_t get_origin_minimum_chunksize(struct block_device *bdev) @@ -2030,7 +2142,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type, switch (type) { case STATUSTYPE_INFO: - mutex_lock(&snap->lock); + down_write(&snap->lock); if (!snap->valid) DMEMIT("Invalid"); @@ -2055,7 +2167,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type, DMEMIT("Unknown"); } - mutex_unlock(&snap->lock); + up_write(&snap->lock); break; @@ -2107,9 +2219,10 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, int r = DM_MAPIO_REMAPPED; struct dm_snapshot *snap; struct dm_exception *e; - struct dm_snap_pending_exception *pe; + struct dm_snap_pending_exception *pe, *pe2; struct dm_snap_pending_exception *pe_to_start_now = NULL; struct dm_snap_pending_exception *pe_to_start_last = NULL; + struct dm_exception_table_lock lock; chunk_t chunk; /* Do all the snapshots on this origin */ @@ -2121,52 +2234,59 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, if (dm_target_is_snapshot_merge(snap->ti)) continue; - mutex_lock(&snap->lock); - - /* Only deal with valid and active snapshots */ - if (!snap->valid || !snap->active) - goto next_snapshot; - /* Nothing to do if writing beyond end of snapshot */ if (sector >= dm_table_get_size(snap->ti->table)) - goto next_snapshot; + continue; /* * Remember, different snapshots can have * different chunk sizes. */ chunk = sector_to_chunk(snap->store, sector); + dm_exception_table_lock_init(snap, chunk, &lock); - /* - * Check exception table to see if block - * is already remapped in this snapshot - * and trigger an exception if not. - */ - e = dm_lookup_exception(&snap->complete, chunk); - if (e) + down_read(&snap->lock); + dm_exception_table_lock(&lock); + + /* Only deal with valid and active snapshots */ + if (!snap->valid || !snap->active) goto next_snapshot; pe = __lookup_pending_exception(snap, chunk); if (!pe) { - mutex_unlock(&snap->lock); - pe = alloc_pending_exception(snap); - mutex_lock(&snap->lock); - - if (!snap->valid) { - free_pending_exception(pe); - goto next_snapshot; - } - + /* + * Check exception table to see if block is already + * remapped in this snapshot and trigger an exception + * if not. + */ e = dm_lookup_exception(&snap->complete, chunk); - if (e) { - free_pending_exception(pe); + if (e) goto next_snapshot; - } - pe = __find_pending_exception(snap, pe, chunk); - if (!pe) { - __invalidate_snapshot(snap, -ENOMEM); - goto next_snapshot; + dm_exception_table_unlock(&lock); + pe = alloc_pending_exception(snap); + dm_exception_table_lock(&lock); + + pe2 = __lookup_pending_exception(snap, chunk); + + if (!pe2) { + e = dm_lookup_exception(&snap->complete, chunk); + if (e) { + free_pending_exception(pe); + goto next_snapshot; + } + + pe = __insert_pending_exception(snap, pe, chunk); + if (!pe) { + dm_exception_table_unlock(&lock); + up_read(&snap->lock); + + invalidate_snapshot(snap, -ENOMEM); + continue; + } + } else { + free_pending_exception(pe); + pe = pe2; } } @@ -2193,7 +2313,8 @@ static int __origin_write(struct list_head *snapshots, sector_t sector, } next_snapshot: - mutex_unlock(&snap->lock); + dm_exception_table_unlock(&lock); + up_read(&snap->lock); if (pe_to_start_now) { start_copy(pe_to_start_now); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index cde3b49b2a91..ec8b27e20de3 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -561,7 +561,7 @@ static char **realloc_argv(unsigned *size, char **old_argv) gfp = GFP_NOIO; } argv = kmalloc_array(new_size, sizeof(*argv), gfp); - if (argv) { + if (argv && old_argv) { memcpy(argv, old_argv, *size * sizeof(*argv)); *size = new_size; } @@ -880,13 +880,17 @@ void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type) } EXPORT_SYMBOL_GPL(dm_table_set_type); +/* validate the dax capability of the target device span */ static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) + sector_t start, sector_t len, void *data) { - return bdev_dax_supported(dev->bdev, PAGE_SIZE); + int blocksize = *(int *) data; + + return generic_fsdax_supported(dev->dax_dev, dev->bdev, blocksize, + start, len); } -static bool dm_table_supports_dax(struct dm_table *t) +bool dm_table_supports_dax(struct dm_table *t, int blocksize) { struct dm_target *ti; unsigned i; @@ -899,7 +903,8 @@ static bool dm_table_supports_dax(struct dm_table *t) return false; if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, device_supports_dax, NULL)) + !ti->type->iterate_devices(ti, device_supports_dax, + &blocksize)) return false; } @@ -979,7 +984,7 @@ static int dm_table_determine_type(struct dm_table *t) verify_bio_based: /* We must use this table as bio-based */ t->type = DM_TYPE_BIO_BASED; - if (dm_table_supports_dax(t) || + if (dm_table_supports_dax(t, PAGE_SIZE) || (list_empty(devices) && live_md_type == DM_TYPE_DAX_BIO_BASED)) { t->type = DM_TYPE_DAX_BIO_BASED; } else { @@ -1905,7 +1910,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } blk_queue_write_cache(q, wc, fua); - if (dm_table_supports_dax(t)) + if (dm_table_supports_dax(t, PAGE_SIZE)) blk_queue_flag_set(QUEUE_FLAG_DAX, q); else blk_queue_flag_clear(QUEUE_FLAG_DAX, q); diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 314d17ca6466..64dd0b34fcf4 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -136,7 +136,8 @@ static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq, return DM_MAPIO_KILL; } -static void io_err_release_clone_rq(struct request *clone) +static void io_err_release_clone_rq(struct request *clone, + union map_info *map_context) { } diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index ed3caceaed07..7f0840601737 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -202,6 +202,13 @@ struct dm_pool_metadata { bool fail_io:1; /* + * Set once a thin-pool has been accessed through one of the interfaces + * that imply the pool is in-service (e.g. thin devices created/deleted, + * thin-pool message, metadata snapshots, etc). + */ + bool in_service:1; + + /* * Reading the space map roots can fail, so we read it into these * buffers before the superblock is locked and updated. */ @@ -367,6 +374,32 @@ static int subtree_equal(void *context, const void *value1_le, const void *value /*----------------------------------------------------------------*/ +/* + * Variant that is used for in-core only changes or code that + * shouldn't put the pool in service on its own (e.g. commit). + */ +static inline void __pmd_write_lock(struct dm_pool_metadata *pmd) + __acquires(pmd->root_lock) +{ + down_write(&pmd->root_lock); +} +#define pmd_write_lock_in_core(pmd) __pmd_write_lock((pmd)) + +static inline void pmd_write_lock(struct dm_pool_metadata *pmd) +{ + __pmd_write_lock(pmd); + if (unlikely(!pmd->in_service)) + pmd->in_service = true; +} + +static inline void pmd_write_unlock(struct dm_pool_metadata *pmd) + __releases(pmd->root_lock) +{ + up_write(&pmd->root_lock); +} + +/*----------------------------------------------------------------*/ + static int superblock_lock_zero(struct dm_pool_metadata *pmd, struct dm_block **sblock) { @@ -790,6 +823,9 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) */ BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512); + if (unlikely(!pmd->in_service)) + return 0; + r = __write_changed_details(pmd); if (r < 0) return r; @@ -853,6 +889,7 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, pmd->time = 0; INIT_LIST_HEAD(&pmd->thin_devices); pmd->fail_io = false; + pmd->in_service = false; pmd->bdev = bdev; pmd->data_block_size = data_block_size; @@ -903,7 +940,6 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd) DMWARN("%s: __commit_transaction() failed, error = %d", __func__, r); } - if (!pmd->fail_io) __destroy_persistent_data_objects(pmd); @@ -1032,10 +1068,10 @@ int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev) { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) r = __create_thin(pmd, dev); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1123,10 +1159,10 @@ int dm_pool_create_snap(struct dm_pool_metadata *pmd, { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) r = __create_snap(pmd, dev, origin); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1166,10 +1202,10 @@ int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd, { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) r = __delete_device(pmd, dev); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1180,7 +1216,7 @@ int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (pmd->fail_io) goto out; @@ -1194,7 +1230,7 @@ int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, r = 0; out: - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1225,7 +1261,12 @@ static int __reserve_metadata_snap(struct dm_pool_metadata *pmd) * We commit to ensure the btree roots which we increment in a * moment are up to date. */ - __commit_transaction(pmd); + r = __commit_transaction(pmd); + if (r < 0) { + DMWARN("%s: __commit_transaction() failed, error = %d", + __func__, r); + return r; + } /* * Copy the superblock. @@ -1283,10 +1324,10 @@ int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd) { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) r = __reserve_metadata_snap(pmd); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1331,10 +1372,10 @@ int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd) { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) r = __release_metadata_snap(pmd); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1377,19 +1418,19 @@ int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock_in_core(pmd); if (!pmd->fail_io) r = __open_device(pmd, dev, 0, td); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } int dm_pool_close_thin_device(struct dm_thin_device *td) { - down_write(&td->pmd->root_lock); + pmd_write_lock_in_core(td->pmd); __close_device(td); - up_write(&td->pmd->root_lock); + pmd_write_unlock(td->pmd); return 0; } @@ -1570,10 +1611,10 @@ int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block, { int r = -EINVAL; - down_write(&td->pmd->root_lock); + pmd_write_lock(td->pmd); if (!td->pmd->fail_io) r = __insert(td, block, data_block); - up_write(&td->pmd->root_lock); + pmd_write_unlock(td->pmd); return r; } @@ -1657,10 +1698,10 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block) { int r = -EINVAL; - down_write(&td->pmd->root_lock); + pmd_write_lock(td->pmd); if (!td->pmd->fail_io) r = __remove(td, block); - up_write(&td->pmd->root_lock); + pmd_write_unlock(td->pmd); return r; } @@ -1670,10 +1711,10 @@ int dm_thin_remove_range(struct dm_thin_device *td, { int r = -EINVAL; - down_write(&td->pmd->root_lock); + pmd_write_lock(td->pmd); if (!td->pmd->fail_io) r = __remove_range(td, begin, end); - up_write(&td->pmd->root_lock); + pmd_write_unlock(td->pmd); return r; } @@ -1696,13 +1737,13 @@ int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_ { int r = 0; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); for (; b != e; b++) { r = dm_sm_inc_block(pmd->data_sm, b); if (r) break; } - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1711,13 +1752,13 @@ int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_ { int r = 0; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); for (; b != e; b++) { r = dm_sm_dec_block(pmd->data_sm, b); if (r) break; } - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1765,10 +1806,10 @@ int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result) { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) r = dm_sm_new_block(pmd->data_sm, result); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1777,12 +1818,16 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) { int r = -EINVAL; - down_write(&pmd->root_lock); + /* + * Care is taken to not have commit be what + * triggers putting the thin-pool in-service. + */ + __pmd_write_lock(pmd); if (pmd->fail_io) goto out; r = __commit_transaction(pmd); - if (r <= 0) + if (r < 0) goto out; /* @@ -1790,7 +1835,7 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) */ r = __begin_transaction(pmd); out: - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1806,7 +1851,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd) { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (pmd->fail_io) goto out; @@ -1817,7 +1862,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd) pmd->fail_io = true; out: - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1948,10 +1993,10 @@ int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) r = __resize_space_map(pmd->data_sm, new_count); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -1960,29 +2005,29 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou { int r = -EINVAL; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); if (!pmd->fail_io) { r = __resize_space_map(pmd->metadata_sm, new_count); if (!r) __set_metadata_reserve(pmd); } - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd) { - down_write(&pmd->root_lock); + pmd_write_lock_in_core(pmd); dm_bm_set_read_only(pmd->bm); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); } void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd) { - down_write(&pmd->root_lock); + pmd_write_lock_in_core(pmd); dm_bm_set_read_write(pmd->bm); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); } int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, @@ -1992,9 +2037,9 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, { int r; - down_write(&pmd->root_lock); + pmd_write_lock_in_core(pmd); r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context); - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } @@ -2005,7 +2050,7 @@ int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd) struct dm_block *sblock; struct thin_disk_superblock *disk_super; - down_write(&pmd->root_lock); + pmd_write_lock(pmd); pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG; r = superblock_lock(pmd, &sblock); @@ -2019,7 +2064,7 @@ int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd) dm_bm_unlock(sblock); out: - up_write(&pmd->root_lock); + pmd_write_unlock(pmd); return r; } diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c index 8efe033bab55..8671267200d8 100644 --- a/drivers/md/dm-uevent.c +++ b/drivers/md/dm-uevent.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Device Mapper Uevent Support (dm-uevent) * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * Copyright IBM Corporation, 2007 * Author: Mike Anderson <andmike@linux.vnet.ibm.com> */ diff --git a/drivers/md/dm-uevent.h b/drivers/md/dm-uevent.h index 2eccc8bd671a..d30d226f2a18 100644 --- a/drivers/md/dm-uevent.h +++ b/drivers/md/dm-uevent.h @@ -1,20 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Device Mapper Uevent Support * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * Copyright IBM Corporation, 2007 * Author: Mike Anderson <andmike@linux.vnet.ibm.com> */ diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index b634fa23f4c4..3ceeb6b404ed 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2015 Google, Inc. * * Author: Sami Tolvanen <samitolvanen@google.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. */ #include "dm-verity-fec.h" diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 6ad803b2b36c..42fbd3a7fc9f 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -1,12 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (C) 2015 Google, Inc. * * Author: Sami Tolvanen <samitolvanen@google.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. */ #ifndef DM_VERITY_FEC_H diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index f4c31ffaa88e..ea24ff0612e3 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012 Red Hat, Inc. * @@ -5,8 +6,6 @@ * * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors * - * This file is released under the GPLv2. - * * In the file "/sys/module/dm_verity/parameters/prefetch_cluster" you can set * default prefetch value. Data are read in "prefetch_cluster" chunks from the * hash device. Setting this greatly improves performance when data and hash @@ -236,8 +235,8 @@ static int verity_handle_err(struct dm_verity *v, enum verity_block_type type, BUG(); } - DMERR("%s: %s block %llu is corrupted", v->data_dev->name, type_str, - block); + DMERR_LIMIT("%s: %s block %llu is corrupted", v->data_dev->name, + type_str, block); if (v->corrupted_errs == DM_VERITY_MAX_CORRUPTED_ERRS) DMERR("%s: reached maximum errors", v->data_dev->name); diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 3441c10b840c..eeaf940aef6d 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012 Red Hat, Inc. * Copyright (C) 2015 Google, Inc. @@ -5,8 +6,6 @@ * Author: Mikulas Patocka <mpatocka@redhat.com> * * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors - * - * This file is released under the GPLv2. */ #ifndef DM_VERITY_H diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index f7822875589e..1cb137f0ef9d 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -190,7 +190,6 @@ struct writeback_struct { struct dm_writecache *wc; struct wc_entry **wc_list; unsigned wc_list_n; - unsigned page_offset; struct page *page; struct wc_entry *wc_list_inline[WB_LIST_INLINE]; struct bio bio; @@ -546,21 +545,20 @@ static struct wc_entry *writecache_find_entry(struct dm_writecache *wc, e = container_of(node, struct wc_entry, rb_node); if (read_original_sector(wc, e) == block) break; + node = (read_original_sector(wc, e) >= block ? e->rb_node.rb_left : e->rb_node.rb_right); if (unlikely(!node)) { - if (!(flags & WFE_RETURN_FOLLOWING)) { + if (!(flags & WFE_RETURN_FOLLOWING)) return NULL; - } if (read_original_sector(wc, e) >= block) { - break; + return e; } else { node = rb_next(&e->rb_node); - if (unlikely(!node)) { + if (unlikely(!node)) return NULL; - } e = container_of(node, struct wc_entry, rb_node); - break; + return e; } } } @@ -571,7 +569,7 @@ static struct wc_entry *writecache_find_entry(struct dm_writecache *wc, node = rb_prev(&e->rb_node); else node = rb_next(&e->rb_node); - if (!node) + if (unlikely(!node)) return e; e2 = container_of(node, struct wc_entry, rb_node); if (read_original_sector(wc, e2) != block) @@ -804,7 +802,7 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_ writecache_free_entry(wc, e); } - if (!node) + if (unlikely(!node)) break; e = container_of(node, struct wc_entry, rb_node); @@ -1478,10 +1476,9 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set); wb = container_of(bio, struct writeback_struct, bio); wb->wc = wc; - wb->bio.bi_end_io = writecache_writeback_endio; - bio_set_dev(&wb->bio, wc->dev->bdev); - wb->bio.bi_iter.bi_sector = read_original_sector(wc, e); - wb->page_offset = PAGE_SIZE; + bio->bi_end_io = writecache_writeback_endio; + bio_set_dev(bio, wc->dev->bdev); + bio->bi_iter.bi_sector = read_original_sector(wc, e); if (max_pages <= WB_LIST_INLINE || unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *), GFP_NOIO | __GFP_NORETRY | @@ -1507,12 +1504,12 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba wb->wc_list[wb->wc_list_n++] = f; e = f; } - bio_set_op_attrs(&wb->bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA); + bio_set_op_attrs(bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA); if (writecache_has_error(wc)) { bio->bi_status = BLK_STS_IOERR; - bio_endio(&wb->bio); + bio_endio(bio); } else { - submit_bio(&wb->bio); + submit_bio(bio); } __writeback_throttle(wc, wbl); diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index fa68336560c3..d8334cd45d7c 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1169,6 +1169,9 @@ static int dmz_init_zones(struct dmz_metadata *zmd) goto out; } + if (!nr_blkz) + break; + /* Process report */ for (i = 0; i < nr_blkz; i++) { ret = dmz_init_zone(zmd, zone, &blkz[i]); @@ -1204,6 +1207,8 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) /* Get zone information from disk */ ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone), &blkz, &nr_blkz, GFP_NOIO); + if (!nr_blkz) + ret = -EIO; if (ret) { dmz_dev_err(zmd->dev, "Get zone %u report failed", dmz_id(zmd, zone)); diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 8865c1709e16..51d029bbb740 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -643,7 +643,8 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path) q = bdev_get_queue(dev->bdev); dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; - aligned_capacity = dev->capacity & ~(blk_queue_zone_sectors(q) - 1); + aligned_capacity = dev->capacity & + ~((sector_t)blk_queue_zone_sectors(q) - 1); if (ti->begin || ((ti->len != dev->capacity) && (ti->len != aligned_capacity))) { ti->error = "Partial mapping not supported"; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 043f0761e4a0..5475081dcbd6 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -781,7 +781,8 @@ static void close_table_device(struct table_device *td, struct mapped_device *md } static struct table_device *find_table_device(struct list_head *l, dev_t dev, - fmode_t mode) { + fmode_t mode) +{ struct table_device *td; list_for_each_entry(td, l, list) @@ -792,7 +793,8 @@ static struct table_device *find_table_device(struct list_head *l, dev_t dev, } int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, - struct dm_dev **result) { + struct dm_dev **result) +{ int r; struct table_device *td; @@ -1105,6 +1107,25 @@ static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, return ret; } +static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev, + int blocksize, sector_t start, sector_t len) +{ + struct mapped_device *md = dax_get_private(dax_dev); + struct dm_table *map; + int srcu_idx; + bool ret; + + map = dm_get_live_table(md, &srcu_idx); + if (!map) + return false; + + ret = dm_table_supports_dax(map, blocksize); + + dm_put_live_table(md, srcu_idx); + + return ret; +} + static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) { @@ -1467,7 +1488,7 @@ static unsigned get_num_write_zeroes_bios(struct dm_target *ti) static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti, unsigned num_bios) { - unsigned len = ci->sector_count; + unsigned len; /* * Even though the device advertised support for this type of @@ -1478,6 +1499,8 @@ static int __send_changing_extent_only(struct clone_info *ci, struct dm_target * if (!num_bios) return -EOPNOTSUPP; + len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); + __send_duplicate_bios(ci, ti, num_bios, &len); ci->sector += len; @@ -1906,7 +1929,6 @@ static void cleanup_mapped_device(struct mapped_device *md) static struct mapped_device *alloc_dev(int minor) { int r, numa_node_id = dm_get_numa_node(); - struct dax_device *dax_dev = NULL; struct mapped_device *md; void *old_md; @@ -1969,11 +1991,10 @@ static struct mapped_device *alloc_dev(int minor) sprintf(md->disk->disk_name, "dm-%d", minor); if (IS_ENABLED(CONFIG_DAX_DRIVER)) { - dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops); - if (!dax_dev) + md->dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops); + if (!md->dax_dev) goto bad; } - md->dax_dev = dax_dev; add_disk_no_queue_reg(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); @@ -3192,6 +3213,7 @@ static const struct block_device_operations dm_blk_dops = { static const struct dax_operations dm_dax_ops = { .direct_access = dm_dax_direct_access, + .dax_supported = dm_dax_supported, .copy_from_iter = dm_dax_copy_from_iter, .copy_to_iter = dm_dax_copy_to_iter, }; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 2d539b82ec08..17e3db54404c 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -72,6 +72,7 @@ bool dm_table_bio_based(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); +bool dm_table_supports_dax(struct dm_table *t, int blocksize); void dm_lock_md_type(struct mapped_device *md); void dm_unlock_md_type(struct mapped_device *md); diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 1cd4f991792c..c01d41198f5e 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * bitmap.c two-level bitmap (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 * @@ -490,10 +491,10 @@ void md_bitmap_print_sb(struct bitmap *bitmap) pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic)); pr_debug(" version: %d\n", le32_to_cpu(sb->version)); pr_debug(" uuid: %08x.%08x.%08x.%08x\n", - le32_to_cpu(*(__u32 *)(sb->uuid+0)), - le32_to_cpu(*(__u32 *)(sb->uuid+4)), - le32_to_cpu(*(__u32 *)(sb->uuid+8)), - le32_to_cpu(*(__u32 *)(sb->uuid+12))); + le32_to_cpu(*(__le32 *)(sb->uuid+0)), + le32_to_cpu(*(__le32 *)(sb->uuid+4)), + le32_to_cpu(*(__le32 *)(sb->uuid+8)), + le32_to_cpu(*(__le32 *)(sb->uuid+12))); pr_debug(" events: %llu\n", (unsigned long long) le64_to_cpu(sb->events)); pr_debug("events cleared: %llu\n", diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 8dff19d5502e..813a99ffa86f 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1,11 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2015, SUSE - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * */ diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c index c2fdf899de14..50ad4ba86f0e 100644 --- a/drivers/md/md-faulty.c +++ b/drivers/md/md-faulty.c @@ -1,19 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * faulty.c : Multiple Devices driver for Linux * * Copyright (C) 2004 Neil Brown * * fautly-device-simulator personality for md - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * You should have received a copy of the GNU General Public License - * (for example /usr/src/linux/COPYING); if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 5998d78aa189..7354466ddc90 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* linear.c : Multiple Devices driver for Linux Copyright (C) 1994-96 Marc ZYNGIER @@ -6,14 +7,6 @@ Linear mode management functions. - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include <linux/blkdev.h> diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 881487de1e25..6780938d2991 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * multipath.c : Multiple Devices driver for Linux * @@ -8,15 +9,6 @@ * MULTIPATH management functions. * * derived from raid1.c. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * You should have received a copy of the GNU General Public License - * (for example /usr/src/linux/COPYING); if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include <linux/blkdev.h> diff --git a/drivers/md/md.c b/drivers/md/md.c index 05ffffb8b769..9801d540fea1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* md.c : Multiple Devices driver for Linux Copyright (C) 1998, 1999, 2000 Ingo Molnar @@ -22,14 +23,6 @@ - persistent bitmap code Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. Errors, Warnings, etc. Please use: @@ -88,8 +81,7 @@ static struct kobj_type md_ktype; struct md_cluster_operations *md_cluster_ops; EXPORT_SYMBOL(md_cluster_ops); -struct module *md_cluster_mod; -EXPORT_SYMBOL(md_cluster_mod); +static struct module *md_cluster_mod; static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; @@ -132,24 +124,6 @@ static inline int speed_max(struct mddev *mddev) mddev->sync_speed_max : sysctl_speed_limit_max; } -static void * flush_info_alloc(gfp_t gfp_flags, void *data) -{ - return kzalloc(sizeof(struct flush_info), gfp_flags); -} -static void flush_info_free(void *flush_info, void *data) -{ - kfree(flush_info); -} - -static void * flush_bio_alloc(gfp_t gfp_flags, void *data) -{ - return kzalloc(sizeof(struct flush_bio), gfp_flags); -} -static void flush_bio_free(void *flush_bio, void *data) -{ - kfree(flush_bio); -} - static struct ctl_table_header *raid_table_header; static struct ctl_table raid_table[] = { @@ -423,54 +397,31 @@ static int md_congested(void *data, int bits) /* * Generic flush handling for md */ -static void submit_flushes(struct work_struct *ws) -{ - struct flush_info *fi = container_of(ws, struct flush_info, flush_work); - struct mddev *mddev = fi->mddev; - struct bio *bio = fi->bio; - bio->bi_opf &= ~REQ_PREFLUSH; - md_handle_request(mddev, bio); - - mempool_free(fi, mddev->flush_pool); -} - -static void md_end_flush(struct bio *fbio) +static void md_end_flush(struct bio *bio) { - struct flush_bio *fb = fbio->bi_private; - struct md_rdev *rdev = fb->rdev; - struct flush_info *fi = fb->fi; - struct bio *bio = fi->bio; - struct mddev *mddev = fi->mddev; + struct md_rdev *rdev = bio->bi_private; + struct mddev *mddev = rdev->mddev; rdev_dec_pending(rdev, mddev); - if (atomic_dec_and_test(&fi->flush_pending)) { - if (bio->bi_iter.bi_size == 0) { - /* an empty barrier - all done */ - bio_endio(bio); - mempool_free(fi, mddev->flush_pool); - } else { - INIT_WORK(&fi->flush_work, submit_flushes); - queue_work(md_wq, &fi->flush_work); - } + if (atomic_dec_and_test(&mddev->flush_pending)) { + /* The pre-request flush has finished */ + queue_work(md_wq, &mddev->flush_work); } - - mempool_free(fb, mddev->flush_bio_pool); - bio_put(fbio); + bio_put(bio); } -void md_flush_request(struct mddev *mddev, struct bio *bio) +static void md_submit_flush_data(struct work_struct *ws); + +static void submit_flushes(struct work_struct *ws) { + struct mddev *mddev = container_of(ws, struct mddev, flush_work); struct md_rdev *rdev; - struct flush_info *fi; - - fi = mempool_alloc(mddev->flush_pool, GFP_NOIO); - - fi->bio = bio; - fi->mddev = mddev; - atomic_set(&fi->flush_pending, 1); + mddev->start_flush = ktime_get_boottime(); + INIT_WORK(&mddev->flush_work, md_submit_flush_data); + atomic_set(&mddev->flush_pending, 1); rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && @@ -480,37 +431,74 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) * we reclaim rcu_read_lock */ struct bio *bi; - struct flush_bio *fb; atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - - fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO); - fb->fi = fi; - fb->rdev = rdev; - bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); - bio_set_dev(bi, rdev->bdev); bi->bi_end_io = md_end_flush; - bi->bi_private = fb; + bi->bi_private = rdev; + bio_set_dev(bi, rdev->bdev); bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - - atomic_inc(&fi->flush_pending); + atomic_inc(&mddev->flush_pending); submit_bio(bi); - rcu_read_lock(); rdev_dec_pending(rdev, mddev); } rcu_read_unlock(); + if (atomic_dec_and_test(&mddev->flush_pending)) + queue_work(md_wq, &mddev->flush_work); +} - if (atomic_dec_and_test(&fi->flush_pending)) { - if (bio->bi_iter.bi_size == 0) { +static void md_submit_flush_data(struct work_struct *ws) +{ + struct mddev *mddev = container_of(ws, struct mddev, flush_work); + struct bio *bio = mddev->flush_bio; + + /* + * must reset flush_bio before calling into md_handle_request to avoid a + * deadlock, because other bios passed md_handle_request suspend check + * could wait for this and below md_handle_request could wait for those + * bios because of suspend check + */ + mddev->last_flush = mddev->start_flush; + mddev->flush_bio = NULL; + wake_up(&mddev->sb_wait); + + if (bio->bi_iter.bi_size == 0) { + /* an empty barrier - all done */ + bio_endio(bio); + } else { + bio->bi_opf &= ~REQ_PREFLUSH; + md_handle_request(mddev, bio); + } +} + +void md_flush_request(struct mddev *mddev, struct bio *bio) +{ + ktime_t start = ktime_get_boottime(); + spin_lock_irq(&mddev->lock); + wait_event_lock_irq(mddev->sb_wait, + !mddev->flush_bio || + ktime_after(mddev->last_flush, start), + mddev->lock); + if (!ktime_after(mddev->last_flush, start)) { + WARN_ON(mddev->flush_bio); + mddev->flush_bio = bio; + bio = NULL; + } + spin_unlock_irq(&mddev->lock); + + if (!bio) { + INIT_WORK(&mddev->flush_work, submit_flushes); + queue_work(md_wq, &mddev->flush_work); + } else { + /* flush was performed for some other bio while we waited. */ + if (bio->bi_iter.bi_size == 0) /* an empty barrier - all done */ bio_endio(bio); - mempool_free(fi, mddev->flush_pool); - } else { - INIT_WORK(&fi->flush_work, submit_flushes); - queue_work(md_wq, &fi->flush_work); + else { + bio->bi_opf &= ~REQ_PREFLUSH; + mddev->pers->make_request(mddev, bio); } } } @@ -560,6 +548,7 @@ void mddev_init(struct mddev *mddev) atomic_set(&mddev->openers, 0); atomic_set(&mddev->active_io, 0); spin_lock_init(&mddev->lock); + atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); init_waitqueue_head(&mddev->recovery_wait); mddev->reshape_position = MaxSector; @@ -1109,8 +1098,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor * (not needed for Linear and RAID0 as metadata doesn't * record this size) */ - if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) && - sb->level >= 1) + if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1) rdev->sectors = (sector_t)(2ULL << 32) - 2; if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) @@ -1408,8 +1396,7 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) /* Limit to 4TB as metadata cannot record more than that. * 4TB == 2^32 KB, or 2*2^32 sectors. */ - if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) && - rdev->mddev->level >= 1) + if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) num_sectors = (sector_t)(2ULL << 32) - 2; do { md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, @@ -1553,7 +1540,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ */ s32 offset; sector_t bb_sector; - u64 *bbp; + __le64 *bbp; int i; int sectors = le16_to_cpu(sb->bblog_size); if (sectors > (PAGE_SIZE / 512)) @@ -1565,7 +1552,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ if (!sync_page_io(rdev, bb_sector, sectors << 9, rdev->bb_page, REQ_OP_READ, 0, true)) return -EIO; - bbp = (u64 *)page_address(rdev->bb_page); + bbp = (__le64 *)page_address(rdev->bb_page); rdev->badblocks.shift = sb->bblog_shift; for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { u64 bb = le64_to_cpu(*bbp); @@ -1877,7 +1864,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) md_error(mddev, rdev); else { struct badblocks *bb = &rdev->badblocks; - u64 *bbp = (u64 *)page_address(rdev->bb_page); + __le64 *bbp = (__le64 *)page_address(rdev->bb_page); u64 *p = bb->page; sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); if (bb->changed) { @@ -2855,8 +2842,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) err = 0; } } else if (cmd_match(buf, "re-add")) { - if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && - rdev->saved_raid_disk >= 0) { + if (!rdev->mddev->pers) + err = -EINVAL; + else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && + rdev->saved_raid_disk >= 0) { /* clear_bit is performed _after_ all the devices * have their local Faulty bit cleared. If any writes * happen in the meantime in the local node, they @@ -3384,10 +3373,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - rv = mddev ? mddev_lock(mddev): -EBUSY; + rv = mddev ? mddev_lock(mddev) : -ENODEV; if (!rv) { if (rdev->mddev == NULL) - rv = -EBUSY; + rv = -ENODEV; else rv = entry->store(rdev, page, length); mddev_unlock(mddev); @@ -5511,22 +5500,6 @@ int md_run(struct mddev *mddev) if (err) return err; } - if (mddev->flush_pool == NULL) { - mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc, - flush_info_free, mddev); - if (!mddev->flush_pool) { - err = -ENOMEM; - goto abort; - } - } - if (mddev->flush_bio_pool == NULL) { - mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc, - flush_bio_free, mddev); - if (!mddev->flush_bio_pool) { - err = -ENOMEM; - goto abort; - } - } spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); @@ -5686,11 +5659,8 @@ int md_run(struct mddev *mddev) return 0; abort: - mempool_destroy(mddev->flush_bio_pool); - mddev->flush_bio_pool = NULL; - mempool_destroy(mddev->flush_pool); - mddev->flush_pool = NULL; - + bioset_exit(&mddev->bio_set); + bioset_exit(&mddev->sync_set); return err; } EXPORT_SYMBOL_GPL(md_run); @@ -5894,14 +5864,6 @@ static void __md_stop(struct mddev *mddev) mddev->to_remove = &md_redundancy_group; module_put(pers->owner); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - if (mddev->flush_bio_pool) { - mempool_destroy(mddev->flush_bio_pool); - mddev->flush_bio_pool = NULL; - } - if (mddev->flush_pool) { - mempool_destroy(mddev->flush_pool); - mddev->flush_pool = NULL; - } } void md_stop(struct mddev *mddev) @@ -7645,9 +7607,9 @@ static void status_unused(struct seq_file *seq) static int status_resync(struct seq_file *seq, struct mddev *mddev) { sector_t max_sectors, resync, res; - unsigned long dt, db; - sector_t rt; - int scale; + unsigned long dt, db = 0; + sector_t rt, curr_mark_cnt, resync_mark_cnt; + int scale, recovery_active; unsigned int per_milli; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || @@ -7736,22 +7698,30 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) * db: blocks written from mark until now * rt: remaining time * - * rt is a sector_t, so could be 32bit or 64bit. - * So we divide before multiply in case it is 32bit and close - * to the limit. - * We scale the divisor (db) by 32 to avoid losing precision - * near the end of resync when the number of remaining sectors - * is close to 'db'. - * We then divide rt by 32 after multiplying by db to compensate. - * The '+1' avoids division by zero if db is very small. + * rt is a sector_t, which is always 64bit now. We are keeping + * the original algorithm, but it is not really necessary. + * + * Original algorithm: + * So we divide before multiply in case it is 32bit and close + * to the limit. + * We scale the divisor (db) by 32 to avoid losing precision + * near the end of resync when the number of remaining sectors + * is close to 'db'. + * We then divide rt by 32 after multiplying by db to compensate. + * The '+1' avoids division by zero if db is very small. */ dt = ((jiffies - mddev->resync_mark) / HZ); if (!dt) dt++; - db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) - - mddev->resync_mark_cnt; + + curr_mark_cnt = mddev->curr_mark_cnt; + recovery_active = atomic_read(&mddev->recovery_active); + resync_mark_cnt = mddev->resync_mark_cnt; + + if (curr_mark_cnt >= (recovery_active + resync_mark_cnt)) + db = curr_mark_cnt - (recovery_active + resync_mark_cnt); rt = max_sectors - resync; /* number of remaining sectors */ - sector_div(rt, db/32+1); + rt = div64_u64(rt, db/32+1); rt *= dt; rt >>= 5; @@ -9257,7 +9227,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) * reshape is happening in the remote node, we need to * update reshape_position and call start_reshape. */ - mddev->reshape_position = sb->reshape_position; + mddev->reshape_position = le64_to_cpu(sb->reshape_position); if (mddev->pers->update_reshape_pos) mddev->pers->update_reshape_pos(mddev); if (mddev->pers->start_reshape) diff --git a/drivers/md/md.h b/drivers/md/md.h index c52afb52c776..7c930c091193 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -1,15 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* md.h : kernel internal structure of the Linux MD driver Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef _MD_MD_H @@ -252,19 +245,6 @@ enum mddev_sb_flags { MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ }; -#define NR_FLUSH_INFOS 8 -#define NR_FLUSH_BIOS 64 -struct flush_info { - struct bio *bio; - struct mddev *mddev; - struct work_struct flush_work; - atomic_t flush_pending; -}; -struct flush_bio { - struct flush_info *fi; - struct md_rdev *rdev; -}; - struct mddev { void *private; struct md_personality *pers; @@ -470,8 +450,16 @@ struct mddev { * metadata and bitmap writes */ - mempool_t *flush_pool; - mempool_t *flush_bio_pool; + /* Generic flush handling. + * The last to finish preflush schedules a worker to submit + * the rest of the request (without the REQ_PREFLUSH flag). + */ + struct bio *flush_bio; + atomic_t flush_pending; + ktime_t start_flush, last_flush; /* last_flush is when the last completed + * flush was started. + */ + struct work_struct flush_work; struct work_struct event_work; /* used by dm to report failure event */ void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig index a53cbc928af1..baaec1ae29c1 100644 --- a/drivers/md/persistent-data/Kconfig +++ b/drivers/md/persistent-data/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only config DM_PERSISTENT_DATA tristate depends on BLK_DEV_DM diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 3972232b8037..749ec268d957 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -35,7 +35,10 @@ #define MAX_HOLDERS 4 #define MAX_STACK 10 -typedef unsigned long stack_entries[MAX_STACK]; +struct stack_store { + unsigned int nr_entries; + unsigned long entries[MAX_STACK]; +}; struct block_lock { spinlock_t lock; @@ -44,8 +47,7 @@ struct block_lock { struct task_struct *holders[MAX_HOLDERS]; #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING - struct stack_trace traces[MAX_HOLDERS]; - stack_entries entries[MAX_HOLDERS]; + struct stack_store traces[MAX_HOLDERS]; #endif }; @@ -73,7 +75,7 @@ static void __add_holder(struct block_lock *lock, struct task_struct *task) { unsigned h = __find_holder(lock, NULL); #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING - struct stack_trace *t; + struct stack_store *t; #endif get_task_struct(task); @@ -81,11 +83,7 @@ static void __add_holder(struct block_lock *lock, struct task_struct *task) #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING t = lock->traces + h; - t->nr_entries = 0; - t->max_entries = MAX_STACK; - t->entries = lock->entries[h]; - t->skip = 2; - save_stack_trace(t); + t->nr_entries = stack_trace_save(t->entries, MAX_STACK, 2); #endif } @@ -106,7 +104,8 @@ static int __check_holder(struct block_lock *lock) DMERR("recursive lock detected in metadata"); #ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING DMERR("previously held here:"); - print_stack_trace(lock->traces + i, 4); + stack_trace_print(lock->traces[i].entries, + lock->traces[i].nr_entries, 4); DMERR("subsequent acquisition attempted here:"); dump_stack(); diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index 0a3b8ae4a29c..b8a62188f6be 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -190,6 +190,8 @@ static int sm_find_free(void *addr, unsigned begin, unsigned end, static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm) { + memset(ll, 0, sizeof(struct ll_disk)); + ll->tm = tm; ll->bitmap_info.tm = tm; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f3fb5bb8c82a..bf5cf184a260 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* raid0.c : Multiple Devices driver for Linux Copyright (C) 1994-96 Marc ZYNGIER @@ -7,14 +8,6 @@ RAID-0 management functions. - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include <linux/blkdev.h> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index fdf451aac369..2aa36e570e04 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * raid1.c : Multiple Devices driver for Linux * @@ -20,15 +21,6 @@ * * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology: * - persistent bitmap code - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * You should have received a copy of the GNU General Public License - * (for example /usr/src/linux/COPYING); if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include <linux/slab.h> @@ -2110,7 +2102,7 @@ static void process_checks(struct r1bio *r1_bio) } r1_bio->read_disk = primary; for (i = 0; i < conf->raid_disks * 2; i++) { - int j; + int j = 0; struct bio *pbio = r1_bio->bios[primary]; struct bio *sbio = r1_bio->bios[i]; blk_status_t status = sbio->bi_status; @@ -2125,8 +2117,8 @@ static void process_checks(struct r1bio *r1_bio) /* Now we can 'fixup' the error value */ sbio->bi_status = 0; - bio_for_each_segment_all(bi, sbio, j, iter_all) - page_len[j] = bi->bv_len; + bio_for_each_segment_all(bi, sbio, iter_all) + page_len[j++] = bi->bv_len; if (!status) { for (j = vcnt; j-- ; ) { diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 3b6880dd648d..aea11476fee6 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * raid10.c : Multiple Devices driver for Linux * @@ -6,16 +7,6 @@ * RAID-10 support for md. * * Base on code in raid1.c. See raid1.c for further copyright information. - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * You should have received a copy of the GNU General Public License - * (for example /usr/src/linux/COPYING); if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include <linux/slab.h> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index cbbe6b6535be..9b6da759dca2 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2015 Shaohua Li <shli@fb.com> * Copyright (C) 2016 Song Liu <songliubraving@fb.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * */ #include <linux/kernel.h> #include <linux/wait.h> diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 17e9e7d51097..18a4064a61a8 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Partial Parity Log for closing the RAID5 write hole * Copyright (c) 2017, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #include <linux/kernel.h> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c033bfcb209e..b83bce2beb66 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * raid5.c : Multiple Devices driver for Linux * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman @@ -7,15 +8,6 @@ * RAID-4/5/6 management functions. * Thanks to Penguin Computing for making the RAID-6 development possible * by donating a test server! - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * You should have received a copy of the GNU General Public License - * (for example /usr/src/linux/COPYING); if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* @@ -711,6 +703,8 @@ static bool is_full_stripe_write(struct stripe_head *sh) } static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) + __acquires(&sh1->stripe_lock) + __acquires(&sh2->stripe_lock) { if (sh1 > sh2) { spin_lock_irq(&sh2->stripe_lock); @@ -722,6 +716,8 @@ static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) } static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) + __releases(&sh1->stripe_lock) + __releases(&sh2->stripe_lock) { spin_unlock(&sh1->stripe_lock); spin_unlock_irq(&sh2->stripe_lock); @@ -4187,7 +4183,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, /* now write out any block on a failed drive, * or P or Q if they were recomputed */ - BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ + dev = NULL; if (s->failed == 2) { dev = &sh->dev[s->failed_num[1]]; s->locked++; @@ -4212,6 +4208,14 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); } + if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags), + "%s: disk%td not up to date\n", + mdname(conf->mddev), + dev - (struct r5dev *) &sh->dev)) { + clear_bit(R5_LOCKED, &dev->flags); + clear_bit(R5_Wantwrite, &dev->flags); + s->locked--; + } clear_bit(STRIPE_DEGRADED, &sh->state); set_bit(STRIPE_INSYNC, &sh->state); @@ -6166,6 +6170,8 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, static int handle_active_stripes(struct r5conf *conf, int group, struct r5worker *worker, struct list_head *temp_inactive_list) + __releases(&conf->device_lock) + __acquires(&conf->device_lock) { struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; int i, batch_size = 0, hash; |