diff options
Diffstat (limited to 'drivers/md')
61 files changed, 1710 insertions, 1029 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 921888df6764..30ba3573626c 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -27,7 +27,7 @@ config BLK_DEV_MD More information about Software RAID on Linux is contained in the Software RAID mini-HOWTO, available from - <http://www.tldp.org/docs.html#howto>. There you will also learn + <https://www.tldp.org/docs.html#howto>. There you will also learn where to get the supporting user space utilities raidtools. If unsure, say N. @@ -71,7 +71,7 @@ config MD_RAID0 Information about Software RAID on Linux is contained in the Software-RAID mini-HOWTO, available from - <http://www.tldp.org/docs.html#howto>. There you will also + <https://www.tldp.org/docs.html#howto>. There you will also learn where to get the supporting user space utilities raidtools. To compile this as a module, choose M here: the module @@ -93,7 +93,7 @@ config MD_RAID1 Information about Software RAID on Linux is contained in the Software-RAID mini-HOWTO, available from - <http://www.tldp.org/docs.html#howto>. There you will also + <https://www.tldp.org/docs.html#howto>. There you will also learn where to get the supporting user space utilities raidtools. If you want to use such a RAID-1 set, say Y. To compile this code @@ -148,7 +148,7 @@ config MD_RAID456 Information about Software RAID on Linux is contained in the Software-RAID mini-HOWTO, available from - <http://www.tldp.org/docs.html#howto>. There you will also + <https://www.tldp.org/docs.html#howto>. There you will also learn where to get the supporting user space utilities raidtools. If you want to use such a RAID-4/RAID-5/RAID-6 set, say Y. 
To diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 31840f95cd40..6d3e234dc46a 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -43,6 +43,9 @@ obj-$(CONFIG_MD_FAULTY) += faulty.o obj-$(CONFIG_MD_CLUSTER) += md-cluster.o obj-$(CONFIG_BCACHE) += bcache/ obj-$(CONFIG_BLK_DEV_MD) += md-mod.o +ifeq ($(CONFIG_BLK_DEV_MD),y) +obj-y += md-autodetect.o +endif obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o obj-$(CONFIG_BLK_DEV_DM_BUILTIN) += dm-builtin.o obj-$(CONFIG_DM_UNSTRIPED) += dm-unstripe.o diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index bf7dd96db9b3..d1ca4d059c20 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -27,7 +27,7 @@ config BCACHE_CLOSURES_DEBUG interface to list them, which makes it possible to see asynchronous operations that get stuck. -config BCACHE_ASYNC_REGISTRAION +config BCACHE_ASYNC_REGISTRATION bool "Asynchronous device registration (EXPERIMENTAL)" depends on BCACHE help diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index fd714628da6a..5b87e59676b8 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_BCACHE) += bcache.o bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ - util.o writeback.o + util.o writeback.o features.o diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index a1df0d95151c..52035a78d836 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -87,7 +87,7 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) { struct cache *ca; struct bucket *b; - unsigned int next = c->nbuckets * c->sb.bucket_size / 1024; + unsigned long next = c->nbuckets * c->sb.bucket_size / 1024; unsigned int i; int r; diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 221e0191b687..4fd03d2496d8 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -264,7 
+264,7 @@ struct bcache_device { #define BCACHE_DEV_UNLINK_DONE 2 #define BCACHE_DEV_WB_RUNNING 3 #define BCACHE_DEV_RATE_DW_RUNNING 4 - unsigned int nr_stripes; + int nr_stripes; unsigned int stripe_size; atomic_t *stripe_sectors_dirty; unsigned long *full_dirty_stripes; @@ -762,11 +762,32 @@ struct bbio { #define bucket_bytes(c) ((c)->sb.bucket_size << 9) #define block_bytes(c) ((c)->sb.block_size << 9) -#define prios_per_bucket(c) \ - ((bucket_bytes(c) - sizeof(struct prio_set)) / \ +static inline unsigned int meta_bucket_pages(struct cache_sb *sb) +{ + unsigned int n, max_pages; + + max_pages = min_t(unsigned int, + __rounddown_pow_of_two(USHRT_MAX) / PAGE_SECTORS, + MAX_ORDER_NR_PAGES); + + n = sb->bucket_size / PAGE_SECTORS; + if (n > max_pages) + n = max_pages; + + return n; +} + +static inline unsigned int meta_bucket_bytes(struct cache_sb *sb) +{ + return meta_bucket_pages(sb) << PAGE_SHIFT; +} + +#define prios_per_bucket(ca) \ + ((meta_bucket_bytes(&(ca)->sb) - sizeof(struct prio_set)) / \ sizeof(struct bucket_disk)) -#define prio_buckets(c) \ - DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c)) + +#define prio_buckets(ca) \ + DIV_ROUND_UP((size_t) (ca)->sb.nbuckets, prios_per_bucket(ca)) static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) { @@ -929,7 +950,7 @@ static inline void closure_bio_submit(struct cache_set *c, bio_endio(bio); return; } - generic_make_request(bio); + submit_bio_noacct(bio); } /* diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 4995fcaefe29..67a2c47f4201 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -322,7 +322,7 @@ int bch_btree_keys_alloc(struct btree_keys *b, b->page_order = page_order; - t->data = (void *) __get_free_pages(gfp, b->page_order); + t->data = (void *) __get_free_pages(__GFP_COMP|gfp, b->page_order); if (!t->data) goto err; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 6548a601edf0..3d8bd0692af3 100644 --- 
a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -738,7 +738,7 @@ void bch_btree_cache_free(struct cache_set *c) if (c->verify_data) list_move(&c->verify_data->list, &c->btree_cache); - free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c))); + free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->sb))); #endif list_splice(&c->btree_cache_freeable, @@ -785,7 +785,15 @@ int bch_btree_cache_alloc(struct cache_set *c) mutex_init(&c->verify_lock); c->verify_ondisk = (void *) - __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c))); + __get_free_pages(GFP_KERNEL|__GFP_COMP, ilog2(meta_bucket_pages(&c->sb))); + if (!c->verify_ondisk) { + /* + * Don't worry about the mca_rereserve buckets + * allocated in previous for-loop, they will be + * handled properly in bch_cache_set_unregister(). + */ + return -ENOMEM; + } c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); @@ -959,7 +967,7 @@ err: * bch_btree_node_get - find a btree node in the cache and lock it, reading it * in from disk if necessary. * - * If IO is necessary and running under generic_make_request, returns -EAGAIN. + * If IO is necessary and running under submit_bio_noacct, returns -EAGAIN. * * The btree node will have either a read or a write lock held, depending on * level and op->lock. diff --git a/drivers/md/bcache/features.c b/drivers/md/bcache/features.c new file mode 100644 index 000000000000..4442df48d28c --- /dev/null +++ b/drivers/md/bcache/features.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Feature set bits and string conversion. + * Inspired by ext4's features compat/incompat/ro_compat related code. 
+ * + * Copyright 2020 Coly Li <colyli@suse.de> + * + */ +#include <linux/bcache.h> +#include "bcache.h" +#include "features.h" + +struct feature { + int compat; + unsigned int mask; + const char *string; +}; + +static struct feature feature_list[] = { + {BCH_FEATURE_INCOMPAT, BCH_FEATURE_INCOMPAT_LARGE_BUCKET, + "large_bucket"}, + {0, 0, 0 }, +}; + +#define compose_feature_string(type) \ +({ \ + struct feature *f; \ + bool first = true; \ + \ + for (f = &feature_list[0]; f->compat != 0; f++) { \ + if (f->compat != BCH_FEATURE_ ## type) \ + continue; \ + if (BCH_HAS_ ## type ## _FEATURE(&c->sb, f->mask)) { \ + if (first) { \ + out += snprintf(out, buf + size - out, \ + "["); \ + } else { \ + out += snprintf(out, buf + size - out, \ + " ["); \ + } \ + } else if (!first) { \ + out += snprintf(out, buf + size - out, " "); \ + } \ + \ + out += snprintf(out, buf + size - out, "%s", f->string);\ + \ + if (BCH_HAS_ ## type ## _FEATURE(&c->sb, f->mask)) \ + out += snprintf(out, buf + size - out, "]"); \ + \ + first = false; \ + } \ + if (!first) \ + out += snprintf(out, buf + size - out, "\n"); \ +}) + +int bch_print_cache_set_feature_compat(struct cache_set *c, char *buf, int size) +{ + char *out = buf; + compose_feature_string(COMPAT); + return out - buf; +} + +int bch_print_cache_set_feature_ro_compat(struct cache_set *c, char *buf, int size) +{ + char *out = buf; + compose_feature_string(RO_COMPAT); + return out - buf; +} + +int bch_print_cache_set_feature_incompat(struct cache_set *c, char *buf, int size) +{ + char *out = buf; + compose_feature_string(INCOMPAT); + return out - buf; +} diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h new file mode 100644 index 000000000000..a1653c478041 --- /dev/null +++ b/drivers/md/bcache/features.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _BCACHE_FEATURES_H +#define _BCACHE_FEATURES_H + +#include <linux/bcache.h> +#include <linux/kernel.h> +#include 
<linux/types.h> + +#define BCH_FEATURE_COMPAT 0 +#define BCH_FEATURE_RO_COMPAT 1 +#define BCH_FEATURE_INCOMPAT 2 +#define BCH_FEATURE_TYPE_MASK 0x03 + +/* Feature set definition */ +/* Incompat feature set */ +#define BCH_FEATURE_INCOMPAT_LARGE_BUCKET 0x0001 /* 32bit bucket size */ + +#define BCH_FEATURE_COMPAT_SUUP 0 +#define BCH_FEATURE_RO_COMPAT_SUUP 0 +#define BCH_FEATURE_INCOMPAT_SUUP BCH_FEATURE_INCOMPAT_LARGE_BUCKET + +#define BCH_HAS_COMPAT_FEATURE(sb, mask) \ + ((sb)->feature_compat & (mask)) +#define BCH_HAS_RO_COMPAT_FEATURE(sb, mask) \ + ((sb)->feature_ro_compat & (mask)) +#define BCH_HAS_INCOMPAT_FEATURE(sb, mask) \ + ((sb)->feature_incompat & (mask)) + +#define BCH_FEATURE_COMPAT_FUNCS(name, flagname) \ +static inline int bch_has_feature_##name(struct cache_sb *sb) \ +{ \ + return (((sb)->feature_compat & \ + BCH##_FEATURE_COMPAT_##flagname) != 0); \ +} \ +static inline void bch_set_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_compat |= \ + BCH##_FEATURE_COMPAT_##flagname; \ +} \ +static inline void bch_clear_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_compat &= \ + ~BCH##_FEATURE_COMPAT_##flagname; \ +} + +#define BCH_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ +static inline int bch_has_feature_##name(struct cache_sb *sb) \ +{ \ + return (((sb)->feature_ro_compat & \ + BCH##_FEATURE_RO_COMPAT_##flagname) != 0); \ +} \ +static inline void bch_set_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_ro_compat |= \ + BCH##_FEATURE_RO_COMPAT_##flagname; \ +} \ +static inline void bch_clear_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_ro_compat &= \ + ~BCH##_FEATURE_RO_COMPAT_##flagname; \ +} + +#define BCH_FEATURE_INCOMPAT_FUNCS(name, flagname) \ +static inline int bch_has_feature_##name(struct cache_sb *sb) \ +{ \ + return (((sb)->feature_incompat & \ + BCH##_FEATURE_INCOMPAT_##flagname) != 0); \ +} \ +static inline void bch_set_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_incompat |= \ + 
BCH##_FEATURE_INCOMPAT_##flagname; \ +} \ +static inline void bch_clear_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_incompat &= \ + ~BCH##_FEATURE_INCOMPAT_##flagname; \ +} + +BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LARGE_BUCKET); + +int bch_print_cache_set_feature_compat(struct cache_set *c, char *buf, int size); +int bch_print_cache_set_feature_ro_compat(struct cache_set *c, char *buf, int size); +int bch_print_cache_set_feature_incompat(struct cache_set *c, char *buf, int size); + +#endif diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index b25ee33b0d0b..a14a445618b4 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -26,7 +26,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c) struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO); struct bio *bio = &b->bio; - bio_init(bio, bio->bi_inline_vecs, bucket_pages(c)); + bio_init(bio, bio->bi_inline_vecs, meta_bucket_pages(&c->sb)); return bio; } diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 90aac4e2333f..77fbfd52edcf 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -217,10 +217,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) */ pr_debug("falling back to linear search\n"); - for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets); - l < ca->sb.njournal_buckets; - l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, - l + 1)) + for_each_clear_bit(l, bitmap, ca->sb.njournal_buckets) if (read_bucket(l)) goto bsearch; @@ -999,8 +996,8 @@ int bch_journal_alloc(struct cache_set *c) j->w[1].c = c; if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || - !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || - !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) + !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS)) || + !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS))) return -ENOMEM; return 0; diff --git 
a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 7891fb512736..5872d6470470 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -145,8 +145,8 @@ static void read_moving(struct cache_set *c) continue; } - io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) - * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs, + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), GFP_KERNEL); if (!io) goto err; @@ -206,8 +206,8 @@ void bch_moving_gc(struct cache_set *c) mutex_lock(&c->bucket_lock); for_each_cache(ca, c, i) { - unsigned int sectors_to_move = 0; - unsigned int reserve_sectors = ca->sb.bucket_size * + unsigned long sectors_to_move = 0; + unsigned long reserve_sectors = ca->sb.bucket_size * fifo_used(&ca->free[RESERVE_MOVINGGC]); ca->heap.used = 0; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 7acf024e99f3..c7cadaafa947 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -668,7 +668,9 @@ static void backing_request_endio(struct bio *bio) static void bio_complete(struct search *s) { if (s->orig_bio) { - bio_end_io_acct(s->orig_bio, s->start_time); + /* Count on bcache device */ + disk_end_io_acct(s->d->disk, bio_op(s->orig_bio), s->start_time); + trace_bcache_request_end(s->d, s->orig_bio); s->orig_bio->bi_status = s->iop.status; bio_endio(s->orig_bio); @@ -728,8 +730,8 @@ static inline struct search *search_alloc(struct bio *bio, s->recoverable = 1; s->write = op_is_write(bio_op(bio)); s->read_dirty_data = 0; - s->start_time = bio_start_io_acct(bio); - + /* Count on the bcache device */ + s->start_time = disk_start_io_acct(d->disk, bio_sectors(bio), bio_op(bio)); s->iop.c = d->c; s->iop.bio = NULL; s->iop.inode = d->id; @@ -1080,7 +1082,8 @@ static void detached_dev_end_io(struct bio *bio) bio->bi_end_io = ddip->bi_end_io; bio->bi_private = ddip->bi_private; - bio_end_io_acct(bio, ddip->start_time); + /* Count 
on the bcache device */ + disk_end_io_acct(ddip->d->disk, bio_op(bio), ddip->start_time); if (bio->bi_status) { struct cached_dev *dc = container_of(ddip->d, @@ -1105,7 +1108,8 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) */ ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); ddip->d = d; - ddip->start_time = bio_start_io_acct(bio); + /* Count on the bcache device */ + ddip->start_time = disk_start_io_acct(d->disk, bio_sectors(bio), bio_op(bio)); ddip->bi_end_io = bio->bi_end_io; ddip->bi_private = bio->bi_private; bio->bi_end_io = detached_dev_end_io; @@ -1115,7 +1119,7 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) !blk_queue_discard(bdev_get_queue(dc->bdev))) bio->bi_end_io(bio); else - generic_make_request(bio); + submit_bio_noacct(bio); } static void quit_max_writeback_rate(struct cache_set *c, @@ -1158,7 +1162,7 @@ static void quit_max_writeback_rate(struct cache_set *c, /* Cached devices - read & write stuff */ -blk_qc_t cached_dev_make_request(struct request_queue *q, struct bio *bio) +blk_qc_t cached_dev_submit_bio(struct bio *bio) { struct search *s; struct bcache_device *d = bio->bi_disk->private_data; @@ -1197,7 +1201,7 @@ blk_qc_t cached_dev_make_request(struct request_queue *q, struct bio *bio) if (!bio->bi_iter.bi_size) { /* * can't call bch_journal_meta from under - * generic_make_request + * submit_bio_noacct */ continue_at_nobarrier(&s->cl, cached_dev_nodata, @@ -1228,36 +1232,8 @@ static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); } -static int cached_dev_congested(void *data, int bits) -{ - struct bcache_device *d = data; - struct cached_dev *dc = container_of(d, struct cached_dev, disk); - struct request_queue *q = bdev_get_queue(dc->bdev); - int ret = 0; - - if (bdi_congested(q->backing_dev_info, bits)) - return 1; - - if (cached_dev_get(dc)) { - unsigned int i; - struct cache *ca; - - 
for_each_cache(ca, d->c, i) { - q = bdev_get_queue(ca->bdev); - ret |= bdi_congested(q->backing_dev_info, bits); - } - - cached_dev_put(dc); - } - - return ret; -} - void bch_cached_dev_request_init(struct cached_dev *dc) { - struct gendisk *g = dc->disk.disk; - - g->queue->backing_dev_info->congested_fn = cached_dev_congested; dc->disk.cache_miss = cached_dev_cache_miss; dc->disk.ioctl = cached_dev_ioctl; } @@ -1291,7 +1267,7 @@ static void flash_dev_nodata(struct closure *cl) continue_at(cl, search_free, NULL); } -blk_qc_t flash_dev_make_request(struct request_queue *q, struct bio *bio) +blk_qc_t flash_dev_submit_bio(struct bio *bio) { struct search *s; struct closure *cl; @@ -1311,8 +1287,7 @@ blk_qc_t flash_dev_make_request(struct request_queue *q, struct bio *bio) if (!bio->bi_iter.bi_size) { /* - * can't call bch_journal_meta from under - * generic_make_request + * can't call bch_journal_meta from under submit_bio_noacct */ continue_at_nobarrier(&s->cl, flash_dev_nodata, @@ -1342,27 +1317,8 @@ static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode, return -ENOTTY; } -static int flash_dev_congested(void *data, int bits) -{ - struct bcache_device *d = data; - struct request_queue *q; - struct cache *ca; - unsigned int i; - int ret = 0; - - for_each_cache(ca, d->c, i) { - q = bdev_get_queue(ca->bdev); - ret |= bdi_congested(q->backing_dev_info, bits); - } - - return ret; -} - void bch_flash_dev_request_init(struct bcache_device *d) { - struct gendisk *g = d->disk; - - g->queue->backing_dev_info->congested_fn = flash_dev_congested; d->cache_miss = flash_dev_cache_miss; d->ioctl = flash_dev_ioctl; } diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index bb005c93dd72..82b38366a95d 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -37,10 +37,10 @@ unsigned int bch_get_congested(const struct cache_set *c); void bch_data_insert(struct closure *cl); void bch_cached_dev_request_init(struct cached_dev *dc); 
-blk_qc_t cached_dev_make_request(struct request_queue *q, struct bio *bio); +blk_qc_t cached_dev_submit_bio(struct bio *bio); void bch_flash_dev_request_init(struct bcache_device *d); -blk_qc_t flash_dev_make_request(struct request_queue *q, struct bio *bio); +blk_qc_t flash_dev_submit_bio(struct bio *bio); extern struct kmem_cache *bch_search_cache; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 2014016f9a60..1bbdc410ee3c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -13,6 +13,7 @@ #include "extents.h" #include "request.h" #include "writeback.h" +#include "features.h" #include <linux/blkdev.h> #include <linux/debugfs.h> @@ -59,6 +60,92 @@ struct workqueue_struct *bch_journal_wq; /* Superblock */ +static unsigned int get_bucket_size(struct cache_sb *sb, struct cache_sb_disk *s) +{ + unsigned int bucket_size = le16_to_cpu(s->bucket_size); + + if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES && + bch_has_feature_large_bucket(sb)) + bucket_size |= le16_to_cpu(s->bucket_size_hi) << 16; + + return bucket_size; +} + +static const char *read_super_common(struct cache_sb *sb, struct block_device *bdev, + struct cache_sb_disk *s) +{ + const char *err; + unsigned int i; + + sb->first_bucket= le16_to_cpu(s->first_bucket); + sb->nbuckets = le64_to_cpu(s->nbuckets); + sb->bucket_size = get_bucket_size(sb, s); + + sb->nr_in_set = le16_to_cpu(s->nr_in_set); + sb->nr_this_dev = le16_to_cpu(s->nr_this_dev); + + err = "Too many journal buckets"; + if (sb->keys > SB_JOURNAL_BUCKETS) + goto err; + + err = "Too many buckets"; + if (sb->nbuckets > LONG_MAX) + goto err; + + err = "Not enough buckets"; + if (sb->nbuckets < 1 << 7) + goto err; + + err = "Bad block size (not power of 2)"; + if (!is_power_of_2(sb->block_size)) + goto err; + + err = "Bad block size (larger than page size)"; + if (sb->block_size > PAGE_SECTORS) + goto err; + + err = "Bad bucket size (not power of 2)"; + if (!is_power_of_2(sb->bucket_size)) + goto 
err; + + err = "Bad bucket size (smaller than page size)"; + if (sb->bucket_size < PAGE_SECTORS) + goto err; + + err = "Invalid superblock: device too small"; + if (get_capacity(bdev->bd_disk) < + sb->bucket_size * sb->nbuckets) + goto err; + + err = "Bad UUID"; + if (bch_is_zero(sb->set_uuid, 16)) + goto err; + + err = "Bad cache device number in set"; + if (!sb->nr_in_set || + sb->nr_in_set <= sb->nr_this_dev || + sb->nr_in_set > MAX_CACHES_PER_SET) + goto err; + + err = "Journal buckets not sequential"; + for (i = 0; i < sb->keys; i++) + if (sb->d[i] != sb->first_bucket + i) + goto err; + + err = "Too many journal buckets"; + if (sb->first_bucket + sb->keys > sb->nbuckets) + goto err; + + err = "Invalid superblock: first bucket comes before end of super"; + if (sb->first_bucket * sb->bucket_size < 16) + goto err; + + err = NULL; +err: + return err; +} + + static const char *read_super(struct cache_sb *sb, struct block_device *bdev, struct cache_sb_disk **res) { @@ -84,7 +171,6 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, sb->flags = le64_to_cpu(s->flags); sb->seq = le64_to_cpu(s->seq); sb->last_mount = le32_to_cpu(s->last_mount); - sb->first_bucket = le16_to_cpu(s->first_bucket); sb->keys = le16_to_cpu(s->keys); for (i = 0; i < SB_JOURNAL_BUCKETS; i++) @@ -101,10 +187,6 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, if (memcmp(sb->magic, bcache_magic, 16)) goto err; - err = "Too many journal buckets"; - if (sb->keys > SB_JOURNAL_BUCKETS) - goto err; - err = "Bad checksum"; if (s->csum != csum_set(s)) goto err; @@ -124,6 +206,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, sb->data_offset = BDEV_DATA_START_DEFAULT; break; case BCACHE_SB_VERSION_BDEV_WITH_OFFSET: + case BCACHE_SB_VERSION_BDEV_WITH_FEATURES: sb->data_offset = le64_to_cpu(s->data_offset); err = "Bad data offset"; @@ -133,55 +216,21 @@ static const char *read_super(struct cache_sb *sb, struct 
block_device *bdev, break; case BCACHE_SB_VERSION_CDEV: case BCACHE_SB_VERSION_CDEV_WITH_UUID: - sb->nbuckets = le64_to_cpu(s->nbuckets); - sb->bucket_size = le16_to_cpu(s->bucket_size); - - sb->nr_in_set = le16_to_cpu(s->nr_in_set); - sb->nr_this_dev = le16_to_cpu(s->nr_this_dev); - - err = "Too many buckets"; - if (sb->nbuckets > LONG_MAX) - goto err; - - err = "Not enough buckets"; - if (sb->nbuckets < 1 << 7) - goto err; - - err = "Bad block/bucket size"; - if (!is_power_of_2(sb->block_size) || - sb->block_size > PAGE_SECTORS || - !is_power_of_2(sb->bucket_size) || - sb->bucket_size < PAGE_SECTORS) - goto err; - - err = "Invalid superblock: device too small"; - if (get_capacity(bdev->bd_disk) < - sb->bucket_size * sb->nbuckets) - goto err; - - err = "Bad UUID"; - if (bch_is_zero(sb->set_uuid, 16)) - goto err; - - err = "Bad cache device number in set"; - if (!sb->nr_in_set || - sb->nr_in_set <= sb->nr_this_dev || - sb->nr_in_set > MAX_CACHES_PER_SET) - goto err; - - err = "Journal buckets not sequential"; - for (i = 0; i < sb->keys; i++) - if (sb->d[i] != sb->first_bucket + i) - goto err; - - err = "Too many journal buckets"; - if (sb->first_bucket + sb->keys > sb->nbuckets) + err = read_super_common(sb, bdev, s); + if (err) goto err; - - err = "Invalid superblock: first bucket comes before end of super"; - if (sb->first_bucket * sb->bucket_size < 16) + break; + case BCACHE_SB_VERSION_CDEV_WITH_FEATURES: + /* + * Feature bits are needed in read_super_common(), + * convert them firstly. 
+ */ + sb->feature_compat = le64_to_cpu(s->feature_compat); + sb->feature_incompat = le64_to_cpu(s->feature_incompat); + sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat); + err = read_super_common(sb, bdev, s); + if (err) goto err; - break; default: err = "Unsupported superblock version"; @@ -217,7 +266,6 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, offset_in_page(out)); out->offset = cpu_to_le64(sb->offset); - out->version = cpu_to_le64(sb->version); memcpy(out->uuid, sb->uuid, 16); memcpy(out->set_uuid, sb->set_uuid, 16); @@ -233,6 +281,13 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, for (i = 0; i < sb->keys; i++) out->d[i] = cpu_to_le64(sb->d[i]); + if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) { + out->feature_compat = cpu_to_le64(sb->feature_compat); + out->feature_incompat = cpu_to_le64(sb->feature_incompat); + out->feature_ro_compat = cpu_to_le64(sb->feature_ro_compat); + } + + out->version = cpu_to_le64(sb->version); out->csum = csum_set(out); pr_debug("ver %llu, flags %llu, seq %llu\n", @@ -289,17 +344,20 @@ void bcache_write_super(struct cache_set *c) { struct closure *cl = &c->sb_write; struct cache *ca; - unsigned int i; + unsigned int i, version = BCACHE_SB_VERSION_CDEV_WITH_UUID; down(&c->sb_write_mutex); closure_init(cl, &c->cl); c->sb.seq++; + if (c->sb.version > version) + version = c->sb.version; + for_each_cache(ca, c, i) { struct bio *bio = &ca->sb_bio; - ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID; + ca->sb.version = version; ca->sb.seq = c->sb.seq; ca->sb.last_mount = c->sb.last_mount; @@ -423,6 +481,7 @@ static int __uuid_write(struct cache_set *c) BKEY_PADDED(key) k; struct closure cl; struct cache *ca; + unsigned int size; closure_init_stack(&cl); lockdep_assert_held(&bch_register_lock); @@ -430,7 +489,8 @@ static int __uuid_write(struct cache_set *c) if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true)) return 1; - SET_KEY_SIZE(&k.key, 
c->sb.bucket_size); + size = meta_bucket_pages(&c->sb) * PAGE_SECTORS; + SET_KEY_SIZE(&k.key, size); uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl); closure_sync(&cl); @@ -518,7 +578,7 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size; bio_set_dev(bio, ca->bdev); - bio->bi_iter.bi_size = bucket_bytes(ca); + bio->bi_iter.bi_size = meta_bucket_bytes(&ca->sb); bio->bi_end_io = prio_endio; bio->bi_private = ca; @@ -576,7 +636,7 @@ int bch_prio_write(struct cache *ca, bool wait) p->next_bucket = ca->prio_buckets[i + 1]; p->magic = pset_magic(&ca->sb); - p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); + p->csum = bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8); bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait); BUG_ON(bucket == -1); @@ -629,7 +689,7 @@ static int prio_read(struct cache *ca, uint64_t bucket) prio_io(ca, bucket, REQ_OP_READ, 0); if (p->csum != - bch_crc64(&p->magic, bucket_bytes(ca) - 8)) { + bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8)) { pr_warn("bad csum reading priorities\n"); goto out; } @@ -680,7 +740,16 @@ static int ioctl_dev(struct block_device *b, fmode_t mode, return d->ioctl(d, mode, cmd, arg); } -static const struct block_device_operations bcache_ops = { +static const struct block_device_operations bcache_cached_ops = { + .submit_bio = cached_dev_submit_bio, + .open = open_dev, + .release = release_dev, + .ioctl = ioctl_dev, + .owner = THIS_MODULE, +}; + +static const struct block_device_operations bcache_flash_ops = { + .submit_bio = flash_dev_submit_bio, .open = open_dev, .release = release_dev, .ioctl = ioctl_dev, @@ -820,25 +889,25 @@ static void bcache_device_free(struct bcache_device *d) } static int bcache_device_init(struct bcache_device *d, unsigned int block_size, - sector_t sectors, make_request_fn make_request_fn, - struct block_device *cached_bdev) + sector_t sectors, struct block_device *cached_bdev, + const struct block_device_operations *ops) { 
struct request_queue *q; const size_t max_stripes = min_t(size_t, INT_MAX, SIZE_MAX / sizeof(atomic_t)); - size_t n; + uint64_t n; int idx; if (!d->stripe_size) d->stripe_size = 1 << 31; - d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size); - - if (!d->nr_stripes || d->nr_stripes > max_stripes) { - pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)\n", - (unsigned int)d->nr_stripes); + n = DIV_ROUND_UP_ULL(sectors, d->stripe_size); + if (!n || n > max_stripes) { + pr_err("nr_stripes too large or invalid: %llu (start sector beyond end of disk?)\n", + n); return -ENOMEM; } + d->nr_stripes = n; n = d->nr_stripes * sizeof(atomic_t); d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL); @@ -868,16 +937,14 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, d->disk->major = bcache_major; d->disk->first_minor = idx_to_first_minor(idx); - d->disk->fops = &bcache_ops; + d->disk->fops = ops; d->disk->private_data = d; - q = blk_alloc_queue(make_request_fn, NUMA_NO_NODE); + q = blk_alloc_queue(NUMA_NO_NODE); if (!q) return -ENOMEM; d->disk->queue = q; - q->queuedata = d; - q->backing_dev_info->congested_data = d; q->limits.max_hw_sectors = UINT_MAX; q->limits.max_sectors = UINT_MAX; q->limits.max_segment_size = UINT_MAX; @@ -1356,7 +1423,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) ret = bcache_device_init(&dc->disk, block_size, dc->bdev->bd_part->nr_sects - dc->sb.data_offset, - cached_dev_make_request, dc->bdev); + dc->bdev, &bcache_cached_ops); if (ret) return ret; @@ -1469,7 +1536,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) kobject_init(&d->kobj, &bch_flash_dev_ktype); if (bcache_device_init(d, block_bytes(c), u->sectors, - flash_dev_make_request, NULL)) + NULL, &bcache_flash_ops)) goto err; bcache_device_attach(d, c, u - c->uuids); @@ -1613,7 +1680,7 @@ static void cache_set_free(struct closure *cl) } bch_bset_sort_state_free(&c->sort); - 
free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); + free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->sb))); if (c->moving_gc_wq) destroy_workqueue(c->moving_gc_wq); @@ -1776,7 +1843,10 @@ void bch_cache_set_unregister(struct cache_set *c) } #define alloc_bucket_pages(gfp, c) \ - ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c)))) + ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(bucket_pages(c)))) + +#define alloc_meta_bucket_pages(gfp, sb) \ + ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(meta_bucket_pages(sb)))) struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) { @@ -1807,12 +1877,19 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->sb.bucket_size = sb->bucket_size; c->sb.nr_in_set = sb->nr_in_set; c->sb.last_mount = sb->last_mount; + c->sb.version = sb->version; + if (c->sb.version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) { + c->sb.feature_compat = sb->feature_compat; + c->sb.feature_ro_compat = sb->feature_ro_compat; + c->sb.feature_incompat = sb->feature_incompat; + } + c->bucket_bits = ilog2(sb->bucket_size); c->block_bits = ilog2(sb->block_size); - c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); + c->nr_uuids = meta_bucket_bytes(&c->sb) / sizeof(struct uuid_entry); c->devices_max_used = 0; atomic_set(&c->attached_dev_nr, 0); - c->btree_pages = bucket_pages(c); + c->btree_pages = meta_bucket_pages(&c->sb); if (c->btree_pages > BTREE_MAX_PAGES) c->btree_pages = max_t(int, c->btree_pages / 4, BTREE_MAX_PAGES); @@ -1838,24 +1915,46 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) INIT_LIST_HEAD(&c->btree_cache_freed); INIT_LIST_HEAD(&c->data_buckets); - iter_size = (sb->bucket_size / sb->block_size + 1) * + iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) * sizeof(struct btree_iter_set); - if (!(c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL)) || - mempool_init_slab_pool(&c->search, 32, bch_search_cache) || 
- mempool_init_kmalloc_pool(&c->bio_meta, 2, - sizeof(struct bbio) + sizeof(struct bio_vec) * - bucket_pages(c)) || - mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || - bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio), - BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) || - !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || - !(c->moving_gc_wq = alloc_workqueue("bcache_gc", - WQ_MEM_RECLAIM, 0)) || - bch_journal_alloc(c) || - bch_btree_cache_alloc(c) || - bch_open_buckets_alloc(c) || - bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages))) + c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL); + if (!c->devices) + goto err; + + if (mempool_init_slab_pool(&c->search, 32, bch_search_cache)) + goto err; + + if (mempool_init_kmalloc_pool(&c->bio_meta, 2, + sizeof(struct bbio) + + sizeof(struct bio_vec) * meta_bucket_pages(&c->sb))) + goto err; + + if (mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size)) + goto err; + + if (bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio), + BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) + goto err; + + c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, &c->sb); + if (!c->uuids) + goto err; + + c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0); + if (!c->moving_gc_wq) + goto err; + + if (bch_journal_alloc(c)) + goto err; + + if (bch_btree_cache_alloc(c)) + goto err; + + if (bch_open_buckets_alloc(c)) + goto err; + + if (bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages))) goto err; c->congested_read_threshold_us = 2000; @@ -2100,7 +2199,14 @@ found: sysfs_create_link(&c->kobj, &ca->kobj, buf)) goto err; - if (ca->sb.seq > c->sb.seq) { + /* + * A special case is both ca->sb.seq and c->sb.seq are 0, + * such condition happens on a new created cache device whose + * super block is never flushed yet. In this case c->sb.version + * and other members should be updated too, otherwise we will + * have a mistaken super block version in cache set. 
+ */ + if (ca->sb.seq > c->sb.seq || c->sb.seq == 0) { c->sb.version = ca->sb.version; memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16); c->sb.flags = ca->sb.flags; @@ -2138,7 +2244,7 @@ void bch_cache_release(struct kobject *kobj) ca->set->cache[ca->sb.nr_this_dev] = NULL; } - free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); + free_pages((unsigned long) ca->disk_buckets, ilog2(meta_bucket_pages(&ca->sb))); kfree(ca->prio_buckets); vfree(ca->buckets); @@ -2235,7 +2341,7 @@ static int cache_alloc(struct cache *ca) goto err_prio_buckets_alloc; } - ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca); + ca->disk_buckets = alloc_meta_bucket_pages(GFP_KERNEL, &ca->sb); if (!ca->disk_buckets) { err = "ca->disk_buckets alloc failed"; goto err_disk_buckets_alloc; @@ -2782,7 +2888,7 @@ static int __init bcache_init(void) static const struct attribute *files[] = { &ksysfs_register.attr, &ksysfs_register_quiet.attr, -#ifdef CONFIG_BCACHE_ASYNC_REGISTRAION +#ifdef CONFIG_BCACHE_ASYNC_REGISTRATION &ksysfs_register_async.attr, #endif &ksysfs_pendings_cleanup.attr, diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 0dadec5a78f6..ac06c0bc3c0a 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -11,6 +11,7 @@ #include "btree.h" #include "request.h" #include "writeback.h" +#include "features.h" #include <linux/blkdev.h> #include <linux/sort.h> @@ -88,6 +89,9 @@ read_attribute(btree_used_percent); read_attribute(average_key_size); read_attribute(dirty_data); read_attribute(bset_tree_stats); +read_attribute(feature_compat); +read_attribute(feature_ro_compat); +read_attribute(feature_incompat); read_attribute(state); read_attribute(cache_read_races); @@ -779,6 +783,13 @@ SHOW(__bch_cache_set) if (attr == &sysfs_bset_tree_stats) return bch_bset_print_stats(c, buf); + if (attr == &sysfs_feature_compat) + return bch_print_cache_set_feature_compat(c, buf, PAGE_SIZE); + if (attr == &sysfs_feature_ro_compat) + return 
bch_print_cache_set_feature_ro_compat(c, buf, PAGE_SIZE); + if (attr == &sysfs_feature_incompat) + return bch_print_cache_set_feature_incompat(c, buf, PAGE_SIZE); + return 0; } SHOW_LOCKED(bch_cache_set) @@ -987,6 +998,9 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_io_disable, &sysfs_cutoff_writeback, &sysfs_cutoff_writeback_sync, + &sysfs_feature_compat, + &sysfs_feature_ro_compat, + &sysfs_feature_incompat, NULL }; KTYPE(bch_cache_set_internal); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 1cf1e5016cb9..4f4ad6b3d43a 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -459,10 +459,8 @@ static void read_dirty(struct cached_dev *dc) for (i = 0; i < nk; i++) { w = keys[i]; - io = kzalloc(sizeof(struct dirty_io) + - sizeof(struct bio_vec) * - DIV_ROUND_UP(KEY_SIZE(&w->key), - PAGE_SECTORS), + io = kzalloc(struct_size(io, bio.bi_inline_vecs, + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), GFP_KERNEL); if (!io) goto err; @@ -523,15 +521,19 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode, uint64_t offset, int nr_sectors) { struct bcache_device *d = c->devices[inode]; - unsigned int stripe_offset, stripe, sectors_dirty; + unsigned int stripe_offset, sectors_dirty; + int stripe; if (!d) return; + stripe = offset_to_stripe(d, offset); + if (stripe < 0) + return; + if (UUID_FLASH_ONLY(&c->uuids[inode])) atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors); - stripe = offset_to_stripe(d, offset); stripe_offset = offset & (d->stripe_size - 1); while (nr_sectors) { @@ -571,12 +573,12 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k) static void refill_full_stripes(struct cached_dev *dc) { struct keybuf *buf = &dc->writeback_keys; - unsigned int start_stripe, stripe, next_stripe; + unsigned int start_stripe, next_stripe; + int stripe; bool wrapped = false; stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned)); - - if (stripe >= 
dc->disk.nr_stripes) + if (stripe < 0) stripe = 0; start_stripe = stripe; @@ -825,10 +827,8 @@ static int bch_dirty_init_thread(void *arg) struct btree_iter iter; struct bkey *k, *p; int cur_idx, prev_idx, skip_nr; - int i; k = p = NULL; - i = 0; cur_idx = prev_idx = 0; bch_btree_iter_init(&c->root->keys, &iter, NULL); diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index b029843ce5b6..3f1230e22de0 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -52,10 +52,22 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } -static inline unsigned int offset_to_stripe(struct bcache_device *d, +static inline int offset_to_stripe(struct bcache_device *d, uint64_t offset) { do_div(offset, d->stripe_size); + + /* d->nr_stripes is in range [1, INT_MAX] */ + if (unlikely(offset >= d->nr_stripes)) { + pr_err("Invalid stripe %llu (>= nr_stripes %d).\n", + offset, d->nr_stripes); + return -EINVAL; + } + + /* + * Here offset is definitely smaller than INT_MAX, + * return it as int will never overflow. 
+ */ return offset; } @@ -63,7 +75,10 @@ static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, uint64_t offset, unsigned int nr_sectors) { - unsigned int stripe = offset_to_stripe(&dc->disk, offset); + int stripe = offset_to_stripe(&dc->disk, offset); + + if (stripe < 0) + return false; while (1) { if (atomic_read(dc->disk.stripe_sectors_dirty + stripe)) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 6d1565021d74..9c1a86bde658 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -108,7 +108,10 @@ struct dm_bufio_client { int async_write_error; struct list_head client_list; + struct shrinker shrinker; + struct work_struct shrink_work; + atomic_long_t need_shrink; }; /* @@ -1634,8 +1637,7 @@ static unsigned long get_retain_buffers(struct dm_bufio_client *c) return retain_bytes; } -static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, - gfp_t gfp_mask) +static void __scan(struct dm_bufio_client *c) { int l; struct dm_buffer *b, *tmp; @@ -1646,42 +1648,58 @@ static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, for (l = 0; l < LIST_SIZE; l++) { list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) { - if (__try_evict_buffer(b, gfp_mask)) + if (count - freed <= retain_target) + atomic_long_set(&c->need_shrink, 0); + if (!atomic_long_read(&c->need_shrink)) + return; + if (__try_evict_buffer(b, GFP_KERNEL)) { + atomic_long_dec(&c->need_shrink); freed++; - if (!--nr_to_scan || ((count - freed) <= retain_target)) - return freed; + } cond_resched(); } } - return freed; } -static unsigned long -dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +static void shrink_work(struct work_struct *w) +{ + struct dm_bufio_client *c = container_of(w, struct dm_bufio_client, shrink_work); + + dm_bufio_lock(c); + __scan(c); + dm_bufio_unlock(c); +} + +static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { struct 
dm_bufio_client *c; - unsigned long freed; c = container_of(shrink, struct dm_bufio_client, shrinker); - if (sc->gfp_mask & __GFP_FS) - dm_bufio_lock(c); - else if (!dm_bufio_trylock(c)) - return SHRINK_STOP; + atomic_long_add(sc->nr_to_scan, &c->need_shrink); + queue_work(dm_bufio_wq, &c->shrink_work); - freed = __scan(c, sc->nr_to_scan, sc->gfp_mask); - dm_bufio_unlock(c); - return freed; + return sc->nr_to_scan; } -static unsigned long -dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker); unsigned long count = READ_ONCE(c->n_buffers[LIST_CLEAN]) + READ_ONCE(c->n_buffers[LIST_DIRTY]); unsigned long retain_target = get_retain_buffers(c); + unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink); + + if (unlikely(count < retain_target)) + count = 0; + else + count -= retain_target; - return (count < retain_target) ? 
0 : (count - retain_target); + if (unlikely(count < queued_for_cleanup)) + count = 0; + else + count -= queued_for_cleanup; + + return count; } /* @@ -1772,6 +1790,9 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign __free_buffer_wake(b); } + INIT_WORK(&c->shrink_work, shrink_work); + atomic_long_set(&c->need_shrink, 0); + c->shrinker.count_objects = dm_bufio_shrink_count; c->shrinker.scan_objects = dm_bufio_shrink_scan; c->shrinker.seeks = 1; @@ -1817,6 +1838,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) drop_buffers(c); unregister_shrinker(&c->shrinker); + flush_work(&c->shrink_work); mutex_lock(&dm_bufio_clients_lock); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index d3bb355819a4..96c93802ee4d 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -421,8 +421,6 @@ struct cache { struct rw_semaphore quiesce_lock; - struct dm_target_callbacks callbacks; - /* * origin_blocks entries, discarded if set. 
*/ @@ -886,7 +884,7 @@ static void accounted_complete(struct cache *cache, struct bio *bio) static void accounted_request(struct cache *cache, struct bio *bio) { accounted_begin(cache, bio); - generic_make_request(bio); + submit_bio_noacct(bio); } static void issue_op(struct bio *bio, void *context) @@ -1792,7 +1790,7 @@ static bool process_bio(struct cache *cache, struct bio *bio) bool commit_needed; if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) - generic_make_request(bio); + submit_bio_noacct(bio); return commit_needed; } @@ -1858,7 +1856,7 @@ static bool process_discard_bio(struct cache *cache, struct bio *bio) if (cache->features.discard_passdown) { remap_to_origin(cache, bio); - generic_make_request(bio); + submit_bio_noacct(bio); } else bio_endio(bio); @@ -2423,20 +2421,6 @@ static void set_cache_size(struct cache *cache, dm_cblock_t size) cache->cache_size = size; } -static int is_congested(struct dm_dev *dev, int bdi_bits) -{ - struct request_queue *q = bdev_get_queue(dev->bdev); - return bdi_congested(q->backing_dev_info, bdi_bits); -} - -static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) -{ - struct cache *cache = container_of(cb, struct cache, callbacks); - - return is_congested(cache->origin_dev, bdi_bits) || - is_congested(cache->cache_dev, bdi_bits); -} - #define DEFAULT_MIGRATION_THRESHOLD 2048 static int cache_create(struct cache_args *ca, struct cache **result) @@ -2471,9 +2455,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) goto bad; } - cache->callbacks.congested_fn = cache_is_congested; - dm_table_add_target_callbacks(ti->table, &cache->callbacks); - cache->metadata_dev = ca->metadata_dev; cache->origin_dev = ca->origin_dev; cache->cache_dev = ca->cache_dev; diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index 5ce96ddf1ce1..bdb255edc200 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -68,7 +68,6 
@@ struct hash_table_bucket; struct clone { struct dm_target *ti; - struct dm_target_callbacks callbacks; struct dm_dev *metadata_dev; struct dm_dev *dest_dev; @@ -330,7 +329,7 @@ static void submit_bios(struct bio_list *bios) blk_start_plug(&plug); while ((bio = bio_list_pop(bios))) - generic_make_request(bio); + submit_bio_noacct(bio); blk_finish_plug(&plug); } @@ -346,7 +345,7 @@ static void submit_bios(struct bio_list *bios) static void issue_bio(struct clone *clone, struct bio *bio) { if (!bio_triggers_commit(clone, bio)) { - generic_make_request(bio); + submit_bio_noacct(bio); return; } @@ -473,7 +472,7 @@ static void complete_discard_bio(struct clone *clone, struct bio *bio, bool succ bio_region_range(clone, bio, &rs, &nr_regions); trim_bio(bio, region_to_sector(clone, rs), nr_regions << clone->region_shift); - generic_make_request(bio); + submit_bio_noacct(bio); } else bio_endio(bio); } @@ -865,7 +864,7 @@ static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio bio->bi_private = hd; atomic_inc(&hd->clone->hydrations_in_flight); - generic_make_request(bio); + submit_bio_noacct(bio); } /* @@ -1281,7 +1280,7 @@ static void process_deferred_flush_bios(struct clone *clone) */ bio_endio(bio); } else { - generic_make_request(bio); + submit_bio_noacct(bio); } } } @@ -1518,18 +1517,6 @@ error: DMEMIT("Error"); } -static int clone_is_congested(struct dm_target_callbacks *cb, int bdi_bits) -{ - struct request_queue *dest_q, *source_q; - struct clone *clone = container_of(cb, struct clone, callbacks); - - source_q = bdev_get_queue(clone->source_dev->bdev); - dest_q = bdev_get_queue(clone->dest_dev->bdev); - - return (bdi_congested(dest_q->backing_dev_info, bdi_bits) | - bdi_congested(source_q->backing_dev_info, bdi_bits)); -} - static sector_t get_dev_size(struct dm_dev *dev) { return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; @@ -1930,8 +1917,6 @@ static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto 
out_with_mempool; mutex_init(&clone->commit_lock); - clone->callbacks.congested_fn = clone_is_congested; - dm_table_add_target_callbacks(ti->table, &clone->callbacks); /* Enable flushes */ ti->num_flush_bios = 1; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 000ddfab5ba0..148960721254 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -69,6 +69,7 @@ struct dm_crypt_io { u8 *integrity_metadata; bool integrity_metadata_from_pool; struct work_struct work; + struct tasklet_struct tasklet; struct convert_context ctx; @@ -127,7 +128,9 @@ struct iv_elephant_private { * and encrypts / decrypts at the same time. */ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, - DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD }; + DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD, + DM_CRYPT_NO_READ_WORKQUEUE, DM_CRYPT_NO_WRITE_WORKQUEUE, + DM_CRYPT_WRITE_INLINE }; enum cipher_flags { CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cihper */ @@ -300,7 +303,7 @@ static struct crypto_aead *any_tfm_aead(struct crypt_config *cc) * elephant: The extended version of eboiv with additional Elephant diffuser * used with Bitlocker CBC mode. 
* This mode was used in older Windows systems - * http://download.microsoft.com/download/0/2/3/0238acaf-d3bf-4a6d-b3d6-0a0be4bbb36e/bitlockercipher200608.pdf + * https://download.microsoft.com/download/0/2/3/0238acaf-d3bf-4a6d-b3d6-0a0be4bbb36e/bitlockercipher200608.pdf */ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, @@ -407,7 +410,7 @@ static void crypt_iv_lmk_dtr(struct crypt_config *cc) crypto_free_shash(lmk->hash_tfm); lmk->hash_tfm = NULL; - kzfree(lmk->seed); + kfree_sensitive(lmk->seed); lmk->seed = NULL; } @@ -558,9 +561,9 @@ static void crypt_iv_tcw_dtr(struct crypt_config *cc) { struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; - kzfree(tcw->iv_seed); + kfree_sensitive(tcw->iv_seed); tcw->iv_seed = NULL; - kzfree(tcw->whitening); + kfree_sensitive(tcw->whitening); tcw->whitening = NULL; if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm)) @@ -994,8 +997,8 @@ static int crypt_iv_elephant(struct crypt_config *cc, struct dm_crypt_request *d kunmap_atomic(data); out: - kzfree(ks); - kzfree(es); + kfree_sensitive(ks); + kfree_sensitive(es); skcipher_request_free(req); return r; } @@ -1523,7 +1526,7 @@ static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_ * Encrypt / decrypt data from one bio to another one (can be the same one) */ static blk_status_t crypt_convert(struct crypt_config *cc, - struct convert_context *ctx) + struct convert_context *ctx, bool atomic) { unsigned int tag_offset = 0; unsigned int sector_step = cc->sector_size >> SECTOR_SHIFT; @@ -1566,7 +1569,8 @@ static blk_status_t crypt_convert(struct crypt_config *cc, atomic_dec(&ctx->cc_pending); ctx->cc_sector += sector_step; tag_offset++; - cond_resched(); + if (!atomic) + cond_resched(); continue; /* * There was a data integrity error. 
@@ -1789,7 +1793,7 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) return 1; } - generic_make_request(clone); + submit_bio_noacct(clone); return 0; } @@ -1815,7 +1819,7 @@ static void kcryptd_io_write(struct dm_crypt_io *io) { struct bio *clone = io->ctx.bio_out; - generic_make_request(clone); + submit_bio_noacct(clone); } #define crypt_io_from_node(node) rb_entry((node), struct dm_crypt_io, rb_node) @@ -1892,8 +1896,9 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) clone->bi_iter.bi_sector = cc->start + io->sector; - if (likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) { - generic_make_request(clone); + if ((likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) || + test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)) { + submit_bio_noacct(clone); return; } @@ -1915,9 +1920,32 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) spin_unlock_irqrestore(&cc->write_thread_lock, flags); } +static bool kcryptd_crypt_write_inline(struct crypt_config *cc, + struct convert_context *ctx) + +{ + if (!test_bit(DM_CRYPT_WRITE_INLINE, &cc->flags)) + return false; + + /* + * Note: zone append writes (REQ_OP_ZONE_APPEND) do not have ordering + * constraints so they do not need to be issued inline by + * kcryptd_crypt_write_convert(). + */ + switch (bio_op(ctx->bio_in)) { + case REQ_OP_WRITE: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE_ZEROES: + return true; + default: + return false; + } +} + static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) { struct crypt_config *cc = io->cc; + struct convert_context *ctx = &io->ctx; struct bio *clone; int crypt_finished; sector_t sector = io->sector; @@ -1927,7 +1955,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) * Prevent io from disappearing until this function completes. 
*/ crypt_inc_pending(io); - crypt_convert_init(cc, &io->ctx, NULL, io->base_bio, sector); + crypt_convert_init(cc, ctx, NULL, io->base_bio, sector); clone = crypt_alloc_buffer(io, io->base_bio->bi_iter.bi_size); if (unlikely(!clone)) { @@ -1941,10 +1969,16 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) sector += bio_sectors(clone); crypt_inc_pending(io); - r = crypt_convert(cc, &io->ctx); + r = crypt_convert(cc, ctx, + test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)); if (r) io->error = r; - crypt_finished = atomic_dec_and_test(&io->ctx.cc_pending); + crypt_finished = atomic_dec_and_test(&ctx->cc_pending); + if (!crypt_finished && kcryptd_crypt_write_inline(cc, ctx)) { + /* Wait for completion signaled by kcryptd_async_done() */ + wait_for_completion(&ctx->restart); + crypt_finished = 1; + } /* Encryption was already finished, submit io now */ if (crypt_finished) { @@ -1971,7 +2005,8 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) crypt_convert_init(cc, &io->ctx, io->base_bio, io->base_bio, io->sector); - r = crypt_convert(cc, &io->ctx); + r = crypt_convert(cc, &io->ctx, + test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags)); if (r) io->error = r; @@ -2015,10 +2050,21 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, if (!atomic_dec_and_test(&ctx->cc_pending)) return; - if (bio_data_dir(io->base_bio) == READ) + /* + * The request is fully completed: for inline writes, let + * kcryptd_crypt_write_convert() do the IO submission. 
+ */ + if (bio_data_dir(io->base_bio) == READ) { kcryptd_crypt_read_done(io); - else - kcryptd_crypt_write_io_submit(io, 1); + return; + } + + if (kcryptd_crypt_write_inline(cc, ctx)) { + complete(&ctx->restart); + return; + } + + kcryptd_crypt_write_io_submit(io, 1); } static void kcryptd_crypt(struct work_struct *work) @@ -2031,10 +2077,28 @@ static void kcryptd_crypt(struct work_struct *work) kcryptd_crypt_write_convert(io); } +static void kcryptd_crypt_tasklet(unsigned long work) +{ + kcryptd_crypt((struct work_struct *)work); +} + static void kcryptd_queue_crypt(struct dm_crypt_io *io) { struct crypt_config *cc = io->cc; + if ((bio_data_dir(io->base_bio) == READ && test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags)) || + (bio_data_dir(io->base_bio) == WRITE && test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags))) { + if (in_irq()) { + /* Crypto API's skcipher_walk_first() refuses to work in hard IRQ context */ + tasklet_init(&io->tasklet, kcryptd_crypt_tasklet, (unsigned long)&io->work); + tasklet_schedule(&io->tasklet); + return; + } + + kcryptd_crypt(&io->work); + return; + } + INIT_WORK(&io->work, kcryptd_crypt); queue_work(cc->crypt_queue, &io->work); } @@ -2294,7 +2358,7 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string key = request_key(type, key_desc + 1, NULL); if (IS_ERR(key)) { - kzfree(new_key_string); + kfree_sensitive(new_key_string); return PTR_ERR(key); } @@ -2304,7 +2368,7 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string if (ret < 0) { up_read(&key->sem); key_put(key); - kzfree(new_key_string); + kfree_sensitive(new_key_string); return ret; } @@ -2318,10 +2382,10 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string if (!ret) { set_bit(DM_CRYPT_KEY_VALID, &cc->flags); - kzfree(cc->key_string); + kfree_sensitive(cc->key_string); cc->key_string = NULL; } else - kzfree(new_key_string); + kfree_sensitive(new_key_string); return ret; } @@ 
-2382,7 +2446,7 @@ static int crypt_set_key(struct crypt_config *cc, char *key) clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); /* wipe references to any kernel keyring key */ - kzfree(cc->key_string); + kfree_sensitive(cc->key_string); cc->key_string = NULL; /* Decode key from its hex representation. */ @@ -2414,7 +2478,7 @@ static int crypt_wipe_key(struct crypt_config *cc) return r; } - kzfree(cc->key_string); + kfree_sensitive(cc->key_string); cc->key_string = NULL; r = crypt_setkey(cc); memset(&cc->key, 0, cc->key_size * sizeof(u8)); @@ -2493,15 +2557,15 @@ static void crypt_dtr(struct dm_target *ti) if (cc->dev) dm_put_device(ti, cc->dev); - kzfree(cc->cipher_string); - kzfree(cc->key_string); - kzfree(cc->cipher_auth); - kzfree(cc->authenc_key); + kfree_sensitive(cc->cipher_string); + kfree_sensitive(cc->key_string); + kfree_sensitive(cc->cipher_auth); + kfree_sensitive(cc->authenc_key); mutex_destroy(&cc->bio_alloc_lock); /* Must zero key material before freeing */ - kzfree(cc); + kfree_sensitive(cc); spin_lock(&dm_crypt_clients_lock); WARN_ON(!dm_crypt_clients_n); @@ -2838,7 +2902,7 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar struct crypt_config *cc = ti->private; struct dm_arg_set as; static const struct dm_arg _args[] = { - {0, 6, "Invalid number of feature args"}, + {0, 8, "Invalid number of feature args"}, }; unsigned int opt_params, val; const char *opt_string, *sval; @@ -2868,6 +2932,10 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar else if (!strcasecmp(opt_string, "submit_from_crypt_cpus")) set_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); + else if (!strcasecmp(opt_string, "no_read_workqueue")) + set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); + else if (!strcasecmp(opt_string, "no_write_workqueue")) + set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); else if (sscanf(opt_string, "integrity:%u:", &val) == 1) { if (val == 0 || val > MAX_TAG_SIZE) { ti->error = "Invalid integrity 
arguments"; @@ -2908,6 +2976,21 @@ static int crypt_ctr_optional(struct dm_target *ti, unsigned int argc, char **ar return 0; } +#ifdef CONFIG_BLK_DEV_ZONED + +static int crypt_report_zones(struct dm_target *ti, + struct dm_report_zones_args *args, unsigned int nr_zones) +{ + struct crypt_config *cc = ti->private; + sector_t sector = cc->start + dm_target_offset(ti, args->next_sector); + + args->start = cc->start; + return blkdev_report_zones(cc->dev->bdev, sector, nr_zones, + dm_report_zones_cb, args); +} + +#endif + /* * Construct an encryption mapping: * <cipher> [<key>|:<key_size>:<user|logon>:<key_description>] <iv_offset> <dev_path> <start> @@ -3041,6 +3124,16 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } cc->start = tmpll; + /* + * For zoned block devices, we need to preserve the issuer write + * ordering. To do so, disable write workqueues and force inline + * encryption completion. + */ + if (bdev_is_zoned(cc->dev->bdev)) { + set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); + set_bit(DM_CRYPT_WRITE_INLINE, &cc->flags); + } + if (crypt_integrity_aead(cc) || cc->integrity_iv_size) { ret = crypt_integrity_ctr(cc, ti); if (ret) @@ -3196,6 +3289,8 @@ static void crypt_status(struct dm_target *ti, status_type_t type, num_feature_args += !!ti->num_discard_bios; num_feature_args += test_bit(DM_CRYPT_SAME_CPU, &cc->flags); num_feature_args += test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags); + num_feature_args += test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); + num_feature_args += test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); num_feature_args += cc->sector_size != (1 << SECTOR_SHIFT); num_feature_args += test_bit(CRYPT_IV_LARGE_SECTORS, &cc->cipher_flags); if (cc->on_disk_tag_size) @@ -3208,6 +3303,10 @@ static void crypt_status(struct dm_target *ti, status_type_t type, DMEMIT(" same_cpu_crypt"); if (test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) DMEMIT(" submit_from_crypt_cpus"); + if (test_bit(DM_CRYPT_NO_READ_WORKQUEUE, 
&cc->flags)) + DMEMIT(" no_read_workqueue"); + if (test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)) + DMEMIT(" no_write_workqueue"); if (cc->on_disk_tag_size) DMEMIT(" integrity:%u:%s", cc->on_disk_tag_size, cc->cipher_auth); if (cc->sector_size != (1 << SECTOR_SHIFT)) @@ -3320,10 +3419,14 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type crypt_target = { .name = "crypt", - .version = {1, 21, 0}, + .version = {1, 22, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, +#ifdef CONFIG_BLK_DEV_ZONED + .features = DM_TARGET_ZONED_HM, + .report_zones = crypt_report_zones, +#endif .map = crypt_map, .status = crypt_status, .postsuspend = crypt_postsuspend, diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index f496213f8b67..2628a832787b 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -72,7 +72,7 @@ static void flush_bios(struct bio *bio) while (bio) { n = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + submit_bio_noacct(bio); bio = n; } } diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index ff03b90072c5..072ea913cebc 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -138,20 +138,22 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block, return 0; } -static int dust_query_block(struct dust_device *dd, unsigned long long block) +static int dust_query_block(struct dust_device *dd, unsigned long long block, char *result, + unsigned int maxlen, unsigned int *sz_ptr) { struct badblock *bblock; unsigned long flags; + unsigned int sz = *sz_ptr; spin_lock_irqsave(&dd->dust_lock, flags); bblock = dust_rb_search(&dd->badblocklist, block); if (bblock != NULL) - DMINFO("%s: block %llu found in badblocklist", __func__, block); + DMEMIT("%s: block %llu found in badblocklist", __func__, block); else - DMINFO("%s: block %llu not found in badblocklist", __func__, block); + DMEMIT("%s: block %llu not found in badblocklist", 
__func__, block); spin_unlock_irqrestore(&dd->dust_lock, flags); - return 0; + return 1; } static int __dust_map_read(struct dust_device *dd, sector_t thisblock) @@ -259,11 +261,13 @@ static bool __dust_clear_badblocks(struct rb_root *tree, return true; } -static int dust_clear_badblocks(struct dust_device *dd) +static int dust_clear_badblocks(struct dust_device *dd, char *result, unsigned int maxlen, + unsigned int *sz_ptr) { unsigned long flags; struct rb_root badblocklist; unsigned long long badblock_count; + unsigned int sz = *sz_ptr; spin_lock_irqsave(&dd->dust_lock, flags); badblocklist = dd->badblocklist; @@ -273,11 +277,36 @@ static int dust_clear_badblocks(struct dust_device *dd) spin_unlock_irqrestore(&dd->dust_lock, flags); if (!__dust_clear_badblocks(&badblocklist, badblock_count)) - DMINFO("%s: no badblocks found", __func__); + DMEMIT("%s: no badblocks found", __func__); else - DMINFO("%s: badblocks cleared", __func__); + DMEMIT("%s: badblocks cleared", __func__); - return 0; + return 1; +} + +static int dust_list_badblocks(struct dust_device *dd, char *result, unsigned int maxlen, + unsigned int *sz_ptr) +{ + unsigned long flags; + struct rb_root badblocklist; + struct rb_node *node; + struct badblock *bblk; + unsigned int sz = *sz_ptr; + unsigned long long num = 0; + + spin_lock_irqsave(&dd->dust_lock, flags); + badblocklist = dd->badblocklist; + for (node = rb_first(&badblocklist); node; node = rb_next(node)) { + bblk = rb_entry(node, struct badblock, node); + DMEMIT("%llu\n", bblk->bb); + num++; + } + + spin_unlock_irqrestore(&dd->dust_lock, flags); + if (!num) + DMEMIT("No blocks in badblocklist"); + + return 1; } /* @@ -383,7 +412,7 @@ static void dust_dtr(struct dm_target *ti) } static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, - char *result_buf, unsigned int maxlen) + char *result, unsigned int maxlen) { struct dust_device *dd = ti->private; sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT; @@ 
-393,6 +422,7 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, unsigned char wr_fail_cnt; unsigned int tmp_ui; unsigned long flags; + unsigned int sz = 0; char dummy; if (argc == 1) { @@ -410,18 +440,20 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, r = 0; } else if (!strcasecmp(argv[0], "countbadblocks")) { spin_lock_irqsave(&dd->dust_lock, flags); - DMINFO("countbadblocks: %llu badblock(s) found", + DMEMIT("countbadblocks: %llu badblock(s) found", dd->badblock_count); spin_unlock_irqrestore(&dd->dust_lock, flags); - r = 0; + r = 1; } else if (!strcasecmp(argv[0], "clearbadblocks")) { - r = dust_clear_badblocks(dd); + r = dust_clear_badblocks(dd, result, maxlen, &sz); } else if (!strcasecmp(argv[0], "quiet")) { if (!dd->quiet_mode) dd->quiet_mode = true; else dd->quiet_mode = false; r = 0; + } else if (!strcasecmp(argv[0], "listbadblocks")) { + r = dust_list_badblocks(dd, result, maxlen, &sz); } else { invalid_msg = true; } @@ -441,7 +473,7 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, else if (!strcasecmp(argv[0], "removebadblock")) r = dust_remove_block(dd, block); else if (!strcasecmp(argv[0], "queryblock")) - r = dust_query_block(dd, block); + r = dust_query_block(dd, block, result, maxlen, &sz); else invalid_msg = true; diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index 44451276f128..cb85610527c2 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -363,7 +363,7 @@ static int ebs_map(struct dm_target *ti, struct bio *bio) bio_set_dev(bio, ec->dev->bdev); bio->bi_iter.bi_sector = ec->start + dm_target_offset(ti, bio->bi_iter.bi_sector); - if (unlikely(bio->bi_opf & REQ_OP_FLUSH)) + if (unlikely(bio_op(bio) == REQ_OP_FLUSH)) return DM_MAPIO_REMAPPED; /* * Only queue for bufio processing in case of partial or overlapping buffers diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 
bdb84b8e7162..b24e3839bb3a 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1137,7 +1137,6 @@ static int metadata_get_stats(struct era_metadata *md, void *ptr) struct era { struct dm_target *ti; - struct dm_target_callbacks callbacks; struct dm_dev *metadata_dev; struct dm_dev *origin_dev; @@ -1265,7 +1264,7 @@ static void process_deferred_bios(struct era *era) bio_io_error(bio); else while ((bio = bio_list_pop(&marked_bios))) - generic_make_request(bio); + submit_bio_noacct(bio); } static void process_rpc_calls(struct era *era) @@ -1375,18 +1374,6 @@ static void stop_worker(struct era *era) /*---------------------------------------------------------------- * Target methods *--------------------------------------------------------------*/ -static int dev_is_congested(struct dm_dev *dev, int bdi_bits) -{ - struct request_queue *q = bdev_get_queue(dev->bdev); - return bdi_congested(q->backing_dev_info, bdi_bits); -} - -static int era_is_congested(struct dm_target_callbacks *cb, int bdi_bits) -{ - struct era *era = container_of(cb, struct era, callbacks); - return dev_is_congested(era->origin_dev, bdi_bits); -} - static void era_destroy(struct era *era) { if (era->md) @@ -1514,8 +1501,6 @@ static int era_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->flush_supported = true; ti->num_discard_bios = 1; - era->callbacks.congested_fn = era_is_congested; - dm_table_add_target_callbacks(ti->table, &era->callbacks); return 0; } diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c index b869316d3722..b0c45c6ebe0b 100644 --- a/drivers/md/dm-init.c +++ b/drivers/md/dm-init.c @@ -36,7 +36,7 @@ struct dm_device { struct list_head list; }; -const char * const dm_allowed_targets[] __initconst = { +static const char * const dm_allowed_targets[] __initconst = { "crypt", "delay", "linear", diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index a83a1de1e03f..8c8d940e532e 100644 --- a/drivers/md/dm-integrity.c +++ 
b/drivers/md/dm-integrity.c @@ -2115,12 +2115,12 @@ offload_to_thread: dio->in_flight = (atomic_t)ATOMIC_INIT(1); dio->completion = NULL; - generic_make_request(bio); + submit_bio_noacct(bio); return; } - generic_make_request(bio); + submit_bio_noacct(bio); if (need_sync_io) { wait_for_completion_io(&read_comp); @@ -3405,8 +3405,8 @@ static struct scatterlist **dm_integrity_alloc_journal_scatterlist(struct dm_int static void free_alg(struct alg_spec *a) { - kzfree(a->alg_string); - kzfree(a->key); + kfree_sensitive(a->alg_string); + kfree_sensitive(a->key); memset(a, 0, sizeof *a); } @@ -4337,7 +4337,7 @@ static void dm_integrity_dtr(struct dm_target *ti) for (i = 0; i < ic->journal_sections; i++) { struct skcipher_request *req = ic->sk_requests[i]; if (req) { - kzfree(req->iv); + kfree_sensitive(req->iv); skcipher_request_free(req); } } diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 81ffc59d05c9..4312007d2d34 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -306,7 +306,7 @@ static void do_region(int op, int op_flags, unsigned region, struct request_queue *q = bdev_get_queue(where->bdev); unsigned short logical_block_size = queue_logical_block_size(q); sector_t num_sectors; - unsigned int uninitialized_var(special_cmd_max_sectors); + unsigned int special_cmd_max_sectors; /* * Reject unsupported discard and write same requests. 
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 489935d5f22d..28122e850ea1 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1168,7 +1168,7 @@ static void retrieve_status(struct dm_table *table, spec->sector_start = ti->begin; spec->length = ti->len; strncpy(spec->target_type, ti->type->name, - sizeof(spec->target_type)); + sizeof(spec->target_type) - 1); outptr += sizeof(struct dm_target_spec); remaining = len - (outptr - outbuf); @@ -1844,7 +1844,7 @@ static int ctl_ioctl(struct file *file, uint command, struct dm_ioctl __user *us int ioctl_flags; int param_flags; unsigned int cmd; - struct dm_ioctl *uninitialized_var(param); + struct dm_ioctl *param; ioctl_fn fn = NULL; size_t input_param_size; struct dm_ioctl param_kernel; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 78cff42d987e..53645a6f474c 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -128,6 +128,20 @@ static void queue_if_no_path_timeout_work(struct timer_list *t); #define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */ #define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */ +static bool mpath_double_check_test_bit(int MPATHF_bit, struct multipath *m) +{ + bool r = test_bit(MPATHF_bit, &m->flags); + + if (r) { + unsigned long flags; + spin_lock_irqsave(&m->lock, flags); + r = test_bit(MPATHF_bit, &m->flags); + spin_unlock_irqrestore(&m->lock, flags); + } + + return r; +} + /*----------------------------------------------- * Allocation routines *-----------------------------------------------*/ @@ -335,6 +349,8 @@ static int pg_init_all_paths(struct multipath *m) static void __switch_pg(struct multipath *m, struct priority_group *pg) { + lockdep_assert_held(&m->lock); + m->current_pg = pg; /* Must we initialise the PG first, and queue I/O till it's ready? 
*/ @@ -382,7 +398,9 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) unsigned bypassed = 1; if (!atomic_read(&m->nr_valid_paths)) { + spin_lock_irqsave(&m->lock, flags); clear_bit(MPATHF_QUEUE_IO, &m->flags); + spin_unlock_irqrestore(&m->lock, flags); goto failed; } @@ -422,8 +440,11 @@ check_current_pg: continue; pgpath = choose_path_in_pg(m, pg, nr_bytes); if (!IS_ERR_OR_NULL(pgpath)) { - if (!bypassed) + if (!bypassed) { + spin_lock_irqsave(&m->lock, flags); set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags); + spin_unlock_irqrestore(&m->lock, flags); + } return pgpath; } } @@ -465,7 +486,14 @@ static bool __must_push_back(struct multipath *m) static bool must_push_back_rq(struct multipath *m) { - return test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || __must_push_back(m); + unsigned long flags; + bool ret; + + spin_lock_irqsave(&m->lock, flags); + ret = (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || __must_push_back(m)); + spin_unlock_irqrestore(&m->lock, flags); + + return ret; } /* @@ -485,7 +513,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, /* Do we need to select a new pgpath? 
*/ pgpath = READ_ONCE(m->current_pgpath); - if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) + if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) pgpath = choose_pgpath(m, nr_bytes); if (!pgpath) { @@ -493,8 +521,8 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, return DM_MAPIO_DELAY_REQUEUE; dm_report_EIO(m); /* Failed */ return DM_MAPIO_KILL; - } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || - test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { + } else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) || + mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) { pg_init_all_paths(m); return DM_MAPIO_DELAY_REQUEUE; } @@ -560,33 +588,45 @@ static void multipath_release_clone(struct request *clone, * Map cloned bios (bio-based multipath) */ +static void __multipath_queue_bio(struct multipath *m, struct bio *bio) +{ + /* Queue for the daemon to resubmit */ + bio_list_add(&m->queued_bios, bio); + if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) + queue_work(kmultipathd, &m->process_queued_bios); +} + +static void multipath_queue_bio(struct multipath *m, struct bio *bio) +{ + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + __multipath_queue_bio(m, bio); + spin_unlock_irqrestore(&m->lock, flags); +} + static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) { struct pgpath *pgpath; unsigned long flags; - bool queue_io; /* Do we need to select a new pgpath? */ pgpath = READ_ONCE(m->current_pgpath); - if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) + if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) pgpath = choose_pgpath(m, bio->bi_iter.bi_size); - /* MPATHF_QUEUE_IO might have been cleared by choose_pgpath. 
*/ - queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); - - if ((pgpath && queue_io) || - (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) { - /* Queue for the daemon to resubmit */ + if (!pgpath) { spin_lock_irqsave(&m->lock, flags); - bio_list_add(&m->queued_bios, bio); + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + __multipath_queue_bio(m, bio); + pgpath = ERR_PTR(-EAGAIN); + } spin_unlock_irqrestore(&m->lock, flags); - /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */ - if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) - pg_init_all_paths(m); - else if (!queue_io) - queue_work(kmultipathd, &m->process_queued_bios); - + } else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) || + mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) { + multipath_queue_bio(m, bio); + pg_init_all_paths(m); return ERR_PTR(-EAGAIN); } @@ -677,7 +717,7 @@ static void process_queued_bios(struct work_struct *work) bio_endio(bio); break; case DM_MAPIO_REMAPPED: - generic_make_request(bio); + submit_bio_noacct(bio); break; case DM_MAPIO_SUBMITTED: break; @@ -835,7 +875,7 @@ static int setup_scsi_dh(struct block_device *bdev, struct multipath *m, struct request_queue *q = bdev_get_queue(bdev); int r; - if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) { + if (mpath_double_check_test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, m)) { retain: if (*attached_handler_name) { /* @@ -1614,7 +1654,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, if (pgpath) fail_path(pgpath); - if (atomic_read(&m->nr_valid_paths) == 0 && + if (!atomic_read(&m->nr_valid_paths) && !must_push_back_rq(m)) { if (error == BLK_STS_IOERR) dm_report_EIO(m); @@ -1649,23 +1689,22 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, if (pgpath) fail_path(pgpath); - if (atomic_read(&m->nr_valid_paths) == 0 && - !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { - if (__must_push_back(m)) { - r = DM_ENDIO_REQUEUE; - } else { - 
dm_report_EIO(m); - *error = BLK_STS_IOERR; + if (!atomic_read(&m->nr_valid_paths)) { + spin_lock_irqsave(&m->lock, flags); + if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + if (__must_push_back(m)) { + r = DM_ENDIO_REQUEUE; + } else { + dm_report_EIO(m); + *error = BLK_STS_IOERR; + } + spin_unlock_irqrestore(&m->lock, flags); + goto done; } - goto done; + spin_unlock_irqrestore(&m->lock, flags); } - spin_lock_irqsave(&m->lock, flags); - bio_list_add(&m->queued_bios, clone); - spin_unlock_irqrestore(&m->lock, flags); - if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) - queue_work(kmultipathd, &m->process_queued_bios); - + multipath_queue_bio(m, clone); r = DM_ENDIO_INCOMPLETE; done: if (pgpath) { @@ -1937,16 +1976,17 @@ static int multipath_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { struct multipath *m = ti->private; - struct pgpath *current_pgpath; + struct pgpath *pgpath; + unsigned long flags; int r; - current_pgpath = READ_ONCE(m->current_pgpath); - if (!current_pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) - current_pgpath = choose_pgpath(m, 0); + pgpath = READ_ONCE(m->current_pgpath); + if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) + pgpath = choose_pgpath(m, 0); - if (current_pgpath) { - if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) { - *bdev = current_pgpath->path.dev->bdev; + if (pgpath) { + if (!mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) { + *bdev = pgpath->path.dev->bdev; r = 0; } else { /* pg_init has not started or completed */ @@ -1954,10 +1994,11 @@ static int multipath_prepare_ioctl(struct dm_target *ti, } } else { /* No path is available */ + r = -EIO; + spin_lock_irqsave(&m->lock, flags); if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) r = -ENOTCONN; - else - r = -EIO; + spin_unlock_irqrestore(&m->lock, flags); } if (r == -ENOTCONN) { @@ -1965,8 +2006,10 @@ static int multipath_prepare_ioctl(struct dm_target *ti, /* Path status changed, redo selection */ (void) choose_pgpath(m, 0); } + 
spin_lock_irqsave(&m->lock, flags); if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) - pg_init_all_paths(m); + (void) __pg_init_all_paths(m); + spin_unlock_irqrestore(&m->lock, flags); dm_table_run_md_queue_async(m->ti->table); process_queued_io_list(m); } @@ -2026,8 +2069,15 @@ static int multipath_busy(struct dm_target *ti) return true; /* no paths available, for blk-mq: rely on IO mapping to delay requeue */ - if (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) - return (m->queue_mode != DM_TYPE_REQUEST_BASED); + if (!atomic_read(&m->nr_valid_paths)) { + unsigned long flags; + spin_lock_irqsave(&m->lock, flags); + if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + spin_unlock_irqrestore(&m->lock, flags); + return (m->queue_mode != DM_TYPE_REQUEST_BASED); + } + spin_unlock_irqrestore(&m->lock, flags); + } /* Guess which priority_group will be used at next mapping time */ pg = READ_ONCE(m->current_pg); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 10e8b2fe787b..8d2b835d7a10 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -242,7 +242,6 @@ struct raid_set { struct mddev md; struct raid_type *raid_type; - struct dm_target_callbacks callbacks; sector_t array_sectors; sector_t dev_sectors; @@ -1705,13 +1704,6 @@ static void do_table_event(struct work_struct *ws) dm_table_event(rs->ti->table); } -static int raid_is_congested(struct dm_target_callbacks *cb, int bits) -{ - struct raid_set *rs = container_of(cb, struct raid_set, callbacks); - - return mddev_congested(&rs->md, bits); -} - /* * Make sure a valid takover (level switch) is being requested on @rs * @@ -2345,8 +2337,6 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) if (new_devs == rs->raid_disks || !rebuilds) { /* Replace a broken device */ - if (new_devs == 1 && !rs->delta_disks) - ; if (new_devs == rs->raid_disks) { DMINFO("Superblocks created for new raid set"); set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); @@ 
-3248,9 +3238,6 @@ size_check: goto bad_md_start; } - rs->callbacks.congested_fn = raid_is_congested; - dm_table_add_target_callbacks(ti->table, &rs->callbacks); - /* If raid4/5/6 journal mode explicitly requested (only possible with journal dev) -> set it */ if (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags)) { r = r5c_journal_mode_set(&rs->md, rs->journal_dev.mode); @@ -3310,7 +3297,6 @@ static void raid_dtr(struct dm_target *ti) { struct raid_set *rs = ti->private; - list_del_init(&rs->callbacks.list); md_stop(&rs->md); raid_set_free(rs); } diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 2f655d9f4200..fa09bc4e4c54 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -779,7 +779,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) wakeup_mirrord(ms); } else { map_bio(get_default_mirror(ms), bio); - generic_make_request(bio); + submit_bio_noacct(bio); } } } diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 85e0daabad49..6d743ff6a314 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -70,9 +70,6 @@ void dm_start_queue(struct request_queue *q) void dm_stop_queue(struct request_queue *q) { - if (blk_mq_queue_stopped(q)) - return; - blk_mq_quiesce_queue(q); } @@ -284,7 +281,8 @@ static void dm_complete_request(struct request *rq, blk_status_t error) struct dm_rq_target_io *tio = tio_from_request(rq); tio->error = error; - blk_mq_complete_request(rq); + if (likely(!blk_should_fake_timeout(rq->q))) + blk_mq_complete_request(rq); } /* diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 963d3774c93e..63fab7c769be 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -252,7 +252,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int op, /* * Issue the synchronous I/O from a different thread - * to avoid generic_make_request recursion. + * to avoid submit_bio_noacct recursion. 
*/ INIT_WORK_ONSTACK(&req.work, do_metadata); queue_work(ps->metadata_wq, &req.work); @@ -613,7 +613,7 @@ static int persistent_read_metadata(struct dm_exception_store *store, chunk_t old, chunk_t new), void *callback_context) { - int r, uninitialized_var(new_snapshot); + int r, new_snapshot; struct pstore *ps = get_info(store); /* diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 6b11a266299f..4668b2cd98f4 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1568,7 +1568,7 @@ static void flush_bios(struct bio *bio) while (bio) { n = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + submit_bio_noacct(bio); bio = n; } } @@ -1588,7 +1588,7 @@ static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio) bio->bi_next = NULL; r = do_origin(s->origin, bio, false); if (r == DM_MAPIO_REMAPPED) - generic_make_request(bio); + submit_bio_noacct(bio); bio = n; } } @@ -1829,7 +1829,7 @@ static void start_full_bio(struct dm_snap_pending_exception *pe, bio->bi_end_io = full_bio_end_io; bio->bi_private = callback_data; - generic_make_request(bio); + submit_bio_noacct(bio); } static struct dm_snap_pending_exception * diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 8277b959e00b..5edc3079e7c1 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -64,8 +64,6 @@ struct dm_table { void *event_context; struct dm_md_mempools *mempools; - - struct list_head target_callbacks; }; /* @@ -190,7 +188,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode, return -ENOMEM; INIT_LIST_HEAD(&t->devices); - INIT_LIST_HEAD(&t->target_callbacks); if (!num_targets) num_targets = KEYS_PER_NODE; @@ -361,7 +358,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, * This upgrades the mode on an already open dm_dev, being * careful to leave things as they were if we fail to reopen the * device and not to touch the existing bdev field in case - * it is accessed concurrently inside 
dm_table_any_congested(). + * it is accessed concurrently. */ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, struct mapped_device *md) @@ -461,7 +458,8 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, return 0; } - if (bdev_stack_limits(limits, bdev, start) < 0) + if (blk_stack_limits(limits, &q->limits, + get_start_sect(bdev) + start) < 0) DMWARN("%s: adding target device %s caused an alignment inconsistency: " "physical_block_size=%u, logical_block_size=%u, " "alignment_offset=%u, start=%llu", @@ -470,9 +468,6 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, q->limits.logical_block_size, q->limits.alignment_offset, (unsigned long long) start << SECTOR_SHIFT); - - limits->zoned = blk_queue_zoned_model(q); - return 0; } @@ -642,7 +637,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table, */ unsigned short remaining = 0; - struct dm_target *uninitialized_var(ti); + struct dm_target *ti; struct queue_limits ti_limits; unsigned i; @@ -1531,22 +1526,6 @@ combine_limits: dm_device_name(table->md), (unsigned long long) ti->begin, (unsigned long long) ti->len); - - /* - * FIXME: this should likely be moved to blk_stack_limits(), would - * also eliminate limits->zoned stacking hack in dm_set_device_limits() - */ - if (limits->zoned == BLK_ZONED_NONE && ti_limits.zoned != BLK_ZONED_NONE) { - /* - * By default, the stacked limits zoned model is set to - * BLK_ZONED_NONE in blk_set_stacking_limits(). Update - * this model using the first target model reported - * that is not BLK_ZONED_NONE. This will be either the - * first target device zoned model or the model reported - * by the target .io_hints. 
- */ - limits->zoned = ti_limits.zoned; - } } /* @@ -2052,38 +2031,6 @@ int dm_table_resume_targets(struct dm_table *t) return 0; } -void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb) -{ - list_add(&cb->list, &t->target_callbacks); -} -EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks); - -int dm_table_any_congested(struct dm_table *t, int bdi_bits) -{ - struct dm_dev_internal *dd; - struct list_head *devices = dm_table_get_devices(t); - struct dm_target_callbacks *cb; - int r = 0; - - list_for_each_entry(dd, devices, list) { - struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); - char b[BDEVNAME_SIZE]; - - if (likely(q)) - r |= bdi_congested(q->backing_dev_info, bdi_bits); - else - DMWARN_LIMIT("%s: any_congested: nonexistent device %s", - dm_device_name(t->md), - bdevname(dd->dm_dev->bdev, b)); - } - - list_for_each_entry(cb, &t->target_callbacks, list) - if (cb->congested_fn) - r |= cb->congested_fn(cb, bdi_bits); - - return r; -} - struct mapped_device *dm_table_get_md(struct dm_table *t) { return t->md; diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index fa8d5464c1fb..fff4c50df74d 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -326,7 +326,6 @@ struct pool_c { struct pool *pool; struct dm_dev *data_dev; struct dm_dev *metadata_dev; - struct dm_target_callbacks callbacks; dm_block_t low_water_blocks; struct pool_features requested_pf; /* Features requested during table load */ @@ -758,7 +757,7 @@ static void issue(struct thin_c *tc, struct bio *bio) struct pool *pool = tc->pool; if (!bio_triggers_commit(tc, bio)) { - generic_make_request(bio); + submit_bio_noacct(bio); return; } @@ -2394,7 +2393,7 @@ static void process_deferred_bios(struct pool *pool) if (bio->bi_opf & REQ_PREFLUSH) bio_endio(bio); else - generic_make_request(bio); + submit_bio_noacct(bio); } } @@ -2796,18 +2795,6 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) } } -static int pool_is_congested(struct 
dm_target_callbacks *cb, int bdi_bits) -{ - struct pool_c *pt = container_of(cb, struct pool_c, callbacks); - struct request_queue *q; - - if (get_pool_mode(pt->pool) == PM_OUT_OF_DATA_SPACE) - return 1; - - q = bdev_get_queue(pt->data_dev->bdev); - return bdi_congested(q->backing_dev_info, bdi_bits); -} - static void requeue_bios(struct pool *pool) { struct thin_c *tc; @@ -3420,9 +3407,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) dm_pool_register_pre_commit_callback(pool->pmd, metadata_pre_commit_callback, pool); - pt->callbacks.congested_fn = pool_is_congested; - dm_table_add_target_callbacks(ti->table, &pt->callbacks); - mutex_unlock(&dm_thin_pool_table.mutex); return 0; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index eec9f252e935..f74982dcbea0 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -30,6 +30,7 @@ #define DM_VERITY_OPT_LOGGING "ignore_corruption" #define DM_VERITY_OPT_RESTART "restart_on_corruption" +#define DM_VERITY_OPT_PANIC "panic_on_corruption" #define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks" #define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once" @@ -254,6 +255,9 @@ out: if (v->mode == DM_VERITY_MODE_RESTART) kernel_restart("dm-verity device corrupted"); + if (v->mode == DM_VERITY_MODE_PANIC) + panic("dm-verity device corrupted"); + return 1; } @@ -681,7 +685,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) verity_submit_prefetch(v, io); - generic_make_request(bio); + submit_bio_noacct(bio); return DM_MAPIO_SUBMITTED; } @@ -742,6 +746,9 @@ static void verity_status(struct dm_target *ti, status_type_t type, case DM_VERITY_MODE_RESTART: DMEMIT(DM_VERITY_OPT_RESTART); break; + case DM_VERITY_MODE_PANIC: + DMEMIT(DM_VERITY_OPT_PANIC); + break; default: BUG(); } @@ -907,6 +914,10 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, v->mode = DM_VERITY_MODE_RESTART; continue; + } else if (!strcasecmp(arg_name, 
DM_VERITY_OPT_PANIC)) { + v->mode = DM_VERITY_MODE_PANIC; + continue; + } else if (!strcasecmp(arg_name, DM_VERITY_OPT_IGN_ZEROES)) { r = verity_alloc_zero_digest(v); if (r) { @@ -1221,7 +1232,7 @@ bad: static struct target_type verity_target = { .name = "verity", - .version = {1, 6, 0}, + .version = {1, 7, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, diff --git a/drivers/md/dm-verity-verify-sig.h b/drivers/md/dm-verity-verify-sig.h index 19b1547aa741..3987c7141f79 100644 --- a/drivers/md/dm-verity-verify-sig.h +++ b/drivers/md/dm-verity-verify-sig.h @@ -34,25 +34,25 @@ void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts); #define DM_VERITY_ROOT_HASH_VERIFICATION_OPTS 0 -int verity_verify_root_hash(const void *data, size_t data_len, - const void *sig_data, size_t sig_len) +static inline int verity_verify_root_hash(const void *data, size_t data_len, + const void *sig_data, size_t sig_len) { return 0; } -bool verity_verify_is_sig_opt_arg(const char *arg_name) +static inline bool verity_verify_is_sig_opt_arg(const char *arg_name) { return false; } -int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, - struct dm_verity_sig_opts *sig_opts, - unsigned int *argc, const char *arg_name) +static inline int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, + struct dm_verity *v, struct dm_verity_sig_opts *sig_opts, + unsigned int *argc, const char *arg_name) { return -EINVAL; } -void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts) +static inline void verity_verify_sig_opts_cleanup(struct dm_verity_sig_opts *sig_opts) { } diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 641b9e3a399b..4e769d13473a 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -20,7 +20,8 @@ enum verity_mode { DM_VERITY_MODE_EIO, DM_VERITY_MODE_LOGGING, - DM_VERITY_MODE_RESTART + DM_VERITY_MODE_RESTART, + DM_VERITY_MODE_PANIC }; enum verity_block_type { diff --git 
a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 5358894bb9fd..86dbe0c8b45c 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -538,7 +538,7 @@ static void ssd_commit_superblock(struct dm_writecache *wc) static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) { if (WC_MODE_PMEM(wc)) - wmb(); + pmem_wmb(); else ssd_commit_flushed(wc, wait_for_ios); } @@ -1244,7 +1244,7 @@ static int writecache_flush_thread(void *data) bio_end_sector(bio)); wc_unlock(wc); bio_set_dev(bio, wc->dev->bdev); - generic_make_request(bio); + submit_bio_noacct(bio); } else { writecache_flush(wc); wc_unlock(wc); @@ -1752,7 +1752,7 @@ static void writecache_writeback(struct work_struct *work) { struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work); struct blk_plug plug; - struct wc_entry *f, *uninitialized_var(g), *e = NULL; + struct wc_entry *f, *g, *e = NULL; struct rb_node *node, *next_node; struct list_head skipped; struct writeback_list wbl; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 42aa5139df7c..697f9de37355 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -140,7 +140,7 @@ static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone, bio_advance(bio, clone->bi_iter.bi_size); refcount_inc(&bioctx->ref); - generic_make_request(clone); + submit_bio_noacct(clone); if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone)) zone->wp_block += nr_blocks; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 5b9de2f71bb0..32fa6499739f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -504,7 +504,8 @@ static int dm_blk_report_zones(struct gendisk *disk, sector_t sector, } args.tgt = tgt; - ret = tgt->type->report_zones(tgt, &args, nr_zones); + ret = tgt->type->report_zones(tgt, &args, + nr_zones - args.zone_idx); if (ret < 0) goto out; } while (args.zone_idx < nr_zones && @@ -1273,7 +1274,6 @@ static blk_qc_t __map_bio(struct 
dm_target_io *tio) sector_t sector; struct bio *clone = &tio->clone; struct dm_io *io = tio->io; - struct mapped_device *md = io->md; struct dm_target *ti = tio->ti; blk_qc_t ret = BLK_QC_T_NONE; @@ -1295,10 +1295,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) /* the bio has been remapped so dispatch it */ trace_block_bio_remap(clone->bi_disk->queue, clone, bio_dev(io->orig_bio), sector); - if (md->type == DM_TYPE_NVME_BIO_BASED) - ret = direct_make_request(clone); - else - ret = generic_make_request(clone); + ret = submit_bio_noacct(clone); break; case DM_MAPIO_KILL: free_tio(tio); @@ -1645,7 +1642,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, error = __split_and_process_non_flush(&ci); if (current->bio_list && ci.sector_count && !error) { /* - * Remainder must be passed to generic_make_request() + * Remainder must be passed to submit_bio_noacct() * so that it gets handled *after* bios already submitted * have been completely processed. * We take a clone of the original to store in @@ -1670,7 +1667,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, bio_chain(b, bio); trace_block_split(md->queue, b, bio->bi_iter.bi_sector); - ret = generic_make_request(bio); + ret = submit_bio_noacct(bio); break; } } @@ -1738,7 +1735,7 @@ static void dm_queue_split(struct mapped_device *md, struct dm_target *ti, struc bio_chain(split, *bio); trace_block_split(md->queue, split, (*bio)->bi_iter.bi_sector); - generic_make_request(*bio); + submit_bio_noacct(*bio); *bio = split; } } @@ -1763,13 +1760,13 @@ static blk_qc_t dm_process_bio(struct mapped_device *md, } /* - * If in ->make_request_fn we need to use blk_queue_split(), otherwise + * If in ->queue_bio we need to use blk_queue_split(), otherwise * queue_limits for abnormal requests (e.g. discard, writesame, etc) * won't be imposed. 
*/ if (current->bio_list) { if (is_abnormal_io(bio)) - blk_queue_split(md->queue, &bio); + blk_queue_split(&bio); else dm_queue_split(md, ti, &bio); } @@ -1780,9 +1777,9 @@ static blk_qc_t dm_process_bio(struct mapped_device *md, return __split_and_process_bio(md, map, bio); } -static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t dm_submit_bio(struct bio *bio) { - struct mapped_device *md = q->queuedata; + struct mapped_device *md = bio->bi_disk->private_data; blk_qc_t ret = BLK_QC_T_NONE; int srcu_idx; struct dm_table *map; @@ -1791,12 +1788,12 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) /* * We are called with a live reference on q_usage_counter, but * that one will be released as soon as we return. Grab an - * extra one as blk_mq_make_request expects to be able to - * consume a reference (which lives until the request is freed - * in case a request is allocated). + * extra one as blk_mq_submit_bio expects to be able to consume + * a reference (which lives until the request is freed in case a + * request is allocated). */ - percpu_ref_get(&q->q_usage_counter); - return blk_mq_make_request(q, bio); + percpu_ref_get(&bio->bi_disk->queue->q_usage_counter); + return blk_mq_submit_bio(bio); } map = dm_get_live_table(md, &srcu_idx); @@ -1818,31 +1815,6 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) return ret; } -static int dm_any_congested(void *congested_data, int bdi_bits) -{ - int r = bdi_bits; - struct mapped_device *md = congested_data; - struct dm_table *map; - - if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { - if (dm_request_based(md)) { - /* - * With request-based DM we only need to check the - * top-level queue for congestion. 
- */ - struct backing_dev_info *bdi = md->queue->backing_dev_info; - r = bdi->wb.congested->state & bdi_bits; - } else { - map = dm_get_live_table_fast(md); - if (map) - r = dm_table_any_congested(map, bdi_bits); - dm_put_live_table_fast(md); - } - } - - return r; -} - /*----------------------------------------------------------------- * An IDR is used to keep track of allocated minor numbers. *---------------------------------------------------------------*/ @@ -1981,14 +1953,13 @@ static struct mapped_device *alloc_dev(int minor) spin_lock_init(&md->uevent_lock); /* - * default to bio-based required ->make_request_fn until DM - * table is loaded and md->type established. If request-based - * table is loaded: blk-mq will override accordingly. + * default to bio-based until DM table is loaded and md->type + * established. If request-based table is loaded: blk-mq will + * override accordingly. */ - md->queue = blk_alloc_queue(dm_make_request, numa_node_id); + md->queue = blk_alloc_queue(numa_node_id); if (!md->queue) goto bad; - md->queue->queuedata = md; md->disk = alloc_disk_node(1, md->numa_node_id); if (!md->disk) @@ -2282,12 +2253,6 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md) } EXPORT_SYMBOL_GPL(dm_get_queue_limits); -static void dm_init_congested_fn(struct mapped_device *md) -{ - md->queue->backing_dev_info->congested_data = md; - md->queue->backing_dev_info->congested_fn = dm_any_congested; -} - /* * Setup the DM device's queue based on md's type */ @@ -2304,12 +2269,10 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) DMERR("Cannot initialize queue for request-based dm-mq mapped device"); return r; } - dm_init_congested_fn(md); break; case DM_TYPE_BIO_BASED: case DM_TYPE_DAX_BIO_BASED: case DM_TYPE_NVME_BIO_BASED: - dm_init_congested_fn(md); break; case DM_TYPE_NONE: WARN_ON_ONCE(true); @@ -2531,7 +2494,7 @@ static void dm_wq_work(struct work_struct *work) break; if (dm_request_based(md)) - (void) 
generic_make_request(c); + (void) submit_bio_noacct(c); else (void) dm_process_bio(md, map, c); } @@ -3286,6 +3249,7 @@ static const struct pr_ops dm_pr_ops = { }; static const struct block_device_operations dm_blk_dops = { + .submit_bio = dm_submit_bio, .open = dm_blk_open, .release = dm_blk_close, .ioctl = dm_blk_ioctl, diff --git a/drivers/md/dm.h b/drivers/md/dm.h index d7c4f6606b5f..4f5fe664d05a 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -63,7 +63,6 @@ void dm_table_presuspend_targets(struct dm_table *t); void dm_table_presuspend_undo_targets(struct dm_table *t); void dm_table_postsuspend_targets(struct dm_table *t); int dm_table_resume_targets(struct dm_table *t); -int dm_table_any_congested(struct dm_table *t, int bdi_bits); enum dm_queue_mode dm_table_get_type(struct dm_table *t); struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); struct dm_target *dm_table_get_immutable_target(struct dm_table *t); diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c new file mode 100644 index 000000000000..6bbec89976a7 --- /dev/null +++ b/drivers/md/md-autodetect.c @@ -0,0 +1,291 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/blkdev.h> +#include <linux/init.h> +#include <linux/mount.h> +#include <linux/major.h> +#include <linux/delay.h> +#include <linux/init_syscalls.h> +#include <linux/raid/detect.h> +#include <linux/raid/md_u.h> +#include <linux/raid/md_p.h> +#include "md.h" + +/* + * When md (and any require personalities) are compiled into the kernel + * (not a module), arrays can be assembles are boot time using with AUTODETECT + * where specially marked partitions are registered with md_autodetect_dev(), + * and with MD_BOOT where devices to be collected are given on the boot line + * with md=..... + * The code for that is here. 
+ */ + +#ifdef CONFIG_MD_AUTODETECT +static int __initdata raid_noautodetect; +#else +static int __initdata raid_noautodetect=1; +#endif +static int __initdata raid_autopart; + +static struct md_setup_args { + int minor; + int partitioned; + int level; + int chunk; + char *device_names; +} md_setup_args[256] __initdata; + +static int md_setup_ents __initdata; + +/* + * Parse the command-line parameters given our kernel, but do not + * actually try to invoke the MD device now; that is handled by + * md_setup_drive after the low-level disk drivers have initialised. + * + * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which + * assigns the task of parsing integer arguments to the + * invoked program now). Added ability to initialise all + * the MD devices (by specifying multiple "md=" lines) + * instead of just one. -- KTK + * 18May2000: Added support for persistent-superblock arrays: + * md=n,0,factor,fault,device-list uses RAID0 for device n + * md=n,-1,factor,fault,device-list uses LINEAR for device n + * md=n,device-list reads a RAID superblock from the devices + * elements in device-list are read by name_to_kdev_t so can be + * a hex number or something like /dev/hda1 /dev/sdb + * 2001-06-03: Dave Cinege <dcinege@psychosis.com> + * Shifted name_to_kdev_t() and related operations to md_set_drive() + * for later execution. Rewrote section to make devfs compatible. + */ +static int __init md_setup(char *str) +{ + int minor, level, factor, fault, partitioned = 0; + char *pername = ""; + char *str1; + int ent; + + if (*str == 'd') { + partitioned = 1; + str++; + } + if (get_option(&str, &minor) != 2) { /* MD Number */ + printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); + return 0; + } + str1 = str; + for (ent=0 ; ent< md_setup_ents ; ent++) + if (md_setup_args[ent].minor == minor && + md_setup_args[ent].partitioned == partitioned) { + printk(KERN_WARNING "md: md=%s%d, Specified more than once. 
" + "Replacing previous definition.\n", partitioned?"d":"", minor); + break; + } + if (ent >= ARRAY_SIZE(md_setup_args)) { + printk(KERN_WARNING "md: md=%s%d - too many md initialisations\n", partitioned?"d":"", minor); + return 0; + } + if (ent >= md_setup_ents) + md_setup_ents++; + switch (get_option(&str, &level)) { /* RAID level */ + case 2: /* could be 0 or -1.. */ + if (level == 0 || level == LEVEL_LINEAR) { + if (get_option(&str, &factor) != 2 || /* Chunk Size */ + get_option(&str, &fault) != 2) { + printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); + return 0; + } + md_setup_args[ent].level = level; + md_setup_args[ent].chunk = 1 << (factor+12); + if (level == LEVEL_LINEAR) + pername = "linear"; + else + pername = "raid0"; + break; + } + /* FALL THROUGH */ + case 1: /* the first device is numeric */ + str = str1; + /* FALL THROUGH */ + case 0: + md_setup_args[ent].level = LEVEL_NONE; + pername="super-block"; + } + + printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n", + minor, pername, str); + md_setup_args[ent].device_names = str; + md_setup_args[ent].partitioned = partitioned; + md_setup_args[ent].minor = minor; + + return 1; +} + +static void __init md_setup_drive(struct md_setup_args *args) +{ + char *devname = args->device_names; + dev_t devices[MD_SB_DISKS + 1], mdev; + struct mdu_array_info_s ainfo = { }; + struct block_device *bdev; + struct mddev *mddev; + int err = 0, i; + char name[16]; + + if (args->partitioned) { + mdev = MKDEV(mdp_major, args->minor << MdpMinorShift); + sprintf(name, "md_d%d", args->minor); + } else { + mdev = MKDEV(MD_MAJOR, args->minor); + sprintf(name, "md%d", args->minor); + } + + for (i = 0; i < MD_SB_DISKS && devname != NULL; i++) { + struct kstat stat; + char *p; + char comp_name[64]; + dev_t dev; + + p = strchr(devname, ','); + if (p) + *p++ = 0; + + dev = name_to_dev_t(devname); + if (strncmp(devname, "/dev/", 5) == 0) + devname += 5; + snprintf(comp_name, 63, "/dev/%s", devname); + if 
(init_stat(comp_name, &stat, 0) == 0 && S_ISBLK(stat.mode)) + dev = new_decode_dev(stat.rdev); + if (!dev) { + pr_warn("md: Unknown device name: %s\n", devname); + break; + } + + devices[i] = dev; + devname = p; + } + devices[i] = 0; + + if (!i) + return; + + pr_info("md: Loading %s: %s\n", name, args->device_names); + + bdev = blkdev_get_by_dev(mdev, FMODE_READ, NULL); + if (IS_ERR(bdev)) { + pr_err("md: open failed - cannot start array %s\n", name); + return; + } + + err = -EIO; + if (WARN(bdev->bd_disk->fops != &md_fops, + "Opening block device %x resulted in non-md device\n", + mdev)) + goto out_blkdev_put; + + mddev = bdev->bd_disk->private_data; + + err = mddev_lock(mddev); + if (err) { + pr_err("md: failed to lock array %s\n", name); + goto out_blkdev_put; + } + + if (!list_empty(&mddev->disks) || mddev->raid_disks) { + pr_warn("md: Ignoring %s, already autodetected. (Use raid=noautodetect)\n", + name); + goto out_unlock; + } + + if (args->level != LEVEL_NONE) { + /* non-persistent */ + ainfo.level = args->level; + ainfo.md_minor = args->minor; + ainfo.not_persistent = 1; + ainfo.state = (1 << MD_SB_CLEAN); + ainfo.chunk_size = args->chunk; + while (devices[ainfo.raid_disks]) + ainfo.raid_disks++; + } + + err = md_set_array_info(mddev, &ainfo); + + for (i = 0; i <= MD_SB_DISKS && devices[i]; i++) { + struct mdu_disk_info_s dinfo = { + .major = MAJOR(devices[i]), + .minor = MINOR(devices[i]), + }; + + if (args->level != LEVEL_NONE) { + dinfo.number = i; + dinfo.raid_disk = i; + dinfo.state = + (1 << MD_DISK_ACTIVE) | (1 << MD_DISK_SYNC); + } + + md_add_new_disk(mddev, &dinfo); + } + + if (!err) + err = do_md_run(mddev); + if (err) + pr_warn("md: starting %s failed\n", name); +out_unlock: + mddev_unlock(mddev); +out_blkdev_put: + blkdev_put(bdev, FMODE_READ); +} + +static int __init raid_setup(char *str) +{ + int len, pos; + + len = strlen(str) + 1; + pos = 0; + + while (pos < len) { + char *comma = strchr(str+pos, ','); + int wlen; + if (comma) + wlen = 
(comma-str)-pos; + else wlen = (len-1)-pos; + + if (!strncmp(str, "noautodetect", wlen)) + raid_noautodetect = 1; + if (!strncmp(str, "autodetect", wlen)) + raid_noautodetect = 0; + if (strncmp(str, "partitionable", wlen)==0) + raid_autopart = 1; + if (strncmp(str, "part", wlen)==0) + raid_autopart = 1; + pos += wlen+1; + } + return 1; +} + +__setup("raid=", raid_setup); +__setup("md=", md_setup); + +static void __init autodetect_raid(void) +{ + /* + * Since we don't want to detect and use half a raid array, we need to + * wait for the known devices to complete their probing + */ + printk(KERN_INFO "md: Waiting for all devices to be available before autodetect\n"); + printk(KERN_INFO "md: If you don't use raid, use raid=noautodetect\n"); + + wait_for_device_probe(); + md_autostart_arrays(raid_autopart); +} + +void __init md_run_setup(void) +{ + int ent; + + if (raid_noautodetect) + printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=autodetect will force)\n"); + else + autodetect_raid(); + + for (ent = 0; ent < md_setup_ents; ent++) + md_setup_drive(&md_setup_args[ent]); +} diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 95a5f3757fa3..d61b524ae440 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1631,7 +1631,7 @@ void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) s += blocks; } bitmap->last_end_sync = jiffies; - sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed); } EXPORT_SYMBOL(md_bitmap_cond_end_sync); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 813a99ffa86f..73fd50e77975 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1518,6 +1518,7 @@ static void unlock_all_bitmaps(struct mddev *mddev) } } kfree(cinfo->other_bitmap_lockres); + cinfo->other_bitmap_lockres = NULL; } } diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c index 50ad4ba86f0e..fda4cb3f936f 100644 
--- a/drivers/md/md-faulty.c +++ b/drivers/md/md-faulty.c @@ -169,7 +169,7 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio) if (bio_data_dir(bio) == WRITE) { /* write request */ if (atomic_read(&conf->counters[WriteAll])) { - /* special case - don't decrement, don't generic_make_request, + /* special case - don't decrement, don't submit_bio_noacct, * just fail immediately */ bio_io_error(bio); @@ -214,7 +214,7 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio) } else bio_set_dev(bio, conf->rdev->bdev); - generic_make_request(bio); + submit_bio_noacct(bio); return true; } diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 26c75c0199fa..c2ae9125c4c3 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -46,29 +46,6 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) return conf->disks + lo; } -/* - * In linear_congested() conf->raid_disks is used as a copy of - * mddev->raid_disks to iterate conf->disks[], because conf->raid_disks - * and conf->disks[] are created in linear_conf(), they are always - * consitent with each other, but mddev->raid_disks does not. 
- */ -static int linear_congested(struct mddev *mddev, int bits) -{ - struct linear_conf *conf; - int i, ret = 0; - - rcu_read_lock(); - conf = rcu_dereference(mddev->private); - - for (i = 0; i < conf->raid_disks && !ret ; i++) { - struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); - ret |= bdi_congested(q->backing_dev_info, bits); - } - - rcu_read_unlock(); - return ret; -} - static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks) { struct linear_conf *conf; @@ -267,7 +244,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) struct bio *split = bio_split(bio, end_sector - bio_sector, GFP_NOIO, &mddev->bio_set); bio_chain(split, bio); - generic_make_request(bio); + submit_bio_noacct(bio); bio = split; } @@ -286,7 +263,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) bio_sector); mddev_check_writesame(mddev, bio); mddev_check_write_zeroes(mddev, bio); - generic_make_request(bio); + submit_bio_noacct(bio); } return true; @@ -322,7 +299,6 @@ static struct md_personality linear_personality = .hot_add_disk = linear_add, .size = linear_size, .quiesce = linear_quiesce, - .congested = linear_congested, }; static int __init linear_init (void) diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 152f9e65a226..776bbe542db5 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -131,7 +131,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio) mp_bh->bio.bi_private = mp_bh; mddev_check_writesame(mddev, &mp_bh->bio); mddev_check_write_zeroes(mddev, &mp_bh->bio); - generic_make_request(&mp_bh->bio); + submit_bio_noacct(&mp_bh->bio); return true; } @@ -151,28 +151,6 @@ static void multipath_status(struct seq_file *seq, struct mddev *mddev) seq_putc(seq, ']'); } -static int multipath_congested(struct mddev *mddev, int bits) -{ - struct mpconf *conf = mddev->private; - int i, ret = 0; - - rcu_read_lock(); - for (i = 0; i < 
mddev->raid_disks ; i++) { - struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct request_queue *q = bdev_get_queue(rdev->bdev); - - ret |= bdi_congested(q->backing_dev_info, bits); - /* Just like multipath_map, we just check the - * first available device - */ - break; - } - } - rcu_read_unlock(); - return ret; -} - /* * Careful, this can execute in IRQ contexts as well! */ @@ -348,7 +326,7 @@ static void multipathd(struct md_thread *thread) bio->bi_opf |= REQ_FAILFAST_TRANSPORT; bio->bi_end_io = multipath_end_request; bio->bi_private = mp_bh; - generic_make_request(bio); + submit_bio_noacct(bio); } } spin_unlock_irqrestore(&conf->device_lock, flags); @@ -478,7 +456,6 @@ static struct md_personality multipath_personality = .hot_add_disk = multipath_add_disk, .hot_remove_disk= multipath_remove_disk, .size = multipath_size, - .congested = multipath_congested, }; static int __init multipath_init (void) diff --git a/drivers/md/md.c b/drivers/md/md.c index f567f536b529..15bbdc1630ed 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -68,10 +68,6 @@ #include "md-bitmap.h" #include "md-cluster.h" -#ifndef MODULE -static void autostart_arrays(int part); -#endif - /* pers_list is a list of registered personalities protected * by pers_lock. * pers_lock does extra service to protect accesses to @@ -101,6 +97,8 @@ static void mddev_detach(struct mddev *mddev); * count by 2 for every hour elapsed between read errors. */ #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 +/* Default safemode delay: 200 msec */ +#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1) /* * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' * is 1000 KB/sec, so the extra system load does not show up that much. 
@@ -199,7 +197,7 @@ static int rdevs_init_serial(struct mddev *mddev) static int rdev_need_serial(struct md_rdev *rdev) { return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 && - rdev->bdev->bd_queue->nr_hw_queues != 1 && + rdev->bdev->bd_disk->queue->nr_hw_queues != 1 && test_bit(WriteMostly, &rdev->flags)); } @@ -330,8 +328,6 @@ static struct ctl_table raid_root_table[] = { { } }; -static const struct block_device_operations md_fops; - static int start_readonly; /* @@ -463,24 +459,46 @@ check_suspended: } EXPORT_SYMBOL(md_handle_request); -static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) +struct md_io { + struct mddev *mddev; + bio_end_io_t *orig_bi_end_io; + void *orig_bi_private; + unsigned long start_time; +}; + +static void md_end_io(struct bio *bio) +{ + struct md_io *md_io = bio->bi_private; + struct mddev *mddev = md_io->mddev; + + disk_end_io_acct(mddev->gendisk, bio_op(bio), md_io->start_time); + + bio->bi_end_io = md_io->orig_bi_end_io; + bio->bi_private = md_io->orig_bi_private; + + mempool_free(md_io, &mddev->md_io_pool); + + if (bio->bi_end_io) + bio->bi_end_io(bio); +} + +static blk_qc_t md_submit_bio(struct bio *bio) { const int rw = bio_data_dir(bio); - const int sgrp = op_stat_group(bio_op(bio)); struct mddev *mddev = bio->bi_disk->private_data; - unsigned int sectors; - if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { + if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); return BLK_QC_T_NONE; } - blk_queue_split(q, &bio); - - if (mddev == NULL || mddev->pers == NULL) { + if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) { bio_io_error(bio); return BLK_QC_T_NONE; } + + blk_queue_split(&bio); + if (mddev->ro == 1 && unlikely(rw == WRITE)) { if (bio_sectors(bio) != 0) bio->bi_status = BLK_STS_IOERR; @@ -488,21 +506,27 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } - /* - * save the sectors now since our bio can - 
* go away inside make_request - */ - sectors = bio_sectors(bio); + if (bio->bi_end_io != md_end_io) { + struct md_io *md_io; + + md_io = mempool_alloc(&mddev->md_io_pool, GFP_NOIO); + md_io->mddev = mddev; + md_io->orig_bi_end_io = bio->bi_end_io; + md_io->orig_bi_private = bio->bi_private; + + bio->bi_end_io = md_end_io; + bio->bi_private = md_io; + + md_io->start_time = disk_start_io_acct(mddev->gendisk, + bio_sectors(bio), + bio_op(bio)); + } + /* bio could be mergeable after passing to underlayer */ bio->bi_opf &= ~REQ_NOMERGE; md_handle_request(mddev, bio); - part_stat_lock(); - part_stat_inc(&mddev->gendisk->part0, ios[sgrp]); - part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors); - part_stat_unlock(); - return BLK_QC_T_NONE; } @@ -549,26 +573,6 @@ void mddev_resume(struct mddev *mddev) } EXPORT_SYMBOL_GPL(mddev_resume); -int mddev_congested(struct mddev *mddev, int bits) -{ - struct md_personality *pers = mddev->pers; - int ret = 0; - - rcu_read_lock(); - if (mddev->suspended) - ret = 1; - else if (pers && pers->congested) - ret = pers->congested(mddev, bits); - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(mddev_congested); -static int md_congested(void *data, int bits) -{ - struct mddev *mddev = data; - return mddev_congested(mddev, bits); -} - /* * Generic flush handling for md */ @@ -948,7 +952,8 @@ static void super_written(struct bio *bio) struct mddev *mddev = rdev->mddev; if (bio->bi_status) { - pr_err("md: super_written gets error=%d\n", bio->bi_status); + pr_err("md: %s gets error=%d\n", __func__, + blk_status_to_errno(bio->bi_status)); md_error(mddev, rdev); if (!test_bit(Faulty, &rdev->flags) && (bio->bi_opf & MD_FAILFAST)) { @@ -2163,6 +2168,24 @@ retry: sb->sb_csum = calc_sb_1_csum(sb); } +static sector_t super_1_choose_bm_space(sector_t dev_size) +{ + sector_t bm_space; + + /* if the device is bigger than 8Gig, save 64k for bitmap + * usage, if bigger than 200Gig, save 128k + */ + if (dev_size < 64*2) + bm_space = 0; + else 
if (dev_size - 64*2 >= 200*1024*1024*2) + bm_space = 128*2; + else if (dev_size - 4*2 > 8*1024*1024*2) + bm_space = 64*2; + else + bm_space = 4*2; + return bm_space; +} + static unsigned long long super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) { @@ -2183,13 +2206,22 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) return 0; } else { /* minor version 0; superblock after data */ - sector_t sb_start; - sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2; + sector_t sb_start, bm_space; + sector_t dev_size = i_size_read(rdev->bdev->bd_inode) >> 9; + + /* 8K is for superblock */ + sb_start = dev_size - 8*2; sb_start &= ~(sector_t)(4*2 - 1); - max_sectors = rdev->sectors + sb_start - rdev->sb_start; + + bm_space = super_1_choose_bm_space(dev_size); + + /* Space that can be used to store date needs to decrease + * superblock bitmap space and bad block space(4K) + */ + max_sectors = sb_start - bm_space - 4*2; + if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; - rdev->sb_start = sb_start; } sb = page_address(rdev->sb_page); sb->data_size = cpu_to_le64(num_sectors); @@ -2441,9 +2473,13 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) goto fail; ko = &part_to_dev(rdev->bdev->bd_part)->kobj; - if (sysfs_create_link(&rdev->kobj, ko, "block")) - /* failure here is OK */; + /* failure here is OK */ + err = sysfs_create_link(&rdev->kobj, ko, "block"); rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); + rdev->sysfs_unack_badblocks = + sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); + rdev->sysfs_badblocks = + sysfs_get_dirent_safe(rdev->kobj.sd, "bad_blocks"); list_add_rcu(&rdev->same_set, &mddev->disks); bd_link_disk_holder(rdev->bdev, mddev->gendisk); @@ -2477,7 +2513,11 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); + 
sysfs_put(rdev->sysfs_unack_badblocks); + sysfs_put(rdev->sysfs_badblocks); rdev->sysfs_state = NULL; + rdev->sysfs_unack_badblocks = NULL; + rdev->sysfs_badblocks = NULL; rdev->badblocks.count = 0; /* We need to delay this, otherwise we can deadlock when * writing to 'remove' to "dev/state". We also need @@ -2822,7 +2862,7 @@ rewrite: goto repeat; wake_up(&mddev->sb_wait); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); rdev_for_each(rdev, mddev) { if (test_and_clear_bit(FaultRecorded, &rdev->flags)) @@ -3202,8 +3242,8 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) return err; } else sysfs_notify_dirent_safe(rdev->sysfs_state); - if (sysfs_link_rdev(rdev->mddev, rdev)) - /* failure here is OK */; + /* failure here is OK */; + sysfs_link_rdev(rdev->mddev, rdev); /* don't wakeup anyone, leave that to userspace. */ } else { if (slot >= rdev->mddev->raid_disks && @@ -4075,7 +4115,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev_resume(mddev); if (!mddev->thread) md_update_sb(mddev, 1); - sysfs_notify(&mddev->kobj, NULL, "level"); + sysfs_notify_dirent_safe(mddev->sysfs_level); md_new_event(mddev); rv = len; out_unlock: @@ -4188,6 +4228,14 @@ static struct md_sysfs_entry md_raid_disks = __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); static ssize_t +uuid_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%pU\n", mddev->uuid); +} +static struct md_sysfs_entry md_uuid = +__ATTR(uuid, S_IRUGO, uuid_show, NULL); + +static ssize_t chunk_size_show(struct mddev *mddev, char *page) { if (mddev->reshape_position != MaxSector && @@ -4372,7 +4420,6 @@ array_state_show(struct mddev *mddev, char *page) static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev); static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); -static int do_md_run(struct mddev 
*mddev); static int restart_array(struct mddev *mddev); static ssize_t @@ -4828,7 +4875,7 @@ action_store(struct mddev *mddev, const char *page, size_t len) } if (err) return err; - sysfs_notify(&mddev->kobj, NULL, "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); } else { if (cmd_match(page, "check")) set_bit(MD_RECOVERY_CHECK, &mddev->recovery); @@ -5443,6 +5490,7 @@ static struct attribute *md_default_attrs[] = { &md_level.attr, &md_layout.attr, &md_raid_disks.attr, + &md_uuid.attr, &md_chunk_size.attr, &md_size.attr, &md_resync_start.attr, @@ -5534,6 +5582,13 @@ static void md_free(struct kobject *ko) if (mddev->sysfs_state) sysfs_put(mddev->sysfs_state); + if (mddev->sysfs_completed) + sysfs_put(mddev->sysfs_completed); + if (mddev->sysfs_degraded) + sysfs_put(mddev->sysfs_degraded); + if (mddev->sysfs_level) + sysfs_put(mddev->sysfs_level); + if (mddev->gendisk) del_gendisk(mddev->gendisk); @@ -5545,6 +5600,7 @@ static void md_free(struct kobject *ko) bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); + mempool_exit(&mddev->md_io_pool); kfree(mddev); } @@ -5640,8 +5696,13 @@ static int md_alloc(dev_t dev, char *name) */ mddev->hold_active = UNTIL_STOP; + error = mempool_init_kmalloc_pool(&mddev->md_io_pool, BIO_POOL_SIZE, + sizeof(struct md_io)); + if (error) + goto abort; + error = -ENOMEM; - mddev->queue = blk_alloc_queue(md_make_request, NUMA_NO_NODE); + mddev->queue = blk_alloc_queue(NUMA_NO_NODE); if (!mddev->queue) goto abort; @@ -5670,6 +5731,7 @@ static int md_alloc(dev_t dev, char *name) * remove it now. 
*/ disk->flags |= GENHD_FL_EXT_DEVT; + disk->events |= DISK_EVENT_MEDIA_CHANGE; mddev->gendisk = disk; /* As soon as we call add_disk(), another thread could get * through to md_open, so make sure it doesn't get too far @@ -5695,6 +5757,9 @@ static int md_alloc(dev_t dev, char *name) if (!error && mddev->kobj.sd) { kobject_uevent(&mddev->kobj, KOBJ_ADD); mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); + mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); + mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); + mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); } mddev_put(mddev); return error; @@ -5964,8 +6029,6 @@ int md_run(struct mddev *mddev) blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue); else blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue); - mddev->queue->backing_dev_info->congested_data = mddev; - mddev->queue->backing_dev_info->congested_fn = md_congested; } if (pers->sync_request) { if (mddev->kobj.sd && @@ -5982,7 +6045,7 @@ int md_run(struct mddev *mddev) if (mddev_is_clustered(mddev)) mddev->safemode_delay = 0; else - mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ + mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; mddev->in_sync = 1; smp_wmb(); spin_lock(&mddev->lock); @@ -6019,7 +6082,7 @@ abort: } EXPORT_SYMBOL_GPL(md_run); -static int do_md_run(struct mddev *mddev) +int do_md_run(struct mddev *mddev) { int err; @@ -6049,7 +6112,7 @@ static int do_md_run(struct mddev *mddev) kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); sysfs_notify_dirent_safe(mddev->sysfs_state); sysfs_notify_dirent_safe(mddev->sysfs_action); - sysfs_notify(&mddev->kobj, NULL, "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); out: clear_bit(MD_NOT_READY, &mddev->flags); return err; @@ -6350,7 +6413,6 @@ static int do_md_stop(struct mddev *mddev, int mode, __md_stop_writes(mddev); __md_stop(mddev); - 
mddev->queue->backing_dev_info->congested_fn = NULL; /* tell userspace to handle 'inactive' */ sysfs_notify_dirent_safe(mddev->sysfs_state); @@ -6655,7 +6717,7 @@ static int get_disk_info(struct mddev *mddev, void __user * arg) return 0; } -static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) +int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) { char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; struct md_rdev *rdev; @@ -6701,7 +6763,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) } /* - * add_new_disk can be used once the array is assembled + * md_add_new_disk can be used once the array is assembled * to add "hot spares". They must already have a superblock * written */ @@ -6814,7 +6876,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) return err; } - /* otherwise, add_new_disk is only allowed + /* otherwise, md_add_new_disk is only allowed * for major_version==0 superblocks */ if (mddev->major_version != 0) { @@ -7059,7 +7121,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd) } /* - * set_array_info is used two different ways + * md_set_array_info is used two different ways * The original usage is when creating a new array. * In this usage, raid_disks is > 0 and it together with * level, size, not_persistent,layout,chunksize determine the @@ -7071,9 +7133,8 @@ static int set_bitmap_file(struct mddev *mddev, int fd) * The minor and patch _version numbers are also kept incase the * super_block handler wishes to interpret them. 
*/ -static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) +int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info) { - if (info->raid_disks == 0) { /* just setting version number for superblock loading */ if (info->major_version < 0 || @@ -7361,6 +7422,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.nodes = 0; md_cluster_ops->leave(mddev); + module_put(md_cluster_mod); + mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } mddev_suspend(mddev); md_bitmap_destroy(mddev); @@ -7421,7 +7484,6 @@ static inline bool md_ioctl_valid(unsigned int cmd) case GET_DISK_INFO: case HOT_ADD_DISK: case HOT_REMOVE_DISK: - case RAID_AUTORUN: case RAID_VERSION: case RESTART_ARRAY_RW: case RUN_ARRAY: @@ -7467,13 +7529,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, case RAID_VERSION: err = get_version(argp); goto out; - -#ifndef MODULE - case RAID_AUTORUN: - err = 0; - autostart_arrays(arg); - goto out; -#endif default:; } @@ -7572,7 +7627,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = -EBUSY; goto unlock; } - err = set_array_info(mddev, &info); + err = md_set_array_info(mddev, &info); if (err) { pr_warn("md: couldn't set array info. 
%d\n", err); goto unlock; @@ -7626,7 +7681,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, /* Need to clear read-only for this */ break; else - err = add_new_disk(mddev, &info); + err = md_add_new_disk(mddev, &info); goto unlock; } break; @@ -7694,7 +7749,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, if (copy_from_user(&info, argp, sizeof(info))) err = -EFAULT; else - err = add_new_disk(mddev, &info); + err = md_add_new_disk(mddev, &info); goto unlock; } @@ -7806,23 +7861,21 @@ static void md_release(struct gendisk *disk, fmode_t mode) mddev_put(mddev); } -static int md_media_changed(struct gendisk *disk) -{ - struct mddev *mddev = disk->private_data; - - return mddev->changed; -} - -static int md_revalidate(struct gendisk *disk) +static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) { struct mddev *mddev = disk->private_data; + unsigned int ret = 0; + if (mddev->changed) + ret = DISK_EVENT_MEDIA_CHANGE; mddev->changed = 0; - return 0; + return ret; } -static const struct block_device_operations md_fops = + +const struct block_device_operations md_fops = { .owner = THIS_MODULE, + .submit_bio = md_submit_bio, .open = md_open, .release = md_release, .ioctl = md_ioctl, @@ -7830,8 +7883,7 @@ static const struct block_device_operations md_fops = .compat_ioctl = md_compat_ioctl, #endif .getgeo = md_getgeo, - .media_changed = md_media_changed, - .revalidate_disk= md_revalidate, + .check_events = md_check_events, }; static int md_thread(void *arg) @@ -8355,6 +8407,7 @@ EXPORT_SYMBOL(unregister_md_cluster_operations); int md_setup_cluster(struct mddev *mddev, int nodes) { + int ret; if (!md_cluster_ops) request_module("md-cluster"); spin_lock(&pers_lock); @@ -8366,7 +8419,10 @@ int md_setup_cluster(struct mddev *mddev, int nodes) } spin_unlock(&pers_lock); - return md_cluster_ops->join(mddev, nodes); + ret = md_cluster_ops->join(mddev, nodes); + if (!ret) + mddev->safemode_delay = 0; + return ret; } void 
md_cluster_stop(struct mddev *mddev) @@ -8767,7 +8823,7 @@ void md_do_sync(struct md_thread *thread) } else mddev->curr_resync = 3; /* no longer delayed */ mddev->curr_resync_completed = j; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); md_new_event(mddev); update_time = jiffies; @@ -8795,7 +8851,7 @@ void md_do_sync(struct md_thread *thread) mddev->recovery_cp = j; update_time = jiffies; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); } while (j >= mddev->resync_max && @@ -8902,7 +8958,7 @@ void md_do_sync(struct md_thread *thread) !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev->curr_resync > 3) { mddev->curr_resync_completed = mddev->curr_resync; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); } mddev->pers->sync_request(mddev, max_sectors, &skipped); @@ -9032,7 +9088,7 @@ static int remove_and_add_spares(struct mddev *mddev, } if (removed && mddev->kobj.sd) - sysfs_notify(&mddev->kobj, NULL, "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); if (this && removed) goto no_add; @@ -9060,8 +9116,8 @@ static int remove_and_add_spares(struct mddev *mddev, rdev->recovery_offset = 0; } if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { - if (sysfs_link_rdev(mddev, rdev)) - /* failure here is OK */; + /* failure here is OK */ + sysfs_link_rdev(mddev, rdev); if (!test_bit(Journal, &rdev->flags)) spares++; md_new_event(mddev); @@ -9315,8 +9371,7 @@ void md_reap_sync_thread(struct mddev *mddev) /* success...*/ /* activate any spares */ if (mddev->pers->spare_active(mddev)) { - sysfs_notify(&mddev->kobj, NULL, - "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); } } @@ -9406,8 +9461,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, if (rv == 0) { /* Make 
sure they get written out promptly */ if (test_bit(ExternalBbl, &rdev->flags)) - sysfs_notify(&rdev->kobj, NULL, - "unacknowledged_bad_blocks"); + sysfs_notify_dirent_safe(rdev->sysfs_unack_badblocks); sysfs_notify_dirent_safe(rdev->sysfs_state); set_mask_bits(&mddev->sb_flags, 0, BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); @@ -9428,7 +9482,7 @@ int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, s += rdev->data_offset; rv = badblocks_clear(&rdev->badblocks, s, sectors); if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) - sysfs_notify(&rdev->kobj, NULL, "bad_blocks"); + sysfs_notify_dirent_safe(rdev->sysfs_badblocks); return rv; } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); @@ -9658,7 +9712,7 @@ static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) if (rdev->recovery_offset == MaxSector && !test_bit(In_sync, &rdev->flags) && mddev->pers->spare_active(mddev)) - sysfs_notify(&mddev->kobj, NULL, "degraded"); + sysfs_notify_dirent_safe(mddev->sysfs_degraded); put_page(swapout); return 0; @@ -9721,7 +9775,7 @@ void md_autodetect_dev(dev_t dev) } } -static void autostart_arrays(int part) +void md_autostart_arrays(int part) { struct md_rdev *rdev; struct detected_devices_node *node_detected_dev; diff --git a/drivers/md/md.h b/drivers/md/md.h index 612814d07d35..d9c4e6b7e939 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -126,7 +126,10 @@ struct md_rdev { struct kernfs_node *sysfs_state; /* handle for 'state' * sysfs entry */ - + /* handle for 'unacknowledged_bad_blocks' sysfs dentry */ + struct kernfs_node *sysfs_unack_badblocks; + /* handle for 'bad_blocks' sysfs dentry */ + struct kernfs_node *sysfs_badblocks; struct badblocks badblocks; struct { @@ -420,6 +423,9 @@ struct mddev { * file in sysfs. 
*/ struct kernfs_node *sysfs_action; /* handle for 'sync_action' */ + struct kernfs_node *sysfs_completed; /*handle for 'sync_completed' */ + struct kernfs_node *sysfs_degraded; /*handle for 'degraded' */ + struct kernfs_node *sysfs_level; /*handle for 'level' */ struct work_struct del_work; /* used for delayed sysfs removal */ @@ -481,6 +487,7 @@ struct mddev { struct bio_set sync_set; /* for sync operations like * metadata and bitmap writes */ + mempool_t md_io_pool; /* Generic flush handling. * The last to finish preflush schedules a worker to submit @@ -597,9 +604,6 @@ struct md_personality * array. */ void *(*takeover) (struct mddev *mddev); - /* congested implements bdi.congested_fn(). - * Will not be called while array is 'suspended' */ - int (*congested)(struct mddev *mddev, int bits); /* Changes the consistency policy of an active array. */ int (*change_consistency_policy)(struct mddev *mddev, const char *buf); }; @@ -710,7 +714,6 @@ extern void md_done_sync(struct mddev *mddev, int blocks, int ok); extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); -extern int mddev_congested(struct mddev *mddev, int bits); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, sector_t sector, int size, struct page *page); @@ -800,4 +803,16 @@ static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio !bio->bi_disk->queue->limits.max_write_zeroes_sectors) mddev->queue->limits.max_write_zeroes_sectors = 0; } + +struct mdu_array_info_s; +struct mdu_disk_info_s; + +extern int mdp_major; +void md_autostart_arrays(int part); +int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); +int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); +int do_md_run(struct mddev *mddev); + +extern const struct block_device_operations md_fops; + #endif /* _MD_MD_H */ diff 
--git a/drivers/md/raid0.c b/drivers/md/raid0.c index 322386ff5d22..f54a449f97aa 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -29,21 +29,6 @@ module_param(default_layout, int, 0644); (1L << MD_HAS_PPL) | \ (1L << MD_HAS_MULTIPLE_PPLS)) -static int raid0_congested(struct mddev *mddev, int bits) -{ - struct r0conf *conf = mddev->private; - struct md_rdev **devlist = conf->devlist; - int raid_disks = conf->strip_zone[0].nb_dev; - int i, ret = 0; - - for (i = 0; i < raid_disks && !ret ; i++) { - struct request_queue *q = bdev_get_queue(devlist[i]->bdev); - - ret |= bdi_congested(q->backing_dev_info, bits); - } - return ret; -} - /* * inform the user of the raid configuration */ @@ -495,7 +480,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO, &mddev->bio_set); bio_chain(split, bio); - generic_make_request(bio); + submit_bio_noacct(bio); bio = split; end = zone->zone_end; } else @@ -559,7 +544,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) trace_block_bio_remap(bdev_get_queue(rdev->bdev), discard_bio, disk_devt(mddev->gendisk), bio->bi_iter.bi_sector); - generic_make_request(discard_bio); + submit_bio_noacct(discard_bio); } bio_endio(bio); } @@ -600,7 +585,7 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) struct bio *split = bio_split(bio, sectors, GFP_NOIO, &mddev->bio_set); bio_chain(split, bio); - generic_make_request(bio); + submit_bio_noacct(bio); bio = split; } @@ -633,7 +618,7 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) disk_devt(mddev->gendisk), bio_sector); mddev_check_writesame(mddev, bio); mddev_check_write_zeroes(mddev, bio); - generic_make_request(bio); + submit_bio_noacct(bio); return true; } @@ -818,7 +803,6 @@ static struct md_personality raid0_personality= .size = raid0_size, .takeover = raid0_takeover, .quiesce = raid0_quiesce, - .congested = raid0_congested, }; static int __init 
raid0_init (void) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index dcd27f3da84e..960d854c07f8 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -786,36 +786,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect return best_disk; } -static int raid1_congested(struct mddev *mddev, int bits) -{ - struct r1conf *conf = mddev->private; - int i, ret = 0; - - if ((bits & (1 << WB_async_congested)) && - conf->pending_count >= max_queued_requests) - return 1; - - rcu_read_lock(); - for (i = 0; i < conf->raid_disks * 2; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct request_queue *q = bdev_get_queue(rdev->bdev); - - BUG_ON(!q); - - /* Note the '|| 1' - when read_balance prefers - * non-congested targets, it can be removed - */ - if ((bits & (1 << WB_async_congested)) || 1) - ret |= bdi_congested(q->backing_dev_info, bits); - else - ret &= bdi_congested(q->backing_dev_info, bits); - } - } - rcu_read_unlock(); - return ret; -} - static void flush_bio_list(struct r1conf *conf, struct bio *bio) { /* flush any pending bitmap writes to disk before proceeding w/ I/O */ @@ -834,7 +804,7 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio) /* Just ignore it */ bio_endio(bio); else - generic_make_request(bio); + submit_bio_noacct(bio); bio = next; cond_resched(); } @@ -1312,7 +1282,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, struct bio *split = bio_split(bio, max_sectors, gfp, &conf->bio_split); bio_chain(split, bio); - generic_make_request(bio); + submit_bio_noacct(bio); bio = split; r1_bio->master_bio = bio; r1_bio->sectors = max_sectors; @@ -1338,7 +1308,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, trace_block_bio_remap(read_bio->bi_disk->queue, read_bio, disk_devt(mddev->gendisk), r1_bio->sector); - generic_make_request(read_bio); + submit_bio_noacct(read_bio); } static void 
raid1_write_request(struct mddev *mddev, struct bio *bio, @@ -1483,7 +1453,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, struct bio *split = bio_split(bio, max_sectors, GFP_NOIO, &conf->bio_split); bio_chain(split, bio); - generic_make_request(bio); + submit_bio_noacct(bio); bio = split; r1_bio->master_bio = bio; r1_bio->sectors = max_sectors; @@ -2240,7 +2210,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) atomic_inc(&r1_bio->remaining); md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio)); - generic_make_request(wbio); + submit_bio_noacct(wbio); } put_sync_write_buf(r1_bio, 1); @@ -2926,7 +2896,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, md_sync_acct_bio(bio, nr_sectors); if (read_targets == 1) bio->bi_opf &= ~MD_FAILFAST; - generic_make_request(bio); + submit_bio_noacct(bio); } } } else { @@ -2935,7 +2905,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, md_sync_acct_bio(bio, nr_sectors); if (read_targets == 1) bio->bi_opf &= ~MD_FAILFAST; - generic_make_request(bio); + submit_bio_noacct(bio); } return nr_sectors; } @@ -3396,7 +3366,6 @@ static struct md_personality raid1_personality = .check_reshape = raid1_reshape, .quiesce = raid1_quiesce, .takeover = raid1_takeover, - .congested = raid1_congested, }; static int __init raid_init(void) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ec136e44aef7..e8fa32733917 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -848,31 +848,6 @@ static struct md_rdev *read_balance(struct r10conf *conf, return rdev; } -static int raid10_congested(struct mddev *mddev, int bits) -{ - struct r10conf *conf = mddev->private; - int i, ret = 0; - - if ((bits & (1 << WB_async_congested)) && - conf->pending_count >= max_queued_requests) - return 1; - - rcu_read_lock(); - for (i = 0; - (i < conf->geo.raid_disks || i < conf->prev.raid_disks) - && ret == 0; - i++) { - struct md_rdev 
*rdev = rcu_dereference(conf->mirrors[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct request_queue *q = bdev_get_queue(rdev->bdev); - - ret |= bdi_congested(q->backing_dev_info, bits); - } - } - rcu_read_unlock(); - return ret; -} - static void flush_pending_writes(struct r10conf *conf) { /* Any writes that have been queued but are awaiting @@ -917,7 +892,7 @@ static void flush_pending_writes(struct r10conf *conf) /* Just ignore it */ bio_endio(bio); else - generic_make_request(bio); + submit_bio_noacct(bio); bio = next; } blk_finish_plug(&plug); @@ -980,6 +955,7 @@ static void wait_barrier(struct r10conf *conf) { spin_lock_irq(&conf->resync_lock); if (conf->barrier) { + struct bio_list *bio_list = current->bio_list; conf->nr_waiting++; /* Wait for the barrier to drop. * However if there are already pending @@ -994,9 +970,16 @@ static void wait_barrier(struct r10conf *conf) wait_event_lock_irq(conf->wait_barrier, !conf->barrier || (atomic_read(&conf->nr_pending) && - current->bio_list && - (!bio_list_empty(¤t->bio_list[0]) || - !bio_list_empty(¤t->bio_list[1]))), + bio_list && + (!bio_list_empty(&bio_list[0]) || + !bio_list_empty(&bio_list[1]))) || + /* move on if recovery thread is + * blocked by us + */ + (conf->mddev->thread->tsk == current && + test_bit(MD_RECOVERY_RUNNING, + &conf->mddev->recovery) && + conf->nr_queued > 0), conf->resync_lock); conf->nr_waiting--; if (!conf->nr_waiting) @@ -1102,7 +1085,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) /* Just ignore it */ bio_endio(bio); else - generic_make_request(bio); + submit_bio_noacct(bio); bio = next; } kfree(plug); @@ -1194,7 +1177,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, gfp, &conf->bio_split); bio_chain(split, bio); allow_barrier(conf); - generic_make_request(bio); + submit_bio_noacct(bio); wait_barrier(conf); bio = split; r10_bio->master_bio = bio; @@ -1221,7 +1204,7 @@ static void raid10_read_request(struct mddev 
*mddev, struct bio *bio, trace_block_bio_remap(read_bio->bi_disk->queue, read_bio, disk_devt(mddev->gendisk), r10_bio->sector); - generic_make_request(read_bio); + submit_bio_noacct(read_bio); return; } @@ -1479,7 +1462,7 @@ retry_write: GFP_NOIO, &conf->bio_split); bio_chain(split, bio); allow_barrier(conf); - generic_make_request(bio); + submit_bio_noacct(bio); wait_barrier(conf); bio = split; r10_bio->master_bio = bio; @@ -2099,7 +2082,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) tbio->bi_opf |= MD_FAILFAST; tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; bio_set_dev(tbio, conf->mirrors[d].rdev->bdev); - generic_make_request(tbio); + submit_bio_noacct(tbio); } /* Now write out to any replacement devices @@ -2118,7 +2101,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) atomic_inc(&r10_bio->remaining); md_sync_acct(conf->mirrors[d].replacement->bdev, bio_sectors(tbio)); - generic_make_request(tbio); + submit_bio_noacct(tbio); } done: @@ -2241,7 +2224,7 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) wbio = r10_bio->devs[1].bio; wbio2 = r10_bio->devs[1].repl_bio; /* Need to test wbio2->bi_end_io before we call - * generic_make_request as if the former is NULL, + * submit_bio_noacct as if the former is NULL, * the latter is free to free wbio2. 
*/ if (wbio2 && !wbio2->bi_end_io) @@ -2249,13 +2232,13 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) if (wbio->bi_end_io) { atomic_inc(&conf->mirrors[d].rdev->nr_pending); md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); - generic_make_request(wbio); + submit_bio_noacct(wbio); } if (wbio2) { atomic_inc(&conf->mirrors[d].replacement->nr_pending); md_sync_acct(conf->mirrors[d].replacement->bdev, bio_sectors(wbio2)); - generic_make_request(wbio2); + submit_bio_noacct(wbio2); } } @@ -2889,7 +2872,7 @@ static void raid10_set_cluster_sync_high(struct r10conf *conf) * a number of r10_bio structures, one for each out-of-sync device. * As we setup these structures, we collect all bio's together into a list * which we then process collectively to add pages, and then process again - * to pass to generic_make_request. + * to pass to submit_bio_noacct. * * The r10_bio structures are linked using a borrowed master_bio pointer. * This link is counted in ->remaining. 
When the r10_bio that points to NULL @@ -3496,7 +3479,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (bio->bi_end_io == end_sync_read) { md_sync_acct_bio(bio, nr_sectors); bio->bi_status = 0; - generic_make_request(bio); + submit_bio_noacct(bio); } } @@ -4307,8 +4290,8 @@ out: else rdev->recovery_offset = 0; - if (sysfs_link_rdev(mddev, rdev)) - /* Failure here is OK */; + /* Failure here is OK */ + sysfs_link_rdev(mddev, rdev); } } else if (rdev->raid_disk >= conf->prev.raid_disks && !test_bit(Faulty, &rdev->flags)) { @@ -4454,7 +4437,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, sector_nr = conf->reshape_progress; if (sector_nr) { mddev->curr_resync_completed = sector_nr; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); *skipped = 1; return sector_nr; } @@ -4654,7 +4637,7 @@ read_more: md_sync_acct_bio(read_bio, r10_bio->sectors); atomic_inc(&r10_bio->remaining); read_bio->bi_next = NULL; - generic_make_request(read_bio); + submit_bio_noacct(read_bio); sectors_done += nr_sectors; if (sector_nr <= last) goto read_more; @@ -4717,7 +4700,7 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) md_sync_acct_bio(b, r10_bio->sectors); atomic_inc(&r10_bio->remaining); b->bi_next = NULL; - generic_make_request(b); + submit_bio_noacct(b); } end_reshape_request(r10_bio); } @@ -4929,7 +4912,6 @@ static struct md_personality raid10_personality = .start_reshape = raid10_start_reshape, .finish_reshape = raid10_finish_reshape, .update_reshape_pos = raid10_update_reshape_pos, - .congested = raid10_congested, }; static int __init raid_init(void) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 9b6da759dca2..4337ae0e6af2 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -195,9 +195,7 @@ struct r5l_log { static inline sector_t r5c_tree_index(struct r5conf *conf, sector_t sect) { - 
sector_t offset; - - offset = sector_div(sect, conf->chunk_sectors); + sector_div(sect, conf->chunk_sectors); return sect; } @@ -298,8 +296,8 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev) wbi = dev->written; dev->written = NULL; while (wbi && wbi->bi_iter.bi_sector < - dev->sector + STRIPE_SECTORS) { - wbi2 = r5_next_bio(wbi, dev->sector); + dev->sector + RAID5_STRIPE_SECTORS(conf)) { + wbi2 = r5_next_bio(conf, wbi, dev->sector); md_write_end(conf->mddev); bio_endio(wbi); wbi = wbi2; @@ -316,7 +314,7 @@ void r5c_handle_cached_data_endio(struct r5conf *conf, set_bit(R5_UPTODATE, &sh->dev[i].flags); r5c_return_dev_pending_writes(conf, &sh->dev[i]); md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, + RAID5_STRIPE_SECTORS(conf), !test_bit(STRIPE_DEGRADED, &sh->state), 0); } @@ -364,7 +362,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf) */ if (atomic_read(&conf->r5c_cached_full_stripes) >= min(R5C_FULL_STRIPE_FLUSH_BATCH(conf), - conf->chunk_sectors >> STRIPE_SHIFT)) + conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf))) r5l_wake_reclaim(conf->log, 0); } @@ -2430,10 +2428,15 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, struct mddev *mddev = log->rdev->mddev; struct r5conf *conf = mddev->private; struct stripe_head *sh, *next; + bool cleared_pending = false; if (ctx->data_only_stripes == 0) return; + if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { + cleared_pending = true; + clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); + } log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK; list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { @@ -2448,6 +2451,8 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, atomic_read(&conf->active_stripes) == 0); log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; + if (cleared_pending) + set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); } static int r5l_recovery_log(struct r5l_log *log) @@ -2532,13 +2537,10 @@ static 
ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) struct r5conf *conf; int ret; - ret = mddev_lock(mddev); - if (ret) - return ret; - + spin_lock(&mddev->lock); conf = mddev->private; if (!conf || !conf->log) { - mddev_unlock(mddev); + spin_unlock(&mddev->lock); return 0; } @@ -2558,7 +2560,7 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) default: ret = 0; } - mddev_unlock(mddev); + spin_unlock(&mddev->lock); return ret; } diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index a750f4bbb5d9..d0f540296fe9 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -324,7 +324,7 @@ static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh) * be just after the last logged stripe and write to the same * disks. Use bit shift and logarithm to avoid 64-bit division. */ - if ((sh->sector == sh_last->sector + STRIPE_SECTORS) && + if ((sh->sector == sh_last->sector + RAID5_STRIPE_SECTORS(conf)) && (data_sector >> ilog2(conf->chunk_sectors) == data_sector_last >> ilog2(conf->chunk_sectors)) && ((data_sector - data_sector_last) * data_disks == @@ -844,9 +844,9 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, /* if start and end is 4k aligned, use a 4k block */ if (block_size == 512 && - (r_sector_first & (STRIPE_SECTORS - 1)) == 0 && - (r_sector_last & (STRIPE_SECTORS - 1)) == 0) - block_size = STRIPE_SIZE; + (r_sector_first & (RAID5_STRIPE_SECTORS(conf) - 1)) == 0 && + (r_sector_last & (RAID5_STRIPE_SECTORS(conf) - 1)) == 0) + block_size = RAID5_STRIPE_SIZE(conf); /* iterate through blocks in strip */ for (i = 0; i < strip_sectors; i += (block_size >> 9)) { @@ -1274,7 +1274,8 @@ static int ppl_validate_rdev(struct md_rdev *rdev) ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9); if (ppl_data_sectors > 0) - ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS); + ppl_data_sectors = rounddown(ppl_data_sectors, + RAID5_STRIPE_SECTORS((struct r5conf 
*)rdev->mddev->private)); if (ppl_data_sectors <= 0) { pr_warn("md/raid:%s: PPL space too small on %s\n", diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 892aefe88fa7..ef0fd4830803 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -69,13 +69,13 @@ static struct workqueue_struct *raid5_wq; static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) { - int hash = (sect >> STRIPE_SHIFT) & HASH_MASK; + int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK; return &conf->stripe_hashtbl[hash]; } -static inline int stripe_hash_locks_hash(sector_t sect) +static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect) { - return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; + return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK; } static inline void lock_device_hash_lock(struct r5conf *conf, int hash) @@ -627,7 +627,7 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector, int previous, int noblock, int noquiesce) { struct stripe_head *sh; - int hash = stripe_hash_locks_hash(sector); + int hash = stripe_hash_locks_hash(conf, sector); int inc_empty_inactive_list_flag; pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); @@ -748,9 +748,9 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh tmp_sec = sh->sector; if (!sector_div(tmp_sec, conf->chunk_sectors)) return; - head_sector = sh->sector - STRIPE_SECTORS; + head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf); - hash = stripe_hash_locks_hash(head_sector); + hash = stripe_hash_locks_hash(conf, head_sector); spin_lock_irq(conf->hash_locks + hash); head = __find_stripe(conf, head_sector, conf->generation); if (head && !atomic_inc_not_zero(&head->count)) { @@ -873,7 +873,7 @@ static void dispatch_bio_list(struct bio_list *tmp) struct bio *bio; while ((bio = bio_list_pop(tmp))) - generic_make_request(bio); + submit_bio_noacct(bio); } static int cmp_stripe(void *priv, struct list_head *a, struct 
list_head *b) @@ -1057,7 +1057,7 @@ again: test_bit(WriteErrorSeen, &rdev->flags)) { sector_t first_bad; int bad_sectors; - int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, + int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), &first_bad, &bad_sectors); if (!bad) break; @@ -1089,7 +1089,7 @@ again: if (rdev) { if (s->syncing || s->expanding || s->expanded || s->replacing) - md_sync_acct(rdev->bdev, STRIPE_SECTORS); + md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf)); set_bit(STRIPE_IO_STARTED, &sh->state); @@ -1129,9 +1129,9 @@ again: else sh->dev[i].vec.bv_page = sh->dev[i].page; bi->bi_vcnt = 1; - bi->bi_io_vec[0].bv_len = STRIPE_SIZE; + bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf); bi->bi_io_vec[0].bv_offset = 0; - bi->bi_iter.bi_size = STRIPE_SIZE; + bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf); bi->bi_write_hint = sh->dev[i].write_hint; if (!rrdev) sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET; @@ -1151,12 +1151,12 @@ again: if (should_defer && op_is_write(op)) bio_list_add(&pending_bios, bi); else - generic_make_request(bi); + submit_bio_noacct(bi); } if (rrdev) { if (s->syncing || s->expanding || s->expanded || s->replacing) - md_sync_acct(rrdev->bdev, STRIPE_SECTORS); + md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf)); set_bit(STRIPE_IO_STARTED, &sh->state); @@ -1183,9 +1183,9 @@ again: WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); sh->dev[i].rvec.bv_page = sh->dev[i].page; rbi->bi_vcnt = 1; - rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; + rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf); rbi->bi_io_vec[0].bv_offset = 0; - rbi->bi_iter.bi_size = STRIPE_SIZE; + rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf); rbi->bi_write_hint = sh->dev[i].write_hint; sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET; /* @@ -1201,7 +1201,7 @@ again: if (should_defer && op_is_write(op)) bio_list_add(&pending_bios, rbi); else - generic_make_request(rbi); + submit_bio_noacct(rbi); } if (!rdev && !rrdev) { if (op_is_write(op)) @@ -1235,6 
+1235,7 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, int page_offset; struct async_submit_ctl submit; enum async_tx_flags flags = 0; + struct r5conf *conf = sh->raid_conf; if (bio->bi_iter.bi_sector >= sector) page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; @@ -1256,8 +1257,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, len -= b_offset; } - if (len > 0 && page_offset + len > STRIPE_SIZE) - clen = STRIPE_SIZE - page_offset; + if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf)) + clen = RAID5_STRIPE_SIZE(conf) - page_offset; else clen = len; @@ -1265,9 +1266,9 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, b_offset += bvl.bv_offset; bio_page = bvl.bv_page; if (frombio) { - if (sh->raid_conf->skip_copy && + if (conf->skip_copy && b_offset == 0 && page_offset == 0 && - clen == STRIPE_SIZE && + clen == RAID5_STRIPE_SIZE(conf) && !no_skipcopy) *page = bio_page; else @@ -1292,6 +1293,7 @@ static void ops_complete_biofill(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; int i; + struct r5conf *conf = sh->raid_conf; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -1312,8 +1314,8 @@ static void ops_complete_biofill(void *stripe_head_ref) rbi = dev->read; dev->read = NULL; while (rbi && rbi->bi_iter.bi_sector < - dev->sector + STRIPE_SECTORS) { - rbi2 = r5_next_bio(rbi, dev->sector); + dev->sector + RAID5_STRIPE_SECTORS(conf)) { + rbi2 = r5_next_bio(conf, rbi, dev->sector); bio_endio(rbi); rbi = rbi2; } @@ -1330,6 +1332,7 @@ static void ops_run_biofill(struct stripe_head *sh) struct dma_async_tx_descriptor *tx = NULL; struct async_submit_ctl submit; int i; + struct r5conf *conf = sh->raid_conf; BUG_ON(sh->batch_head); pr_debug("%s: stripe %llu\n", __func__, @@ -1344,10 +1347,10 @@ static void ops_run_biofill(struct stripe_head *sh) dev->toread = NULL; spin_unlock_irq(&sh->stripe_lock); while (rbi && rbi->bi_iter.bi_sector < - dev->sector + 
STRIPE_SECTORS) { + dev->sector + RAID5_STRIPE_SECTORS(conf)) { tx = async_copy_data(0, rbi, &dev->page, dev->sector, tx, sh, 0); - rbi = r5_next_bio(rbi, dev->sector); + rbi = r5_next_bio(conf, rbi, dev->sector); } } } @@ -1429,9 +1432,11 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); if (unlikely(count == 1)) - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); + tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); else - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); + tx = async_xor(xor_dest, xor_srcs, 0, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); return tx; } @@ -1522,7 +1527,8 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); - tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + tx = async_gen_syndrome(blocks, 0, count+2, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); } else { /* Compute any data- or p-drive using XOR */ count = 0; @@ -1535,7 +1541,8 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); - tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); + tx = async_xor(dest, blocks, 0, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); } return tx; @@ -1598,7 +1605,8 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); return async_gen_syndrome(blocks, 0, syndrome_disks+2, - STRIPE_SIZE, &submit); + RAID5_STRIPE_SIZE(sh->raid_conf), + &submit); } else { struct page *dest; int data_target; @@ -1621,7 +1629,8 @@ ops_run_compute6_2(struct 
stripe_head *sh, struct raid5_percpu *percpu) ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, NULL, NULL, to_addr_conv(sh, percpu, 0)); - tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, + tx = async_xor(dest, blocks, 0, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); @@ -1629,7 +1638,8 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); return async_gen_syndrome(blocks, 0, count+2, - STRIPE_SIZE, &submit); + RAID5_STRIPE_SIZE(sh->raid_conf), + &submit); } } else { init_async_submit(&submit, ASYNC_TX_FENCE, NULL, @@ -1638,13 +1648,15 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) if (failb == syndrome_disks) { /* We're missing D+P. */ return async_raid6_datap_recov(syndrome_disks+2, - STRIPE_SIZE, faila, - blocks, &submit); + RAID5_STRIPE_SIZE(sh->raid_conf), + faila, + blocks, &submit); } else { /* We're missing D+D. */ return async_raid6_2data_recov(syndrome_disks+2, - STRIPE_SIZE, faila, failb, - blocks, &submit); + RAID5_STRIPE_SIZE(sh->raid_conf), + faila, failb, + blocks, &submit); } } } @@ -1691,7 +1703,8 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); + tx = async_xor(xor_dest, xor_srcs, 0, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); return tx; } @@ -1711,7 +1724,8 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); - tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + tx = async_gen_syndrome(blocks, 0, count+2, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); return tx; } @@ -1752,7 +1766,7 @@ again: 
WARN_ON(dev->page != dev->orig_page); while (wbi && wbi->bi_iter.bi_sector < - dev->sector + STRIPE_SECTORS) { + dev->sector + RAID5_STRIPE_SECTORS(conf)) { if (wbi->bi_opf & REQ_FUA) set_bit(R5_WantFUA, &dev->flags); if (wbi->bi_opf & REQ_SYNC) @@ -1770,7 +1784,7 @@ again: clear_bit(R5_OVERWRITE, &dev->flags); } } - wbi = r5_next_bio(wbi, dev->sector); + wbi = r5_next_bio(conf, wbi, dev->sector); } if (head_sh->batch_head) { @@ -1910,9 +1924,11 @@ again: } if (unlikely(count == 1)) - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); + tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); else - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); + tx = async_xor(xor_dest, xor_srcs, 0, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); if (!last_stripe) { j++; sh = list_first_entry(&sh->batch_list, struct stripe_head, @@ -1972,7 +1988,8 @@ again: } else init_async_submit(&submit, 0, tx, NULL, NULL, to_addr_conv(sh, percpu, j)); - tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + tx = async_gen_syndrome(blocks, 0, count+2, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); if (!last_stripe) { j++; sh = list_first_entry(&sh->batch_list, struct stripe_head, @@ -2020,7 +2037,8 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) init_async_submit(&submit, 0, NULL, NULL, NULL, to_addr_conv(sh, percpu, 0)); - tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, + tx = async_xor_val(xor_dest, xor_srcs, 0, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &sh->ops.zero_sum_result, &submit); atomic_inc(&sh->count); @@ -2045,7 +2063,8 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu atomic_inc(&sh->count); init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, sh, to_addr_conv(sh, percpu, 0)); - async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, + async_syndrome_val(srcs, 0, count+2, + 
RAID5_STRIPE_SIZE(sh->raid_conf), &sh->ops.zero_sum_result, percpu->spare_page, &submit); } @@ -2217,9 +2236,9 @@ static int grow_stripes(struct r5conf *conf, int num) /** * scribble_alloc - allocate percpu scribble buffer for required size * of the scribble region - * @percpu - from for_each_present_cpu() of the caller - * @num - total number of disks in the array - * @cnt - scribble objs count for required size of the scribble region + * @percpu: from for_each_present_cpu() of the caller + * @num: total number of disks in the array + * @cnt: scribble objs count for required size of the scribble region * * The scribble buffer size must be enough to contain: * 1/ a struct page pointer for each device in the array +2 @@ -2275,7 +2294,7 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) percpu = per_cpu_ptr(conf->percpu, cpu); err = scribble_alloc(percpu, new_disks, - new_sectors / STRIPE_SECTORS); + new_sectors / RAID5_STRIPE_SECTORS(conf)); if (err) break; } @@ -2509,10 +2528,10 @@ static void raid5_end_read_request(struct bio * bi) */ pr_info_ratelimited( "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n", - mdname(conf->mddev), STRIPE_SECTORS, + mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf), (unsigned long long)s, bdevname(rdev->bdev, b)); - atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); + atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors); clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) @@ -2585,7 +2604,7 @@ static void raid5_end_read_request(struct bio * bi) if (!(set_bad && test_bit(In_sync, &rdev->flags) && rdev_set_badblocks( - rdev, sh->sector, STRIPE_SECTORS, 0))) + rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0))) md_error(conf->mddev, rdev); } } @@ -2601,7 +2620,7 @@ static void raid5_end_write_request(struct bio *bi) struct stripe_head *sh = bi->bi_private; struct r5conf *conf = sh->raid_conf; 
int disks = sh->disks, i; - struct md_rdev *uninitialized_var(rdev); + struct md_rdev *rdev; sector_t first_bad; int bad_sectors; int replacement = 0; @@ -2637,7 +2656,7 @@ static void raid5_end_write_request(struct bio *bi) if (bi->bi_status) md_error(conf->mddev, rdev); else if (is_badblock(rdev, sh->sector, - STRIPE_SECTORS, + RAID5_STRIPE_SECTORS(conf), &first_bad, &bad_sectors)) set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); } else { @@ -2649,7 +2668,7 @@ static void raid5_end_write_request(struct bio *bi) set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); } else if (is_badblock(rdev, sh->sector, - STRIPE_SECTORS, + RAID5_STRIPE_SECTORS(conf), &first_bad, &bad_sectors)) { set_bit(R5_MadeGood, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) @@ -3283,13 +3302,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, /* check if page is covered */ sector_t sector = sh->dev[dd_idx].sector; for (bi=sh->dev[dd_idx].towrite; - sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && + sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) && bi && bi->bi_iter.bi_sector <= sector; - bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { + bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) { if (bio_end_sector(bi) >= sector) sector = bio_end_sector(bi); } - if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) + if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf)) if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags)) sh->overwrite_disks++; } @@ -3314,7 +3333,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, set_bit(STRIPE_BITMAP_PENDING, &sh->state); spin_unlock_irq(&sh->stripe_lock); md_bitmap_startwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0); + RAID5_STRIPE_SECTORS(conf), 0); spin_lock_irq(&sh->stripe_lock); clear_bit(STRIPE_BITMAP_PENDING, &sh->state); if (!sh->batch_head) { @@ -3376,7 +3395,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head 
*sh, if (!rdev_set_badblocks( rdev, sh->sector, - STRIPE_SECTORS, 0)) + RAID5_STRIPE_SECTORS(conf), 0)) md_error(conf->mddev, rdev); rdev_dec_pending(rdev, conf->mddev); } @@ -3396,8 +3415,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, wake_up(&conf->wait_for_overlap); while (bi && bi->bi_iter.bi_sector < - sh->dev[i].sector + STRIPE_SECTORS) { - struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); + sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { + struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector); md_write_end(conf->mddev); bio_io_error(bi); @@ -3405,7 +3424,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, } if (bitmap_end) md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0, 0); + RAID5_STRIPE_SECTORS(conf), 0, 0); bitmap_end = 0; /* and fail all 'written' */ bi = sh->dev[i].written; @@ -3417,8 +3436,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (bi) bitmap_end = 1; while (bi && bi->bi_iter.bi_sector < - sh->dev[i].sector + STRIPE_SECTORS) { - struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); + sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { + struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector); md_write_end(conf->mddev); bio_io_error(bi); @@ -3441,9 +3460,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (bi) s->to_read--; while (bi && bi->bi_iter.bi_sector < - sh->dev[i].sector + STRIPE_SECTORS) { + sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { struct bio *nextbi = - r5_next_bio(bi, sh->dev[i].sector); + r5_next_bio(conf, bi, sh->dev[i].sector); bio_io_error(bi); bi = nextbi; @@ -3451,7 +3470,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, } if (bitmap_end) md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0, 0); + RAID5_STRIPE_SECTORS(conf), 0, 0); /* If we were in the middle of a write the parity block might * still be locked - so just clear all R5_LOCKED flags */ 
@@ -3496,14 +3515,14 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && !rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) + RAID5_STRIPE_SECTORS(conf), 0)) abort = 1; rdev = rcu_dereference(conf->disks[i].replacement); if (rdev && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && !rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) + RAID5_STRIPE_SECTORS(conf), 0)) abort = 1; } rcu_read_unlock(); @@ -3511,7 +3530,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, conf->recovery_disabled = conf->mddev->recovery_disabled; } - md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); + md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort); } static int want_replace(struct stripe_head *sh, int disk_idx) @@ -3538,6 +3557,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], &sh->dev[s->failed_num[1]] }; int i; + bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW); if (test_bit(R5_LOCKED, &dev->flags) || @@ -3596,17 +3616,27 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, * devices must be read. */ return 1; + + if (s->failed >= 2 && + (fdev[i]->towrite || + s->failed_num[i] == sh->pd_idx || + s->failed_num[i] == sh->qd_idx) && + !test_bit(R5_UPTODATE, &fdev[i]->flags)) + /* In max degraded raid6, If the failed disk is P, Q, + * or we want to read the failed disk, we need to do + * reconstruct-write. + */ + force_rcw = true; } - /* If we are forced to do a reconstruct-write, either because - * the current RAID6 implementation only supports that, or - * because parity cannot be trusted and we are currently - * recovering it, there is extra need to be careful. 
+ /* If we are forced to do a reconstruct-write, because parity + * cannot be trusted and we are currently recovering it, there + * is extra need to be careful. * If one of the devices that we would need to read, because * it is not being overwritten (and maybe not written at all) * is missing/faulty, then we need to read everything we can. */ - if (sh->raid_conf->level != 6 && + if (!force_rcw && sh->sector < sh->raid_conf->mddev->recovery_cp) /* reconstruct-write isn't being forced */ return 0; @@ -3710,7 +3740,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, return 0; } -/** +/* * handle_stripe_fill - read or compute data to satisfy pending requests. */ static void handle_stripe_fill(struct stripe_head *sh, @@ -3785,14 +3815,14 @@ returnbi: wbi = dev->written; dev->written = NULL; while (wbi && wbi->bi_iter.bi_sector < - dev->sector + STRIPE_SECTORS) { - wbi2 = r5_next_bio(wbi, dev->sector); + dev->sector + RAID5_STRIPE_SECTORS(conf)) { + wbi2 = r5_next_bio(conf, wbi, dev->sector); md_write_end(conf->mddev); bio_endio(wbi); wbi = wbi2; } md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, + RAID5_STRIPE_SECTORS(conf), !test_bit(STRIPE_DEGRADED, &sh->state), 0); if (head_sh->batch_head) { @@ -3976,10 +4006,8 @@ static int handle_stripe_dirtying(struct r5conf *conf, set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); s->locked++; - } else { + } else set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - } } } } @@ -4004,10 +4032,8 @@ static int handle_stripe_dirtying(struct r5conf *conf, set_bit(R5_Wantread, &dev->flags); s->locked++; qread++; - } else { + } else set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - } } } if (rcw && conf->mddev->queue) @@ -4099,7 +4125,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, */ set_bit(STRIPE_INSYNC, &sh->state); else { - atomic64_add(STRIPE_SECTORS, 
&conf->mddev->resync_mismatches); + atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); @@ -4107,7 +4133,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, "%llu-%llu\n", mdname(conf->mddev), (unsigned long long) sh->sector, (unsigned long long) sh->sector + - STRIPE_SECTORS); + RAID5_STRIPE_SECTORS(conf)); } else { sh->check_state = check_state_compute_run; set_bit(STRIPE_COMPUTE_RUN, &sh->state); @@ -4264,7 +4290,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, */ } } else { - atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); + atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); @@ -4272,7 +4298,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, "%llu-%llu\n", mdname(conf->mddev), (unsigned long long) sh->sector, (unsigned long long) sh->sector + - STRIPE_SECTORS); + RAID5_STRIPE_SECTORS(conf)); } else { int *target = &sh->ops.target; @@ -4343,7 +4369,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) /* place all the copies on one channel */ init_async_submit(&submit, 0, tx, NULL, NULL, NULL); tx = async_memcpy(sh2->dev[dd_idx].page, - sh->dev[i].page, 0, 0, STRIPE_SIZE, + sh->dev[i].page, 0, 0, RAID5_STRIPE_SIZE(conf), &submit); set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); @@ -4442,8 +4468,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) */ rdev = rcu_dereference(conf->disks[i].replacement); if (rdev && !test_bit(Faulty, &rdev->flags) && - rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && - !is_badblock(rdev, sh->sector, STRIPE_SECTORS, + rdev->recovery_offset >= sh->sector + 
RAID5_STRIPE_SECTORS(conf) && + !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), &first_bad, &bad_sectors)) set_bit(R5_ReadRepl, &dev->flags); else { @@ -4457,7 +4483,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (rdev && test_bit(Faulty, &rdev->flags)) rdev = NULL; if (rdev) { - is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, + is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), &first_bad, &bad_sectors); if (s->blocked_rdev == NULL && (test_bit(Blocked, &rdev->flags) @@ -4484,7 +4510,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) } } else if (test_bit(In_sync, &rdev->flags)) set_bit(R5_Insync, &dev->flags); - else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) + else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset) /* in sync if before recovery_offset */ set_bit(R5_Insync, &dev->flags); else if (test_bit(R5_UPTODATE, &dev->flags) && @@ -4573,12 +4599,12 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) rcu_read_unlock(); } +/* + * Return '1' if this is a member of batch, or '0' if it is a lone stripe or + * a head which can now be handled. + */ static int clear_batch_ready(struct stripe_head *sh) { - /* Return '1' if this is a member of batch, or - * '0' if it is a lone stripe or a head which can now be - * handled. - */ struct stripe_head *tmp; if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) return (sh->batch_head && sh->batch_head != sh); @@ -4682,6 +4708,16 @@ static void handle_stripe(struct stripe_head *sh) struct r5dev *pdev, *qdev; clear_bit(STRIPE_HANDLE, &sh->state); + + /* + * handle_stripe should not continue handle the batched stripe, only + * the head of batch list or lone stripe can continue. Otherwise we + * could see break_stripe_batch_list warns about the STRIPE_ACTIVE + * is set for the batched stripe. 
+ */ + if (clear_batch_ready(sh)) + return; + if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { /* already being handled, ensure it gets handled * again when current action finishes */ @@ -4689,11 +4725,6 @@ static void handle_stripe(struct stripe_head *sh) return; } - if (clear_batch_ready(sh) ) { - clear_bit_unlock(STRIPE_ACTIVE, &sh->state); - return; - } - if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) break_stripe_batch_list(sh, 0); @@ -4842,7 +4873,7 @@ static void handle_stripe(struct stripe_head *sh) * or to load a block that is being partially written. */ if (s.to_read || s.non_overwrite - || (conf->level == 6 && s.to_write && s.failed) + || (s.to_write && s.failed) || (s.syncing && (s.uptodate + s.compute < disks)) || s.replacing || s.expanding) @@ -4927,7 +4958,7 @@ static void handle_stripe(struct stripe_head *sh) if ((s.syncing || s.replacing) && s.locked == 0 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && test_bit(STRIPE_INSYNC, &sh->state)) { - md_done_sync(conf->mddev, STRIPE_SECTORS, 1); + md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); clear_bit(STRIPE_SYNCING, &sh->state); if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) wake_up(&conf->wait_for_overlap); @@ -4946,14 +4977,11 @@ static void handle_stripe(struct stripe_head *sh) if (!test_bit(R5_ReWrite, &dev->flags)) { set_bit(R5_Wantwrite, &dev->flags); set_bit(R5_ReWrite, &dev->flags); - set_bit(R5_LOCKED, &dev->flags); - s.locked++; - } else { + } else /* let's read it back */ set_bit(R5_Wantread, &dev->flags); - set_bit(R5_LOCKED, &dev->flags); - s.locked++; - } + set_bit(R5_LOCKED, &dev->flags); + s.locked++; } } @@ -4995,7 +5023,7 @@ static void handle_stripe(struct stripe_head *sh) clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); - md_done_sync(conf->mddev, STRIPE_SECTORS, 1); + md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); } if (s.expanding && s.locked == 0 && @@ -5025,14 
+5053,14 @@ finish: /* We own a safe reference to the rdev */ rdev = conf->disks[i].rdev; if (!rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) + RAID5_STRIPE_SECTORS(conf), 0)) md_error(conf->mddev, rdev); rdev_dec_pending(rdev, conf->mddev); } if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { rdev = conf->disks[i].rdev; rdev_clear_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0); + RAID5_STRIPE_SECTORS(conf), 0); rdev_dec_pending(rdev, conf->mddev); } if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { @@ -5041,7 +5069,7 @@ finish: /* rdev have been moved down */ rdev = conf->disks[i].rdev; rdev_clear_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0); + RAID5_STRIPE_SECTORS(conf), 0); rdev_dec_pending(rdev, conf->mddev); } } @@ -5099,28 +5127,6 @@ static void activate_bit_delay(struct r5conf *conf, } } -static int raid5_congested(struct mddev *mddev, int bits) -{ - struct r5conf *conf = mddev->private; - - /* No difference between reads and writes. Just check - * how busy the stripe_cache is - */ - - if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) - return 1; - - /* Also checks whether there is pressure on r5cache log space */ - if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) - return 1; - if (conf->quiesce) - return 1; - if (atomic_read(&conf->empty_inactive_list_nr)) - return 1; - - return 0; -} - static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) { struct r5conf *conf = mddev->private; @@ -5289,7 +5295,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) trace_block_bio_remap(align_bi->bi_disk->queue, align_bi, disk_devt(mddev->gendisk), raid_bio->bi_iter.bi_sector); - generic_make_request(align_bi); + submit_bio_noacct(align_bi); return 1; } else { rcu_read_unlock(); @@ -5309,7 +5315,7 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) struct r5conf *conf = mddev->private; split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split); bio_chain(split, 
raid_bio); - generic_make_request(raid_bio); + submit_bio_noacct(raid_bio); raid_bio = split; } @@ -5505,7 +5511,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) /* Skip discard while reshape is happening */ return; - logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); + logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); last_sector = bio_end_sector(bi); bi->bi_next = NULL; @@ -5520,7 +5526,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) last_sector *= conf->chunk_sectors; for (; logical_sector < last_sector; - logical_sector += STRIPE_SECTORS) { + logical_sector += RAID5_STRIPE_SECTORS(conf)) { DEFINE_WAIT(w); int d; again: @@ -5565,7 +5571,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) d++) md_bitmap_startwrite(mddev->bitmap, sh->sector, - STRIPE_SECTORS, + RAID5_STRIPE_SECTORS(conf), 0); sh->bm_seq = conf->seq_flush + 1; set_bit(STRIPE_BIT_DELAY, &sh->state); @@ -5630,12 +5636,12 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) return true; } - logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); + logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); last_sector = bio_end_sector(bi); bi->bi_next = NULL; prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { + for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) { int previous; int seq; @@ -5733,8 +5739,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) do_flush = false; } - if (!sh->batch_head || sh == sh->batch_head) - set_bit(STRIPE_HANDLE, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); if ((!sh->batch_head || sh == sh->batch_head) && (bi->bi_opf & REQ_SYNC) && @@ -5799,7 +5804,7 @@ static sector_t reshape_request(struct mddev 
*mddev, sector_t sector_nr, int *sk sector_div(sector_nr, new_data_disks); if (sector_nr) { mddev->curr_resync_completed = sector_nr; - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); *skipped = 1; retn = sector_nr; goto finish; @@ -5913,11 +5918,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); } INIT_LIST_HEAD(&stripes); - for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { + for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) { int j; int skipped_disk = 0; sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1); @@ -5938,7 +5943,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk skipped_disk = 1; continue; } - memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); + memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf)); set_bit(R5_Expanded, &sh->dev[j].flags); set_bit(R5_UPTODATE, &sh->dev[j].flags); } @@ -5973,7 +5978,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk set_bit(STRIPE_EXPAND_SOURCE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); raid5_release_stripe(sh); - first_sector += STRIPE_SECTORS; + first_sector += RAID5_STRIPE_SECTORS(conf); } /* Now that the sources are clearly marked, we can release * the destination stripes @@ -6020,7 +6025,7 @@ finish: conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); - sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(mddev->sysfs_completed); } ret: return retn; @@ -6079,11 +6084,12 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n if (!test_bit(MD_RECOVERY_REQUESTED, 
&mddev->recovery) && !conf->fullsync && !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && - sync_blocks >= STRIPE_SECTORS) { + sync_blocks >= RAID5_STRIPE_SECTORS(conf)) { /* we can skip this block, and probably more */ - sync_blocks /= STRIPE_SECTORS; + do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf)); *skipped = 1; - return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ + /* keep things rounded to whole stripes */ + return sync_blocks * RAID5_STRIPE_SECTORS(conf); } md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); @@ -6116,7 +6122,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n raid5_release_stripe(sh); - return STRIPE_SECTORS; + return RAID5_STRIPE_SECTORS(conf); } static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, @@ -6139,14 +6145,14 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, int handled = 0; logical_sector = raid_bio->bi_iter.bi_sector & - ~((sector_t)STRIPE_SECTORS-1); + ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); sector = raid5_compute_sector(conf, logical_sector, 0, &dd_idx, NULL); last_sector = bio_end_sector(raid_bio); for (; logical_sector < last_sector; - logical_sector += STRIPE_SECTORS, - sector += STRIPE_SECTORS, + logical_sector += RAID5_STRIPE_SECTORS(conf), + sector += RAID5_STRIPE_SECTORS(conf), scnt++) { if (scnt < offset) @@ -6479,6 +6485,77 @@ raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR, raid5_show_rmw_level, raid5_store_rmw_level); +static ssize_t +raid5_show_stripe_size(struct mddev *mddev, char *page) +{ + struct r5conf *conf; + int ret = 0; + + spin_lock(&mddev->lock); + conf = mddev->private; + if (conf) + ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf)); + spin_unlock(&mddev->lock); + return ret; +} + +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE +static ssize_t +raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf; + unsigned long new; + int 
err; + + if (len >= PAGE_SIZE) + return -EINVAL; + if (kstrtoul(page, 10, &new)) + return -EINVAL; + + /* + * The value should not be bigger than PAGE_SIZE. It requires to + * be multiple of DEFAULT_STRIPE_SIZE. + */ + if (new % DEFAULT_STRIPE_SIZE != 0 || new > PAGE_SIZE || new == 0) + return -EINVAL; + + err = mddev_lock(mddev); + if (err) + return err; + + conf = mddev->private; + if (!conf) { + err = -ENODEV; + goto out_unlock; + } + + if (new == conf->stripe_size) + goto out_unlock; + + pr_debug("md/raid: change stripe_size from %lu to %lu\n", + conf->stripe_size, new); + + mddev_suspend(mddev); + conf->stripe_size = new; + conf->stripe_shift = ilog2(new) - 9; + conf->stripe_sectors = new >> 9; + mddev_resume(mddev); + +out_unlock: + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry +raid5_stripe_size = __ATTR(stripe_size, 0644, + raid5_show_stripe_size, + raid5_store_stripe_size); +#else +static struct md_sysfs_entry +raid5_stripe_size = __ATTR(stripe_size, 0444, + raid5_show_stripe_size, + NULL); +#endif static ssize_t raid5_show_preread_threshold(struct mddev *mddev, char *page) @@ -6667,6 +6744,7 @@ static struct attribute *raid5_attrs[] = { &raid5_group_thread_cnt.attr, &raid5_skip_copy.attr, &raid5_rmw_level.attr, + &raid5_stripe_size.attr, &r5c_journal_mode.attr, &ppl_write_hint.attr, NULL, @@ -6766,7 +6844,7 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu conf->previous_raid_disks), max(conf->chunk_sectors, conf->prev_chunk_sectors) - / STRIPE_SECTORS)) { + / RAID5_STRIPE_SECTORS(conf))) { free_scratch_buffer(conf, percpu); return -ENOMEM; } @@ -6918,6 +6996,12 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); if (conf == NULL) goto abort; + +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + conf->stripe_size = DEFAULT_STRIPE_SIZE; + conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9; + conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9; +#endif 
INIT_LIST_HEAD(&conf->free_list); INIT_LIST_HEAD(&conf->pending_list); conf->pending_data = kcalloc(PENDING_IO_MAX, @@ -7069,8 +7153,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf->min_nr_stripes = NR_STRIPES; if (mddev->reshape_position != MaxSector) { int stripes = max_t(int, - ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4, - ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); + ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4, + ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4); conf->min_nr_stripes = max(NR_STRIPES, stripes); if (conf->min_nr_stripes != NR_STRIPES) pr_info("md/raid:%s: force stripe size %d for reshape\n", @@ -7801,14 +7885,14 @@ static int check_stripe_cache(struct mddev *mddev) * stripe_heads first. */ struct r5conf *conf = mddev->private; - if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 + if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4 > conf->min_nr_stripes || - ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 + ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4 > conf->min_nr_stripes) { pr_warn("md/raid:%s: reshape: not enough stripes. 
Needed %lu\n", mdname(mddev), ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) - / STRIPE_SIZE)*4); + / RAID5_STRIPE_SIZE(conf))*4); return 0; } return 1; @@ -7944,8 +8028,8 @@ static int raid5_start_reshape(struct mddev *mddev) else rdev->recovery_offset = 0; - if (sysfs_link_rdev(mddev, rdev)) - /* Failure here is OK */; + /* Failure here is OK */ + sysfs_link_rdev(mddev, rdev); } } else if (rdev->raid_disk >= conf->previous_raid_disks && !test_bit(Faulty, &rdev->flags)) { @@ -8140,7 +8224,7 @@ static void *raid5_takeover_raid1(struct mddev *mddev) while (chunksect && (mddev->array_sectors & (chunksect-1))) chunksect >>= 1; - if ((chunksect<<9) < STRIPE_SIZE) + if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private)) /* array size does not allow a suitable chunk size */ return ERR_PTR(-EINVAL); @@ -8427,7 +8511,6 @@ static struct md_personality raid6_personality = .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid6_takeover, - .congested = raid5_congested, .change_consistency_policy = raid5_change_consistency_policy, }; static struct md_personality raid5_personality = @@ -8452,7 +8535,6 @@ static struct md_personality raid5_personality = .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid5_takeover, - .congested = raid5_congested, .change_consistency_policy = raid5_change_consistency_policy, }; @@ -8478,7 +8560,6 @@ static struct md_personality raid4_personality = .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid4_takeover, - .congested = raid5_congested, .change_consistency_policy = raid5_change_consistency_policy, }; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index a2c9e9e9f5ac..16fc29472f5c 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -472,32 +472,20 @@ struct disk_info { */ #define NR_STRIPES 256 +#define DEFAULT_STRIPE_SIZE 4096 + +#if PAGE_SIZE == DEFAULT_STRIPE_SIZE #define STRIPE_SIZE PAGE_SIZE #define 
STRIPE_SHIFT (PAGE_SHIFT - 9) #define STRIPE_SECTORS (STRIPE_SIZE>>9) +#endif + #define IO_THRESHOLD 1 #define BYPASS_THRESHOLD 1 #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) #define HASH_MASK (NR_HASH - 1) #define MAX_STRIPE_BATCH 8 -/* bio's attached to a stripe+device for I/O are linked together in bi_sector - * order without overlap. There may be several bio's per stripe+device, and - * a bio could span several devices. - * When walking this list for a particular stripe+device, we must never proceed - * beyond a bio that extends past this device, as the next bio might no longer - * be valid. - * This function is used to determine the 'next' bio in the list, given the - * sector of the current stripe+device - */ -static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) -{ - if (bio_end_sector(bio) < sector + STRIPE_SECTORS) - return bio->bi_next; - else - return NULL; -} - /* NOTE NR_STRIPE_HASH_LOCKS must remain below 64. * This is because we sometimes take all the spinlocks * and creating that much locking depth can cause @@ -574,6 +562,11 @@ struct r5conf { int raid_disks; int max_nr_stripes; int min_nr_stripes; +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + unsigned long stripe_size; + unsigned int stripe_shift; + unsigned long stripe_sectors; +#endif /* reshape_progress is the leading edge of a 'reshape' * It has value MaxSector when no reshape is happening @@ -690,6 +683,32 @@ struct r5conf { struct r5pending_data *next_pending_data; }; +#if PAGE_SIZE == DEFAULT_STRIPE_SIZE +#define RAID5_STRIPE_SIZE(conf) STRIPE_SIZE +#define RAID5_STRIPE_SHIFT(conf) STRIPE_SHIFT +#define RAID5_STRIPE_SECTORS(conf) STRIPE_SECTORS +#else +#define RAID5_STRIPE_SIZE(conf) ((conf)->stripe_size) +#define RAID5_STRIPE_SHIFT(conf) ((conf)->stripe_shift) +#define RAID5_STRIPE_SECTORS(conf) ((conf)->stripe_sectors) +#endif + +/* bio's attached to a stripe+device for I/O are linked together in bi_sector + * order without overlap. 
There may be several bio's per stripe+device, and + * a bio could span several devices. + * When walking this list for a particular stripe+device, we must never proceed + * beyond a bio that extends past this device, as the next bio might no longer + * be valid. + * This function is used to determine the 'next' bio in the list, given the + * sector of the current stripe+device + */ +static inline struct bio *r5_next_bio(struct r5conf *conf, struct bio *bio, sector_t sector) +{ + if (bio_end_sector(bio) < sector + RAID5_STRIPE_SECTORS(conf)) + return bio->bi_next; + else + return NULL; +} /* * Our supported algorithms |