diff options
Diffstat (limited to 'drivers/md')
68 files changed, 5805 insertions, 2655 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 02a5345a44a6..b7767da50c26 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -240,9 +240,17 @@ config DM_BUFIO as a cache, holding recently-read blocks in memory and performing delayed writes. +config DM_DEBUG_BLOCK_MANAGER_LOCKING + bool "Block manager locking" + depends on DM_BUFIO + ---help--- + Block manager locking can catch various metadata corruption issues. + + If unsure, say N. + config DM_DEBUG_BLOCK_STACK_TRACING bool "Keep stack trace of persistent data block lock holders" - depends on STACKTRACE_SUPPORT && DM_BUFIO + depends on STACKTRACE_SUPPORT && DM_DEBUG_BLOCK_MANAGER_LOCKING select STACKTRACE ---help--- Enable this for messages that may help debug problems with the diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 6b420a55c745..c3ea03c9a1a8 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -425,7 +425,7 @@ struct cache { * until a gc finishes - otherwise we could pointlessly burn a ton of * cpu */ - unsigned invalidate_needs_gc:1; + unsigned invalidate_needs_gc; bool discard; /* Get rid of? */ @@ -593,8 +593,8 @@ struct cache_set { /* Counts how many sectors bio_insert has added to the cache */ atomic_t sectors_to_gc; + wait_queue_head_t gc_wait; - wait_queue_head_t moving_gc_wait; struct keybuf moving_gc_keys; /* Number of moving GC bios in flight */ struct semaphore moving_in_flight; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 646fe85261c1..18526d44688d 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -11,6 +11,7 @@ #include "bset.h" #include <linux/console.h> +#include <linux/sched/clock.h> #include <linux/random.h> #include <linux/prefetch.h> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 81d3db40cd7b..450d0e848ae4 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -32,6 +32,9 @@ #include <linux/prefetch.h> #include <linux/random.h> #include <linux/rcupdate.h> +#include <linux/sched/clock.h> +#include <linux/rculist.h> + #include <trace/events/bcache.h> /* @@ -297,7 +300,7 @@ static void bch_btree_node_read(struct btree *b) bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9; bio->bi_end_io = btree_node_read_endio; bio->bi_private = &cl; - bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC); + bio->bi_opf = REQ_OP_READ | REQ_META; bch_bio_map(bio, b->keys.set[0].data); @@ -393,7 +396,7 @@ static void do_btree_node_write(struct btree *b) b->bio->bi_end_io = btree_node_write_endio; b->bio->bi_private = cl; b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c)); - bio_set_op_attrs(b->bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA); + b->bio->bi_opf = REQ_OP_WRITE | REQ_META | REQ_FUA; bch_bio_map(b->bio, i); /* @@ -1757,32 +1760,34 @@ static void bch_btree_gc(struct cache_set *c) bch_moving_gc(c); } -static int bch_gc_thread(void *arg) +static bool gc_should_run(struct cache_set *c) { - struct cache_set *c = arg; struct cache *ca; unsigned i; - while (1) { -again: - bch_btree_gc(c); + for_each_cache(ca, c, i) + if (ca->invalidate_needs_gc) + return true; - set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) - break; + if (atomic_read(&c->sectors_to_gc) < 0) + return true; - mutex_lock(&c->bucket_lock); + return false; +} - for_each_cache(ca, c, i) - if (ca->invalidate_needs_gc) { - mutex_unlock(&c->bucket_lock); - set_current_state(TASK_RUNNING); - goto again; - } +static int bch_gc_thread(void *arg) +{ + struct cache_set *c = arg; - mutex_unlock(&c->bucket_lock); + while (1) { + wait_event_interruptible(c->gc_wait, + kthread_should_stop() || gc_should_run(c)); - schedule(); + if (kthread_should_stop()) + break; + + set_gc_sectors(c); + bch_btree_gc(c); } return 0; @@ -1790,11 +1795,10 @@ again: int bch_gc_thread_start(struct cache_set *c) { - c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc"); + c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); if (IS_ERR(c->gc_thread)) return PTR_ERR(c->gc_thread); - set_task_state(c->gc_thread, TASK_INTERRUPTIBLE); return 0; } diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 5c391fa01bed..9b80417cd547 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -260,8 +260,7 @@ void bch_initial_mark_key(struct cache_set *, int, struct bkey *); static inline void wake_up_gc(struct cache_set *c) { - if (c->gc_thread) - wake_up_process(c->gc_thread); + wake_up(&c->gc_wait); } #define MAP_DONE 0 diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 9b2fe2d3e3a9..1ec84ca81146 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -3,6 +3,7 @@ #include <linux/llist.h> #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/workqueue.h> /* diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 333a1e5f6ae6..06f55056aaae 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -52,7 +52,7 @@ void bch_btree_verify(struct btree *b) bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev; bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9; - bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC); + bio->bi_opf = REQ_OP_READ | REQ_META; bch_bio_map(bio, sorted); submit_bio_wait(bio); @@ -107,22 +107,26 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) { char name[BDEVNAME_SIZE]; struct bio *check; - struct bio_vec bv; - struct bvec_iter iter; + struct bio_vec bv, cbv; + struct bvec_iter iter, citer = { 0 }; check = bio_clone(bio, GFP_NOIO); if (!check) return; - bio_set_op_attrs(check, REQ_OP_READ, READ_SYNC); + check->bi_opf = REQ_OP_READ; if (bio_alloc_pages(check, GFP_NOIO)) goto out_put; submit_bio_wait(check); + citer.bi_size = UINT_MAX; bio_for_each_segment(bv, bio, iter) { void *p1 = kmap_atomic(bv.bv_page); - void *p2 = page_address(check->bi_io_vec[iter.bi_idx].bv_page); + void *p2; + + cbv = bio_iter_iovec(check, citer); + p2 = page_address(cbv.bv_page); cache_set_err_on(memcmp(p1 + bv.bv_offset, p2 + bv.bv_offset, @@ -133,6 +137,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) (uint64_t) bio->bi_iter.bi_sector); kunmap_atomic(p1); + bio_advance_iter(check, &citer, bv.bv_len); } bio_free_pages(check); diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index e97b0acf7b8d..db45a88c0ce9 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -24,9 +24,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c) struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO); struct bio *bio = &b->bio; - bio_init(bio); - bio->bi_max_vecs = bucket_pages(c); - bio->bi_io_vec = bio->bi_inline_vecs; + bio_init(bio, bio->bi_inline_vecs, bucket_pages(c)); return bio; } diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 6925023e12d4..1198e53d5670 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -448,13 +448,11 @@ static void do_journal_discard(struct cache *ca) atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); - bio_init(bio); + bio_init(bio, bio->bi_inline_vecs, 1); bio_set_op_attrs(bio, REQ_OP_DISCARD, 0); bio->bi_iter.bi_sector = bucket_to_sector(ca->set, ca->sb.d[ja->discard_idx]); bio->bi_bdev = ca->bdev; - bio->bi_max_vecs = 1; - bio->bi_io_vec = bio->bi_inline_vecs; bio->bi_iter.bi_size = bucket_bytes(ca); bio->bi_end_io = journal_discard_endio; diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 5c4bddecfaf0..13b8a907006d 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -77,15 +77,13 @@ static void moving_init(struct moving_io *io) { struct bio *bio = &io->bio.bio; - bio_init(bio); + bio_init(bio, bio->bi_inline_vecs, + DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS)); bio_get(bio); bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); bio->bi_iter.bi_size = KEY_SIZE(&io->w->key) << 9; - bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), - PAGE_SECTORS); bio->bi_private = &io->cl; - bio->bi_io_vec = bio->bi_inline_vecs; bch_bio_map(bio, NULL); } diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 40ffe5e424b3..709c9cc34369 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -196,10 +196,8 @@ static void bch_data_insert_start(struct closure *cl) struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); struct bio *bio = op->bio, *n; - if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { - set_gc_sectors(op->c); + if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) wake_up_gc(op->c); - } if (op->bypass) return bch_data_invalidate(cl); @@ -404,8 +402,8 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) if (!congested && mode == CACHE_MODE_WRITEBACK && - op_is_write(bio_op(bio)) && - (bio->bi_opf & REQ_SYNC)) + op_is_write(bio->bi_opf) && + op_is_sync(bio->bi_opf)) goto rescale; spin_lock(&dc->io_lock); @@ -623,7 +621,7 @@ static void do_bio_hook(struct search *s, struct bio *orig_bio) { struct bio *bio = &s->bio.bio; - bio_init(bio); + bio_init(bio, NULL, 0); __bio_clone_fast(bio, orig_bio); bio->bi_end_io = request_endio; bio->bi_private = &s->cl; @@ -668,7 +666,7 @@ static inline struct search *search_alloc(struct bio *bio, s->iop.write_prio = 0; s->iop.error = 0; s->iop.flags = 0; - s->iop.flush_journal = (bio->bi_opf & (REQ_PREFLUSH|REQ_FUA)) != 0; + s->iop.flush_journal = op_is_flush(bio->bi_opf); s->iop.wq = bcache_wq; return s; @@ -923,7 +921,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) flush->bi_bdev = bio->bi_bdev; flush->bi_end_io = request_endio; flush->bi_private = cl; - bio_set_op_attrs(flush, REQ_OP_WRITE, WRITE_FLUSH); + flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; closure_bio_submit(flush, cl); } @@ -1011,7 +1009,7 @@ static int cached_dev_congested(void *data, int bits) struct request_queue *q = bdev_get_queue(dc->bdev); int ret = 0; - if (bdi_congested(&q->backing_dev_info, bits)) + if (bdi_congested(q->backing_dev_info, bits)) return 1; if (cached_dev_get(dc)) { @@ -1020,7 +1018,7 @@ static int cached_dev_congested(void *data, int bits) for_each_cache(ca, d->c, i) { q = bdev_get_queue(ca->bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= bdi_congested(q->backing_dev_info, bits); } cached_dev_put(dc); @@ -1034,7 +1032,7 @@ void bch_cached_dev_request_init(struct cached_dev *dc) struct gendisk *g = dc->disk.disk; g->queue->make_request_fn = cached_dev_make_request; - g->queue->backing_dev_info.congested_fn = cached_dev_congested; + g->queue->backing_dev_info->congested_fn = cached_dev_congested; dc->disk.cache_miss = cached_dev_cache_miss; dc->disk.ioctl = cached_dev_ioctl; } @@ -1127,7 +1125,7 @@ static int flash_dev_congested(void *data, int bits) for_each_cache(ca, d->c, i) { q = bdev_get_queue(ca->bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= bdi_congested(q->backing_dev_info, bits); } return ret; @@ -1138,7 +1136,7 @@ void bch_flash_dev_request_init(struct bcache_device *d) struct gendisk *g = d->disk; g->queue->make_request_fn = flash_dev_make_request; - g->queue->backing_dev_info.congested_fn = flash_dev_congested; + g->queue->backing_dev_info->congested_fn = flash_dev_congested; d->cache_miss = flash_dev_cache_miss; d->ioctl = flash_dev_ioctl; } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 849ad441cd76..85e3f21c2514 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -58,6 +58,7 @@ static wait_queue_head_t unregister_wait; struct workqueue_struct *bcache_wq; #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) +#define BCACHE_MINORS 16 /* partition support */ /* Superblock */ @@ -381,7 +382,7 @@ static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) return "bad uuid pointer"; bkey_copy(&c->uuid_bucket, k); - uuid_io(c, REQ_OP_READ, READ_SYNC, k, cl); + uuid_io(c, REQ_OP_READ, 0, k, cl); if (j->version < BCACHE_JSET_VERSION_UUIDv1) { struct uuid_entry_v0 *u0 = (void *) c->uuids; @@ -600,7 +601,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) ca->prio_last_buckets[bucket_nr] = bucket; bucket_nr++; - prio_io(ca, bucket, REQ_OP_READ, READ_SYNC); + prio_io(ca, bucket, REQ_OP_READ, 0); if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8)) pr_warn("bad csum reading priorities"); @@ -783,8 +784,10 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, if (minor < 0) return minor; + minor *= BCACHE_MINORS; + if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || - !(d->disk = alloc_disk(1))) { + !(d->disk = alloc_disk(BCACHE_MINORS))) { ida_simple_remove(&bcache_minor, minor); return -ENOMEM; } @@ -804,7 +807,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, blk_queue_make_request(q, NULL); d->disk->queue = q; q->queuedata = d; - q->backing_dev_info.congested_data = d; + q->backing_dev_info->congested_data = d; q->limits.max_hw_sectors = UINT_MAX; q->limits.max_sectors = UINT_MAX; q->limits.max_segment_size = UINT_MAX; @@ -1129,9 +1132,9 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) set_capacity(dc->disk.disk, dc->bdev->bd_part->nr_sects - dc->sb.data_offset); - dc->disk.disk->queue->backing_dev_info.ra_pages = - max(dc->disk.disk->queue->backing_dev_info.ra_pages, - q->backing_dev_info.ra_pages); + dc->disk.disk->queue->backing_dev_info->ra_pages = + max(dc->disk.disk->queue->backing_dev_info->ra_pages, + q->backing_dev_info->ra_pages); bch_cached_dev_request_init(dc); bch_cached_dev_writeback_init(dc); @@ -1152,9 +1155,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, dc->bdev = bdev; dc->bdev->bd_holder = dc; - bio_init(&dc->sb_bio); - dc->sb_bio.bi_max_vecs = 1; - dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs; + bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1); dc->sb_bio.bi_io_vec[0].bv_page = sb_page; get_page(sb_page); @@ -1491,6 +1492,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) mutex_init(&c->bucket_lock); init_waitqueue_head(&c->btree_cache_wait); init_waitqueue_head(&c->bucket_wait); + init_waitqueue_head(&c->gc_wait); sema_init(&c->uuid_write_mutex, 1); spin_lock_init(&c->btree_gc_time.lock); @@ -1550,6 +1552,7 @@ static void run_cache_set(struct cache_set *c) for_each_cache(ca, c, i) c->nbuckets += ca->sb.nbuckets; + set_gc_sectors(c); if (CACHE_SYNC(&c->sb)) { LIST_HEAD(journal); @@ -1814,9 +1817,7 @@ static int cache_alloc(struct cache *ca) __module_get(THIS_MODULE); kobject_init(&ca->kobj, &bch_cache_ktype); - bio_init(&ca->journal.bio); - ca->journal.bio.bi_max_vecs = 8; - ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; + bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8); free = roundup_pow_of_two(ca->sb.nbuckets) >> 10; @@ -1852,9 +1853,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, ca->bdev = bdev; ca->bdev->bd_holder = ca; - bio_init(&ca->sb_bio); - ca->sb_bio.bi_max_vecs = 1; - ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs; + bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1); ca->sb_bio.bi_io_vec[0].bv_page = sb_page; get_page(sb_page); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index b3ff57d61dde..f90f13616980 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -13,6 +13,7 @@ #include <linux/blkdev.h> #include <linux/sort.h> +#include <linux/sched/clock.h> static const char * const cache_replacement_policies[] = { "lru", diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index dde6172f3f10..8c3a938f4bf0 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/seq_file.h> #include <linux/types.h> +#include <linux/sched/clock.h> #include "util.h" diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index cf2cbc211d83..a126919ed102 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -6,6 +6,7 @@ #include <linux/errno.h> #include <linux/blkdev.h> #include <linux/kernel.h> +#include <linux/sched/clock.h> #include <linux/llist.h> #include <linux/ratelimit.h> #include <linux/vmalloc.h> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index e51644e503a5..6ac2e48b9235 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -13,6 +13,7 @@ #include <linux/delay.h> #include <linux/kthread.h> +#include <linux/sched/clock.h> #include <trace/events/bcache.h> /* Rate limiting */ @@ -106,14 +107,13 @@ static void dirty_init(struct keybuf_key *w) struct dirty_io *io = w->private; struct bio *bio = &io->bio; - bio_init(bio); + bio_init(bio, bio->bi_inline_vecs, + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)); if (!io->dc->writeback_percent) bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9; - bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); bio->bi_private = w; - bio->bi_io_vec = bio->bi_inline_vecs; bch_bio_map(bio, NULL); } diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 301eaf565167..629bd1a502fd 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -57,8 +57,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, if (would_skip) return false; - return bio->bi_opf & REQ_SYNC || - in_use <= CUTOFF_WRITEBACK; + return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK; } static inline void bch_writeback_queue(struct cached_dev *dc) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 2d826927a3bf..9fb2ccac958a 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -27,6 +27,7 @@ #include <linux/mount.h> #include <linux/buffer_head.h> #include <linux/seq_file.h> +#include <trace/events/block.h> #include "md.h" #include "bitmap.h" @@ -208,11 +209,13 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) { - struct md_rdev *rdev = NULL; + struct md_rdev *rdev; struct block_device *bdev; struct mddev *mddev = bitmap->mddev; struct bitmap_storage *store = &bitmap->storage; +restart: + rdev = NULL; while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { int size = PAGE_SIZE; loff_t offset = mddev->bitmap_info.offset; @@ -268,8 +271,8 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) page); } - if (wait) - md_super_wait(mddev); + if (wait && md_super_wait(mddev) < 0) + goto restart; return 0; bad_alignment: @@ -405,10 +408,10 @@ static int read_page(struct file *file, unsigned long index, ret = -EIO; out: if (ret) - printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %d\n", - (int)PAGE_SIZE, - (unsigned long long)index << PAGE_SHIFT, - ret); + pr_err("md: bitmap read error: (%dB @ %llu): %d\n", + (int)PAGE_SIZE, + (unsigned long long)index << PAGE_SHIFT, + ret); return ret; } @@ -416,6 +419,28 @@ out: * bitmap file superblock operations */ +/* + * bitmap_wait_writes() should be called before writing any bitmap + * blocks, to ensure previous writes, particularly from + * bitmap_daemon_work(), have completed. + */ +static void bitmap_wait_writes(struct bitmap *bitmap) +{ + if (bitmap->storage.file) + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes)==0); + else + /* Note that we ignore the return value. The writes + * might have failed, but that would just mean that + * some bits which should be cleared haven't been, + * which is safe. The relevant bitmap blocks will + * probably get written again, but there is no great + * loss if they aren't. + */ + md_super_wait(bitmap->mddev); +} + + /* update the event counter and sync the superblock to disk */ void bitmap_update_sb(struct bitmap *bitmap) { @@ -455,24 +480,24 @@ void bitmap_print_sb(struct bitmap *bitmap) if (!bitmap || !bitmap->storage.sb_page) return; sb = kmap_atomic(bitmap->storage.sb_page); - printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); - printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); - printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); - printk(KERN_DEBUG " uuid: %08x.%08x.%08x.%08x\n", - *(__u32 *)(sb->uuid+0), - *(__u32 *)(sb->uuid+4), - *(__u32 *)(sb->uuid+8), - *(__u32 *)(sb->uuid+12)); - printk(KERN_DEBUG " events: %llu\n", - (unsigned long long) le64_to_cpu(sb->events)); - printk(KERN_DEBUG "events cleared: %llu\n", - (unsigned long long) le64_to_cpu(sb->events_cleared)); - printk(KERN_DEBUG " state: %08x\n", le32_to_cpu(sb->state)); - printk(KERN_DEBUG " chunksize: %d B\n", le32_to_cpu(sb->chunksize)); - printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); - printk(KERN_DEBUG " sync size: %llu KB\n", - (unsigned long long)le64_to_cpu(sb->sync_size)/2); - printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); + pr_debug("%s: bitmap file superblock:\n", bmname(bitmap)); + pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic)); + pr_debug(" version: %d\n", le32_to_cpu(sb->version)); + pr_debug(" uuid: %08x.%08x.%08x.%08x\n", + *(__u32 *)(sb->uuid+0), + *(__u32 *)(sb->uuid+4), + *(__u32 *)(sb->uuid+8), + *(__u32 *)(sb->uuid+12)); + pr_debug(" events: %llu\n", + (unsigned long long) le64_to_cpu(sb->events)); + pr_debug("events cleared: %llu\n", + (unsigned long long) le64_to_cpu(sb->events_cleared)); + pr_debug(" state: %08x\n", le32_to_cpu(sb->state)); + pr_debug(" chunksize: %d B\n", le32_to_cpu(sb->chunksize)); + pr_debug(" daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); + pr_debug(" sync size: %llu KB\n", + (unsigned long long)le64_to_cpu(sb->sync_size)/2); + pr_debug("max write behind: %d\n", le32_to_cpu(sb->write_behind)); kunmap_atomic(sb); } @@ -506,14 +531,14 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) BUG_ON(!chunksize); if (!is_power_of_2(chunksize)) { kunmap_atomic(sb); - printk(KERN_ERR "bitmap chunksize not a power of 2\n"); + pr_warn("bitmap chunksize not a power of 2\n"); return -EINVAL; } sb->chunksize = cpu_to_le32(chunksize); daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep; if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) { - printk(KERN_INFO "Choosing daemon_sleep default (5 sec)\n"); + pr_debug("Choosing daemon_sleep default (5 sec)\n"); daemon_sleep = 5 * HZ; } sb->daemon_sleep = cpu_to_le32(daemon_sleep); @@ -584,7 +609,7 @@ re_read: /* to 4k blocks */ bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3)); - pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, + pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, bitmap->cluster_slot, offset); } @@ -634,7 +659,7 @@ re_read: else if (write_behind > COUNTER_MAX) reason = "write-behind limit out of range (0 - 16383)"; if (reason) { - printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n", + pr_warn("%s: invalid bitmap file superblock: %s\n", bmname(bitmap), reason); goto out; } @@ -648,18 +673,15 @@ re_read: * bitmap's UUID and event counter to the mddev's */ if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { - printk(KERN_INFO - "%s: bitmap superblock UUID mismatch\n", - bmname(bitmap)); + pr_warn("%s: bitmap superblock UUID mismatch\n", + bmname(bitmap)); goto out; } events = le64_to_cpu(sb->events); if (!nodes && (events < bitmap->mddev->events)) { - printk(KERN_INFO - "%s: bitmap file is out of date (%llu < %llu) " - "-- forcing full recovery\n", - bmname(bitmap), events, - (unsigned long long) bitmap->mddev->events); + pr_warn("%s: bitmap file is out of date (%llu < %llu) -- forcing full recovery\n", + bmname(bitmap), events, + (unsigned long long) bitmap->mddev->events); set_bit(BITMAP_STALE, &bitmap->flags); } } @@ -679,8 +701,8 @@ out: if (err == 0 && nodes && (bitmap->cluster_slot < 0)) { err = md_setup_cluster(bitmap->mddev, nodes); if (err) { - pr_err("%s: Could not setup cluster service (%d)\n", - bmname(bitmap), err); + pr_warn("%s: Could not setup cluster service (%d)\n", + bmname(bitmap), err); goto out_no_sb; } bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev); @@ -847,15 +869,13 @@ static void bitmap_file_kick(struct bitmap *bitmap) ptr = file_path(bitmap->storage.file, path, PAGE_SIZE); - printk(KERN_ALERT - "%s: kicking failed bitmap file %s from array!\n", - bmname(bitmap), IS_ERR(ptr) ? "" : ptr); + pr_warn("%s: kicking failed bitmap file %s from array!\n", + bmname(bitmap), IS_ERR(ptr) ? "" : ptr); kfree(path); } else - printk(KERN_ALERT - "%s: disabling internal bitmap due to errors\n", - bmname(bitmap)); + pr_warn("%s: disabling internal bitmap due to errors\n", + bmname(bitmap)); } } @@ -983,6 +1003,7 @@ void bitmap_unplug(struct bitmap *bitmap) { unsigned long i; int dirty, need_write; + int writing = 0; if (!bitmap || !bitmap->storage.filemap || test_bit(BITMAP_STALE, &bitmap->flags)) @@ -997,15 +1018,19 @@ void bitmap_unplug(struct bitmap *bitmap) need_write = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); if (dirty || need_write) { + if (!writing) { + bitmap_wait_writes(bitmap); + if (bitmap->mddev->queue) + blk_add_trace_msg(bitmap->mddev->queue, + "md bitmap_unplug"); + } clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); write_page(bitmap, bitmap->storage.filemap[i], 0); + writing = 1; } } - if (bitmap->storage.file) - wait_event(bitmap->write_wait, - atomic_read(&bitmap->pending_writes)==0); - else - md_super_wait(bitmap->mddev); + if (writing) + bitmap_wait_writes(bitmap); if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) bitmap_file_kick(bitmap); @@ -1056,14 +1081,13 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) outofdate = test_bit(BITMAP_STALE, &bitmap->flags); if (outofdate) - printk(KERN_INFO "%s: bitmap file is out of date, doing full " - "recovery\n", bmname(bitmap)); + pr_warn("%s: bitmap file is out of date, doing full recovery\n", bmname(bitmap)); if (file && i_size_read(file->f_mapping->host) < store->bytes) { - printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", - bmname(bitmap), - (unsigned long) i_size_read(file->f_mapping->host), - store->bytes); + pr_warn("%s: bitmap file too short %lu < %lu\n", + bmname(bitmap), + (unsigned long) i_size_read(file->f_mapping->host), + store->bytes); goto err; } @@ -1137,16 +1161,15 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) offset = 0; } - printk(KERN_INFO "%s: bitmap initialized from disk: " - "read %lu pages, set %lu of %lu bits\n", - bmname(bitmap), store->file_pages, - bit_cnt, chunks); + pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n", + bmname(bitmap), store->file_pages, + bit_cnt, chunks); return 0; err: - printk(KERN_INFO "%s: bitmap initialisation failed: %d\n", - bmname(bitmap), ret); + pr_warn("%s: bitmap initialisation failed: %d\n", + bmname(bitmap), ret); return ret; } @@ -1225,6 +1248,10 @@ void bitmap_daemon_work(struct mddev *mddev) } bitmap->allclean = 1; + if (bitmap->mddev->queue) + blk_add_trace_msg(bitmap->mddev->queue, + "md bitmap_daemon_work"); + /* Any file-page which is PENDING now needs to be written. * So set NEEDWRITE now, then after we make any last-minute changes * we will write it. @@ -1289,6 +1316,7 @@ void bitmap_daemon_work(struct mddev *mddev) } spin_unlock_irq(&counts->lock); + bitmap_wait_writes(bitmap); /* Now start writeout on any page in NEEDWRITE that isn't DIRTY. * DIRTY pages need to be written by bitmap_unplug so it can wait * for them. @@ -1595,7 +1623,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) atomic_read(&bitmap->mddev->recovery_active) == 0); bitmap->mddev->curr_resync_completed = sector; - set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); + set_bit(MD_SB_CHANGE_CLEAN, &bitmap->mddev->sb_flags); sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); s = 0; while (s < sector && s < bitmap->mddev->resync_max_sectors) { @@ -1825,8 +1853,8 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot) if (err) goto error; - printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", - bitmap->counts.pages, bmname(bitmap)); + pr_debug("created bitmap (%lu pages) for device %s\n", + bitmap->counts.pages, bmname(bitmap)); err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; if (err) @@ -2029,8 +2057,10 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, !bitmap->mddev->bitmap_info.external, mddev_is_clustered(bitmap->mddev) ? bitmap->cluster_slot : 0); - if (ret) + if (ret) { + bitmap_file_unmap(&store); goto err; + } pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO); @@ -2089,7 +2119,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift + BITMAP_BLOCK_SHIFT); blocks = old_counts.chunks << old_counts.chunkshift; - pr_err("Could not pre-allocate in-memory bitmap for cluster raid\n"); + pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n"); break; } else bitmap->counts.bp[page].count += 1; @@ -2266,7 +2296,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len) /* Ensure new bitmap info is stored in * metadata promptly. */ - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); } rv = 0; diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 125aedc3875f..df4859f6ac6a 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -11,6 +11,7 @@ #include <linux/device-mapper.h> #include <linux/dm-io.h> #include <linux/slab.h> +#include <linux/sched/mm.h> #include <linux/jiffies.h> #include <linux/vmalloc.h> #include <linux/shrinker.h> @@ -611,9 +612,7 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, char *ptr; int len; - bio_init(&b->bio); - b->bio.bi_io_vec = b->bio_vec; - b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS; + bio_init(&b->bio, b->bio_vec, DM_BUFIO_INLINE_VECS); b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits; b->bio.bi_bdev = b->c->bdev; b->bio.bi_end_io = inline_endio; @@ -796,7 +795,7 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c) DECLARE_WAITQUEUE(wait, current); add_wait_queue(&c->free_buffer_wait, &wait); - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); dm_bufio_unlock(c); io_schedule(); @@ -822,12 +821,14 @@ enum new_flag { static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf) { struct dm_buffer *b; + bool tried_noio_alloc = false; /* * dm-bufio is resistant to allocation failures (it just keeps * one buffer reserved in cases all the allocations fail). * So set flags to not try too hard: - * GFP_NOIO: don't recurse into the I/O layer + * GFP_NOWAIT: don't wait; if we need to sleep we'll release our + * mutex and wait ourselves. * __GFP_NORETRY: don't retry and rather return failure * __GFP_NOMEMALLOC: don't use emergency reserves * __GFP_NOWARN: don't print a warning in case of failure @@ -837,7 +838,7 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client */ while (1) { if (dm_bufio_cache_size_latch != 1) { - b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); if (b) return b; } @@ -845,6 +846,15 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client if (nf == NF_PREFETCH) return NULL; + if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) { + dm_bufio_unlock(c); + b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + dm_bufio_lock(c); + if (b) + return b; + tried_noio_alloc = true; + } + if (!list_empty(&c->reserved_buffers)) { b = list_entry(c->reserved_buffers.next, struct dm_buffer, lru_list); @@ -1316,7 +1326,7 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c) { struct dm_io_request io_req = { .bi_op = REQ_OP_WRITE, - .bi_op_flags = WRITE_FLUSH, + .bi_op_flags = REQ_PREFLUSH, .mem.type = DM_IO_KMEM, .mem.ptr.addr = NULL, .client = c->dm_io, @@ -1587,18 +1597,9 @@ dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - struct dm_bufio_client *c; - unsigned long count; - - c = container_of(shrink, struct dm_bufio_client, shrinker); - if (sc->gfp_mask & __GFP_FS) - dm_bufio_lock(c); - else if (!dm_bufio_trylock(c)) - return 0; + struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker); - count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY]; - dm_bufio_unlock(c); - return count; + return ACCESS_ONCE(c->n_buffers[LIST_CLEAN]) + ACCESS_ONCE(c->n_buffers[LIST_DIRTY]); } /* diff --git a/drivers/md/dm-cache-block-types.h b/drivers/md/dm-cache-block-types.h index bed4ad4e1b7c..389c9e8ac785 100644 --- a/drivers/md/dm-cache-block-types.h +++ b/drivers/md/dm-cache-block-types.h @@ -17,9 +17,9 @@ * discard bitset. */ -typedef dm_block_t __bitwise__ dm_oblock_t; -typedef uint32_t __bitwise__ dm_cblock_t; -typedef dm_block_t __bitwise__ dm_dblock_t; +typedef dm_block_t __bitwise dm_oblock_t; +typedef uint32_t __bitwise dm_cblock_t; +typedef dm_block_t __bitwise dm_dblock_t; static inline dm_oblock_t to_oblock(dm_block_t b) { diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 695577812cf6..e4c2c1a1e993 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -25,7 +25,7 @@ * defines a range of metadata versions that this module can handle. */ #define MIN_CACHE_VERSION 1 -#define MAX_CACHE_VERSION 1 +#define MAX_CACHE_VERSION 2 #define CACHE_METADATA_CACHE_SIZE 64 @@ -55,6 +55,7 @@ enum mapping_bits { /* * The data on the cache is different from that on the origin. + * This flag is only used by metadata format 1. */ M_DIRTY = 2 }; @@ -93,12 +94,18 @@ struct cache_disk_superblock { __le32 write_misses; __le32 policy_version[CACHE_POLICY_VERSION_SIZE]; + + /* + * Metadata format 2 fields. + */ + __le64 dirty_root; } __packed; struct dm_cache_metadata { atomic_t ref_count; struct list_head list; + unsigned version; struct block_device *bdev; struct dm_block_manager *bm; struct dm_space_map *metadata_sm; @@ -142,11 +149,18 @@ struct dm_cache_metadata { bool fail_io:1; /* + * Metadata format 2 fields. + */ + dm_block_t dirty_root; + struct dm_disk_bitset dirty_info; + + /* * These structures are used when loading metadata. They're too * big to put on the stack. */ struct dm_array_cursor mapping_cursor; struct dm_array_cursor hint_cursor; + struct dm_bitset_cursor dirty_cursor; }; /*------------------------------------------------------------------- @@ -170,6 +184,7 @@ static void sb_prepare_for_write(struct dm_block_validator *v, static int check_metadata_version(struct cache_disk_superblock *disk_super) { uint32_t metadata_version = le32_to_cpu(disk_super->version); + if (metadata_version < MIN_CACHE_VERSION || metadata_version > MAX_CACHE_VERSION) { DMERR("Cache metadata version %u found, but only versions between %u and %u supported.", metadata_version, MIN_CACHE_VERSION, MAX_CACHE_VERSION); @@ -310,6 +325,11 @@ static void __copy_sm_root(struct dm_cache_metadata *cmd, sizeof(cmd->metadata_space_map_root)); } +static bool separate_dirty_bits(struct dm_cache_metadata *cmd) +{ + return cmd->version >= 2; +} + static int __write_initial_superblock(struct dm_cache_metadata *cmd) { int r; @@ -341,7 +361,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) disk_super->flags = 0; memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC); - disk_super->version = cpu_to_le32(MAX_CACHE_VERSION); + disk_super->version = cpu_to_le32(cmd->version); memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version)); disk_super->policy_hint_size = 0; @@ -362,6 +382,9 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) disk_super->write_hits = cpu_to_le32(0); disk_super->write_misses = cpu_to_le32(0); + if (separate_dirty_bits(cmd)) + disk_super->dirty_root = cpu_to_le64(cmd->dirty_root); + return dm_tm_commit(cmd->tm, sblock); } @@ -382,8 +405,14 @@ static int __format_metadata(struct dm_cache_metadata *cmd) if (r < 0) goto bad; - dm_disk_bitset_init(cmd->tm, &cmd->discard_info); + if (separate_dirty_bits(cmd)) { + dm_disk_bitset_init(cmd->tm, &cmd->dirty_info); + r = dm_bitset_empty(&cmd->dirty_info, &cmd->dirty_root); + if (r < 0) + goto bad; + } + dm_disk_bitset_init(cmd->tm, &cmd->discard_info); r = dm_bitset_empty(&cmd->discard_info, &cmd->discard_root); if (r < 0) goto bad; @@ -408,9 +437,10 @@ bad: static int __check_incompat_features(struct cache_disk_superblock *disk_super, struct dm_cache_metadata *cmd) { - uint32_t features; + uint32_t incompat_flags, features; - features = le32_to_cpu(disk_super->incompat_flags) & ~DM_CACHE_FEATURE_INCOMPAT_SUPP; + incompat_flags = le32_to_cpu(disk_super->incompat_flags); + features = incompat_flags & ~DM_CACHE_FEATURE_INCOMPAT_SUPP; if (features) { DMERR("could not access metadata due to unsupported optional features (%lx).", (unsigned long)features); @@ -471,6 +501,7 @@ static int __open_metadata(struct dm_cache_metadata *cmd) } __setup_mapping_info(cmd); + dm_disk_bitset_init(cmd->tm, &cmd->dirty_info); dm_disk_bitset_init(cmd->tm, &cmd->discard_info); sb_flags = le32_to_cpu(disk_super->flags); cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags); @@ -549,6 +580,7 @@ static unsigned long clear_clean_shutdown(unsigned long flags) static void read_superblock_fields(struct dm_cache_metadata *cmd, struct cache_disk_superblock *disk_super) { + cmd->version = le32_to_cpu(disk_super->version); cmd->flags = le32_to_cpu(disk_super->flags); cmd->root = le64_to_cpu(disk_super->mapping_root); cmd->hint_root = le64_to_cpu(disk_super->hint_root); @@ -568,6 +600,9 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd, cmd->stats.write_hits = le32_to_cpu(disk_super->write_hits); cmd->stats.write_misses = le32_to_cpu(disk_super->write_misses); + if (separate_dirty_bits(cmd)) + cmd->dirty_root = le64_to_cpu(disk_super->dirty_root); + cmd->changed = false; } @@ -626,6 +661,13 @@ static int __commit_transaction(struct dm_cache_metadata *cmd, */ BUILD_BUG_ON(sizeof(struct cache_disk_superblock) > 512); + if (separate_dirty_bits(cmd)) { + r = dm_bitset_flush(&cmd->dirty_info, cmd->dirty_root, + &cmd->dirty_root); + if (r) + return r; + } + r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root, &cmd->discard_root); if (r) @@ -650,6 +692,8 @@ static int __commit_transaction(struct dm_cache_metadata *cmd, update_flags(disk_super, mutator); disk_super->mapping_root = cpu_to_le64(cmd->root); + if (separate_dirty_bits(cmd)) + disk_super->dirty_root = cpu_to_le64(cmd->dirty_root); disk_super->hint_root = cpu_to_le64(cmd->hint_root); disk_super->discard_root = cpu_to_le64(cmd->discard_root); disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); @@ -699,7 +743,8 @@ static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned *flags) static struct dm_cache_metadata *metadata_open(struct block_device *bdev, sector_t data_block_size, bool may_format_device, - size_t policy_hint_size) + size_t policy_hint_size, + unsigned metadata_version) { int r; struct dm_cache_metadata *cmd; @@ -710,6 +755,7 @@ static struct dm_cache_metadata *metadata_open(struct block_device *bdev, return ERR_PTR(-ENOMEM); } + cmd->version = metadata_version; atomic_set(&cmd->ref_count, 1); init_rwsem(&cmd->root_lock); cmd->bdev = bdev; @@ -758,7 +804,8 @@ static struct dm_cache_metadata *lookup(struct block_device *bdev) static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev, sector_t data_block_size, bool may_format_device, - size_t policy_hint_size) + size_t policy_hint_size, + unsigned metadata_version) { struct dm_cache_metadata *cmd, *cmd2; @@ -769,7 +816,8 @@ static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev, if (cmd) return cmd; - cmd = metadata_open(bdev, data_block_size, may_format_device, policy_hint_size); + cmd = metadata_open(bdev, data_block_size, may_format_device, + policy_hint_size, metadata_version); if (!IS_ERR(cmd)) { mutex_lock(&table_lock); cmd2 = lookup(bdev); @@ -789,7 +837,7 @@ static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev, static bool same_params(struct dm_cache_metadata *cmd, sector_t data_block_size) { if (cmd->data_block_size != data_block_size) { - DMERR("data_block_size (%llu) different from that in metadata (%llu)\n", + DMERR("data_block_size (%llu) different from that in metadata (%llu)", (unsigned long long) data_block_size, (unsigned long long) cmd->data_block_size); return false; @@ -801,10 +849,11 @@ static bool same_params(struct dm_cache_metadata *cmd, sector_t data_block_size) struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, sector_t data_block_size, bool may_format_device, - size_t policy_hint_size) + size_t policy_hint_size, + unsigned metadata_version) { - struct dm_cache_metadata *cmd = lookup_or_open(bdev, data_block_size, - may_format_device, policy_hint_size); + struct dm_cache_metadata *cmd = lookup_or_open(bdev, data_block_size, may_format_device, + policy_hint_size, metadata_version); if (!IS_ERR(cmd) && !same_params(cmd, data_block_size)) { dm_cache_metadata_close(cmd); @@ -830,8 +879,8 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd) /* * Checks that the given cache block is either unmapped or clean. */ -static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b, - bool *result) +static int block_clean_combined_dirty(struct dm_cache_metadata *cmd, dm_cblock_t b, + bool *result) { int r; __le64 value; @@ -839,10 +888,8 @@ static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b, unsigned flags; r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(b), &value); - if (r) { - DMERR("block_unmapped_or_clean failed"); + if (r) return r; - } unpack_value(value, &ob, &flags); *result = !((flags & M_VALID) && (flags & M_DIRTY)); @@ -850,17 +897,19 @@ static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b, return 0; } -static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd, - dm_cblock_t begin, dm_cblock_t end, - bool *result) +static int blocks_are_clean_combined_dirty(struct dm_cache_metadata *cmd, + dm_cblock_t begin, dm_cblock_t end, + bool *result) { int r; *result = true; while (begin != end) { - r = block_unmapped_or_clean(cmd, begin, result); - if (r) + r = block_clean_combined_dirty(cmd, begin, result); + if (r) { + DMERR("block_clean_combined_dirty failed"); return r; + } if (!*result) { DMERR("cache block %llu is dirty", @@ -874,6 +923,67 @@ static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd, return 0; } +static int blocks_are_clean_separate_dirty(struct dm_cache_metadata *cmd, + dm_cblock_t begin, dm_cblock_t end, + bool *result) +{ + int r; + bool dirty_flag; + *result = true; + + r = dm_bitset_cursor_begin(&cmd->dirty_info, cmd->dirty_root, + from_cblock(begin), &cmd->dirty_cursor); + if (r) { + DMERR("%s: dm_bitset_cursor_begin for dirty failed", __func__); + return r; + } + + r = dm_bitset_cursor_skip(&cmd->dirty_cursor, from_cblock(begin)); + if (r) { + DMERR("%s: dm_bitset_cursor_skip for dirty failed", __func__); + dm_bitset_cursor_end(&cmd->dirty_cursor); + return r; + } + + while (begin != end) { + /* + * We assume that unmapped blocks have their dirty bit + * cleared. + */ + dirty_flag = dm_bitset_cursor_get_value(&cmd->dirty_cursor); + if (dirty_flag) { + DMERR("%s: cache block %llu is dirty", __func__, + (unsigned long long) from_cblock(begin)); + dm_bitset_cursor_end(&cmd->dirty_cursor); + *result = false; + return 0; + } + + r = dm_bitset_cursor_next(&cmd->dirty_cursor); + if (r) { + DMERR("%s: dm_bitset_cursor_next for dirty failed", __func__); + dm_bitset_cursor_end(&cmd->dirty_cursor); + return r; + } + + begin = to_cblock(from_cblock(begin) + 1); + } + + dm_bitset_cursor_end(&cmd->dirty_cursor); + + return 0; +} + +static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd, + dm_cblock_t begin, dm_cblock_t end, + bool *result) +{ + if (separate_dirty_bits(cmd)) + return blocks_are_clean_separate_dirty(cmd, begin, end, result); + else + return blocks_are_clean_combined_dirty(cmd, begin, end, result); +} + static bool cmd_write_lock(struct dm_cache_metadata *cmd) { down_write(&cmd->root_lock); @@ -951,8 +1061,18 @@ int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size) r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks), from_cblock(new_cache_size), &null_mapping, &cmd->root); - if (!r) - cmd->cache_blocks = new_cache_size; + if (r) + goto out; + + if (separate_dirty_bits(cmd)) { + r = dm_bitset_resize(&cmd->dirty_info, cmd->dirty_root, + from_cblock(cmd->cache_blocks), from_cblock(new_cache_size), + false, &cmd->dirty_root); + if (r) + goto out; + } + + cmd->cache_blocks = new_cache_size; cmd->changed = true; out: @@ -996,14 +1116,6 @@ static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b) from_dblock(b), &cmd->discard_root); } -static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b, - bool *is_discarded) -{ - return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root, - from_dblock(b), &cmd->discard_root, - is_discarded); -} - static int __discard(struct dm_cache_metadata *cmd, dm_dblock_t dblock, bool discard) { @@ -1033,22 +1145,38 @@ static int __load_discards(struct dm_cache_metadata *cmd, load_discard_fn fn, void *context) { int r = 0; - dm_block_t b; - bool discard; + uint32_t b; + struct dm_bitset_cursor c; - for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) { - dm_dblock_t dblock = to_dblock(b); + if (from_dblock(cmd->discard_nr_blocks) == 0) + /* nothing to do */ + return 0; - if (cmd->clean_when_opened) { - r = __is_discarded(cmd, dblock, &discard); - if (r) - return r; - } else - discard = false; + if (cmd->clean_when_opened) { + r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root, &cmd->discard_root); + if (r) + return r; - r = fn(context, cmd->discard_block_size, dblock, discard); + r = dm_bitset_cursor_begin(&cmd->discard_info, cmd->discard_root, + from_dblock(cmd->discard_nr_blocks), &c); if (r) - break; + return r; + + for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) { + r = fn(context, cmd->discard_block_size, to_dblock(b), + dm_bitset_cursor_get_value(&c)); + if (r) + break; + } + + dm_bitset_cursor_end(&c); + + } else { + for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) { + r = fn(context, cmd->discard_block_size, to_dblock(b), false); + if (r) + return r; + } } return r; @@ -1178,11 +1306,11 @@ static bool hints_array_available(struct dm_cache_metadata *cmd, hints_array_initialized(cmd); } -static int __load_mapping(struct dm_cache_metadata *cmd, - uint64_t cb, bool hints_valid, - struct dm_array_cursor *mapping_cursor, - struct dm_array_cursor *hint_cursor, - load_mapping_fn fn, void *context) +static int __load_mapping_v1(struct dm_cache_metadata *cmd, + uint64_t cb, bool hints_valid, + struct dm_array_cursor *mapping_cursor, + struct dm_array_cursor *hint_cursor, + load_mapping_fn fn, void *context) { int r = 0; @@ -1207,8 +1335,51 @@ static int __load_mapping(struct dm_cache_metadata *cmd, r = fn(context, oblock, to_cblock(cb), flags & M_DIRTY, le32_to_cpu(hint), hints_valid); - if (r) - DMERR("policy couldn't load cblock"); + if (r) { + DMERR("policy couldn't load cache block %llu", + (unsigned long long) from_cblock(to_cblock(cb))); + } + } + + return r; +} + +static int __load_mapping_v2(struct dm_cache_metadata *cmd, + uint64_t cb, bool hints_valid, + struct dm_array_cursor *mapping_cursor, + struct dm_array_cursor *hint_cursor, + struct dm_bitset_cursor *dirty_cursor, + load_mapping_fn fn, void *context) +{ + int r = 0; + + __le64 mapping; + __le32 hint = 0; + + __le64 *mapping_value_le; + __le32 *hint_value_le; + + dm_oblock_t oblock; + unsigned flags; + bool dirty; + + dm_array_cursor_get_value(mapping_cursor, (void **) &mapping_value_le); + memcpy(&mapping, mapping_value_le, sizeof(mapping)); + unpack_value(mapping, &oblock, &flags); + + if (flags & M_VALID) { + if (hints_valid) { + dm_array_cursor_get_value(hint_cursor, (void **) &hint_value_le); + memcpy(&hint, hint_value_le, sizeof(hint)); + } + + dirty = dm_bitset_cursor_get_value(dirty_cursor); + r = fn(context, oblock, to_cblock(cb), dirty, + le32_to_cpu(hint), hints_valid); + if (r) { + DMERR("policy couldn't load cache block %llu", + (unsigned long long) from_cblock(to_cblock(cb))); + } } return r; @@ -1239,10 +1410,28 @@ static int __load_mappings(struct dm_cache_metadata *cmd, } } + if (separate_dirty_bits(cmd)) { + r = dm_bitset_cursor_begin(&cmd->dirty_info, cmd->dirty_root, + from_cblock(cmd->cache_blocks), + &cmd->dirty_cursor); + if (r) { + dm_array_cursor_end(&cmd->hint_cursor); + dm_array_cursor_end(&cmd->mapping_cursor); + return r; + } + } + for (cb = 0; ; cb++) { - r = __load_mapping(cmd, cb, hints_valid, - &cmd->mapping_cursor, &cmd->hint_cursor, - fn, context); + if (separate_dirty_bits(cmd)) + r = __load_mapping_v2(cmd, cb, hints_valid, + &cmd->mapping_cursor, + &cmd->hint_cursor, + &cmd->dirty_cursor, + fn, context); + else + r = __load_mapping_v1(cmd, cb, hints_valid, + &cmd->mapping_cursor, &cmd->hint_cursor, + fn, context); if (r) goto out; @@ -1265,12 +1454,23 @@ static int __load_mappings(struct dm_cache_metadata *cmd, goto out; } } + + if (separate_dirty_bits(cmd)) { + r = dm_bitset_cursor_next(&cmd->dirty_cursor); + if (r) { + DMERR("dm_bitset_cursor_next for dirty failed"); + goto out; + } + } } out: dm_array_cursor_end(&cmd->mapping_cursor); if (hints_valid) dm_array_cursor_end(&cmd->hint_cursor); + if (separate_dirty_bits(cmd)) + dm_bitset_cursor_end(&cmd->dirty_cursor); + return r; } @@ -1353,13 +1553,55 @@ static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty } -int dm_cache_set_dirty(struct dm_cache_metadata *cmd, - dm_cblock_t cblock, bool dirty) +static int __set_dirty_bits_v1(struct dm_cache_metadata *cmd, unsigned nr_bits, unsigned long *bits) +{ + int r; + unsigned i; + for (i = 0; i < nr_bits; i++) { + r = __dirty(cmd, to_cblock(i), test_bit(i, bits)); + if (r) + return r; + } + + return 0; +} + +static int is_dirty_callback(uint32_t index, bool *value, void *context) +{ + unsigned long *bits = context; + *value = test_bit(index, bits); + return 0; +} + +static int __set_dirty_bits_v2(struct dm_cache_metadata *cmd, unsigned nr_bits, unsigned long *bits) +{ + int r = 0; + + /* nr_bits is really just a sanity check */ + if (nr_bits != from_cblock(cmd->cache_blocks)) { + DMERR("dirty bitset is wrong size"); + return -EINVAL; + } + + r = dm_bitset_del(&cmd->dirty_info, cmd->dirty_root); + if (r) + return r; + + cmd->changed = true; + return dm_bitset_new(&cmd->dirty_info, &cmd->dirty_root, nr_bits, is_dirty_callback, bits); +} + +int dm_cache_set_dirty_bits(struct dm_cache_metadata *cmd, + unsigned nr_bits, + unsigned long *bits) { int r; WRITE_LOCK(cmd); - r = __dirty(cmd, cblock, dirty); + if (separate_dirty_bits(cmd)) + r = __set_dirty_bits_v2(cmd, nr_bits, bits); + else + r = __set_dirty_bits_v1(cmd, nr_bits, bits); WRITE_UNLOCK(cmd); return r; diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 8528744195e5..4f07c08cf107 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -45,18 +45,20 @@ * As these various flags are defined they should be added to the * following masks. */ + #define DM_CACHE_FEATURE_COMPAT_SUPP 0UL #define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL #define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL /* - * Reopens or creates a new, empty metadata volume. - * Returns an ERR_PTR on failure. + * Reopens or creates a new, empty metadata volume. Returns an ERR_PTR on + * failure. If reopening then features must match. */ struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, sector_t data_block_size, bool may_format_device, - size_t policy_hint_size); + size_t policy_hint_size, + unsigned metadata_version); void dm_cache_metadata_close(struct dm_cache_metadata *cmd); @@ -91,7 +93,8 @@ int dm_cache_load_mappings(struct dm_cache_metadata *cmd, load_mapping_fn fn, void *context); -int dm_cache_set_dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty); +int dm_cache_set_dirty_bits(struct dm_cache_metadata *cmd, + unsigned nr_bits, unsigned long *bits); struct dm_cache_statistics { uint32_t read_hits; diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index c33f4a6e1d7d..f19c6930a67c 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -1361,7 +1361,7 @@ static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) static unsigned random_level(dm_cblock_t cblock) { - return hash_32_generic(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1); + return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1); } static int smq_load_mapping(struct dm_cache_policy *p, diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 59b2c50562e4..9c689b34e6e7 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -179,6 +179,7 @@ enum cache_io_mode { struct cache_features { enum cache_metadata_mode mode; enum cache_io_mode io_mode; + unsigned metadata_version; }; struct cache_stats { @@ -248,7 +249,7 @@ struct cache { /* * Fields for converting from sectors to blocks. */ - uint32_t sectors_per_block; + sector_t sectors_per_block; int sectors_per_block_shift; spinlock_t lock; @@ -787,8 +788,7 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); spin_lock_irqsave(&cache->lock, flags); - if (cache->need_tick_bio && - !(bio->bi_opf & (REQ_FUA | REQ_PREFLUSH)) && + if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && bio_op(bio) != REQ_OP_DISCARD) { pb->tick = true; cache->need_tick_bio = false; @@ -828,11 +828,6 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) return to_oblock(block_nr); } -static int bio_triggers_commit(struct cache *cache, struct bio *bio) -{ - return bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); -} - /* * You must increment the deferred set whilst the prison cell is held. To * encourage this, we ask for 'cell' to be passed in. @@ -884,7 +879,7 @@ static void issue(struct cache *cache, struct bio *bio) { unsigned long flags; - if (!bio_triggers_commit(cache, bio)) { + if (!op_is_flush(bio->bi_opf)) { accounted_request(cache, bio); return; } @@ -989,7 +984,8 @@ static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mod enum cache_metadata_mode old_mode = get_cache_mode(cache); if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) { - DMERR("unable to read needs_check flag, setting failure mode"); + DMERR("%s: unable to read needs_check flag, setting failure mode.", + cache_device_name(cache)); new_mode = CM_FAIL; } @@ -1068,8 +1064,7 @@ static void dec_io_migrations(struct cache *cache) static bool discard_or_flush(struct bio *bio) { - return bio_op(bio) == REQ_OP_DISCARD || - bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); + return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); } static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) @@ -2290,7 +2285,7 @@ static void do_waker(struct work_struct *ws) static int is_congested(struct dm_dev *dev, int bdi_bits) { struct request_queue *q = bdev_get_queue(dev->bdev); - return bdi_congested(&q->backing_dev_info, bdi_bits); + return bdi_congested(q->backing_dev_info, bdi_bits); } static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) @@ -2540,13 +2535,14 @@ static void init_features(struct cache_features *cf) { cf->mode = CM_WRITE; cf->io_mode = CM_IO_WRITEBACK; + cf->metadata_version = 1; } static int parse_features(struct cache_args *ca, struct dm_arg_set *as, char **error) { static struct dm_arg _args[] = { - {0, 1, "Invalid number of cache feature arguments"}, + {0, 2, "Invalid number of cache feature arguments"}, }; int r; @@ -2572,6 +2568,9 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as, else if (!strcasecmp(arg, "passthrough")) cf->io_mode = CM_IO_PASSTHROUGH; + else if (!strcasecmp(arg, "metadata2")) + cf->metadata_version = 2; + else { *error = "Unrecognised cache feature requested"; return -EINVAL; @@ -2826,7 +2825,8 @@ static int cache_create(struct cache_args *ca, struct cache **result) cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, ca->block_size, may_format, - dm_cache_policy_get_hint_size(cache->policy)); + dm_cache_policy_get_hint_size(cache->policy), + ca->features.metadata_version); if (IS_ERR(cmd)) { *error = "Error creating metadata object"; r = PTR_ERR(cmd); @@ -3171,21 +3171,16 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) static int write_dirty_bitset(struct cache *cache) { - unsigned i, r; + int r; if (get_cache_mode(cache) >= CM_READ_ONLY) return -EINVAL; - for (i = 0; i < from_cblock(cache->cache_size); i++) { - r = dm_cache_set_dirty(cache->cmd, to_cblock(i), - is_dirty(cache, to_cblock(i))); - if (r) { - metadata_operation_failed(cache, "dm_cache_set_dirty", r); - return r; - } - } + r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset); + if (r) + metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r); - return 0; + return r; } static int write_discard_bitset(struct cache *cache) @@ -3546,11 +3541,11 @@ static void cache_status(struct dm_target *ti, status_type_t type, residency = policy_residency(cache->policy); - DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ", + DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ", (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), (unsigned long long)nr_blocks_metadata, - cache->sectors_per_block, + (unsigned long long)cache->sectors_per_block, (unsigned long long) from_cblock(residency), (unsigned long long) from_cblock(cache->cache_size), (unsigned) atomic_read(&cache->stats.read_hit), @@ -3561,14 +3556,19 @@ static void cache_status(struct dm_target *ti, status_type_t type, (unsigned) atomic_read(&cache->stats.promotion), (unsigned long) atomic_read(&cache->nr_dirty)); + if (cache->features.metadata_version == 2) + DMEMIT("2 metadata2 "); + else + DMEMIT("1 "); + if (writethrough_mode(&cache->features)) - DMEMIT("1 writethrough "); + DMEMIT("writethrough "); else if (passthrough_mode(&cache->features)) - DMEMIT("1 passthrough "); + DMEMIT("passthrough "); else if (writeback_mode(&cache->features)) - DMEMIT("1 writeback "); + DMEMIT("writeback "); else { DMERR("%s: internal error: unknown io mode: %d", @@ -3816,7 +3816,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 9, 0}, + .version = {1, 10, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 40ceba1fe8be..136fda3ff9e5 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -92,7 +92,6 @@ struct mapped_device { * io objects are allocated from here. */ mempool_t *io_pool; - mempool_t *rq_pool; struct bio_set *bs; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index a2768835d394..389a3637ffcc 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/key.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/mempool.h> @@ -23,12 +24,14 @@ #include <linux/atomic.h> #include <linux/scatterlist.h> #include <linux/rbtree.h> +#include <linux/ctype.h> #include <asm/page.h> #include <asm/unaligned.h> #include <crypto/hash.h> #include <crypto/md5.h> #include <crypto/algapi.h> #include <crypto/skcipher.h> +#include <keys/user-type.h> #include <linux/device-mapper.h> @@ -140,8 +143,9 @@ struct crypt_config { char *cipher; char *cipher_string; + char *key_string; - struct crypt_iv_operations *iv_gen_ops; + const struct crypt_iv_operations *iv_gen_ops; union { struct iv_essiv_private essiv; struct iv_benbi_private benbi; @@ -758,15 +762,15 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, return r; } -static struct crypt_iv_operations crypt_iv_plain_ops = { +static const struct crypt_iv_operations crypt_iv_plain_ops = { .generator = crypt_iv_plain_gen }; -static struct crypt_iv_operations crypt_iv_plain64_ops = { +static const struct crypt_iv_operations crypt_iv_plain64_ops = { .generator = crypt_iv_plain64_gen }; -static struct crypt_iv_operations crypt_iv_essiv_ops = { +static const struct crypt_iv_operations crypt_iv_essiv_ops = { .ctr = crypt_iv_essiv_ctr, .dtr = crypt_iv_essiv_dtr, .init = crypt_iv_essiv_init, @@ -774,17 +778,17 @@ static struct crypt_iv_operations crypt_iv_essiv_ops = { .generator = crypt_iv_essiv_gen }; -static struct crypt_iv_operations crypt_iv_benbi_ops = { +static const struct crypt_iv_operations crypt_iv_benbi_ops = { .ctr = crypt_iv_benbi_ctr, .dtr = crypt_iv_benbi_dtr, .generator = crypt_iv_benbi_gen }; -static struct crypt_iv_operations crypt_iv_null_ops = { +static const struct crypt_iv_operations crypt_iv_null_ops = { .generator = crypt_iv_null_gen }; -static struct crypt_iv_operations crypt_iv_lmk_ops = { +static const struct crypt_iv_operations crypt_iv_lmk_ops = { .ctr = crypt_iv_lmk_ctr, .dtr = crypt_iv_lmk_dtr, .init = crypt_iv_lmk_init, @@ -793,7 +797,7 @@ static struct crypt_iv_operations crypt_iv_lmk_ops = { .post = crypt_iv_lmk_post }; -static struct crypt_iv_operations crypt_iv_tcw_ops = { +static const struct crypt_iv_operations crypt_iv_tcw_ops = { .ctr = crypt_iv_tcw_ctr, .dtr = crypt_iv_tcw_dtr, .init = crypt_iv_tcw_init, @@ -994,7 +998,6 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size) gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM; unsigned i, len, remaining_size; struct page *page; - struct bio_vec *bvec; retry: if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) @@ -1019,12 +1022,7 @@ retry: len = (remaining_size > PAGE_SIZE) ? PAGE_SIZE : remaining_size; - bvec = &clone->bi_io_vec[clone->bi_vcnt++]; - bvec->bv_page = page; - bvec->bv_len = len; - bvec->bv_offset = 0; - - clone->bi_iter.bi_size += len; + bio_add_page(clone, page, len, 0); remaining_size -= len; } @@ -1135,7 +1133,7 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) clone->bi_private = io; clone->bi_end_io = crypt_endio; clone->bi_bdev = cc->dev->bdev; - bio_set_op_attrs(clone, bio_op(io->base_bio), bio_flags(io->base_bio)); + clone->bi_opf = io->base_bio->bi_opf; } static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) @@ -1212,14 +1210,14 @@ continue_locked: spin_unlock_irq(&cc->write_thread_wait.lock); if (unlikely(kthread_should_stop())) { - set_task_state(current, TASK_RUNNING); + set_current_state(TASK_RUNNING); remove_wait_queue(&cc->write_thread_wait, &wait); break; } schedule(); - set_task_state(current, TASK_RUNNING); + set_current_state(TASK_RUNNING); spin_lock_irq(&cc->write_thread_wait.lock); __remove_wait_queue(&cc->write_thread_wait, &wait); goto continue_locked; @@ -1471,7 +1469,7 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode) return 0; } -static int crypt_setkey_allcpus(struct crypt_config *cc) +static int crypt_setkey(struct crypt_config *cc) { unsigned subkey_size; int err = 0, i, r; @@ -1490,25 +1488,157 @@ static int crypt_setkey_allcpus(struct crypt_config *cc) return err; } +#ifdef CONFIG_KEYS + +static bool contains_whitespace(const char *str) +{ + while (*str) + if (isspace(*str++)) + return true; + return false; +} + +static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string) +{ + char *new_key_string, *key_desc; + int ret; + struct key *key; + const struct user_key_payload *ukp; + + /* + * Reject key_string with whitespace. dm core currently lacks code for + * proper whitespace escaping in arguments on DM_TABLE_STATUS path. + */ + if (contains_whitespace(key_string)) { + DMERR("whitespace chars not allowed in key string"); + return -EINVAL; + } + + /* look for next ':' separating key_type from key_description */ + key_desc = strpbrk(key_string, ":"); + if (!key_desc || key_desc == key_string || !strlen(key_desc + 1)) + return -EINVAL; + + if (strncmp(key_string, "logon:", key_desc - key_string + 1) && + strncmp(key_string, "user:", key_desc - key_string + 1)) + return -EINVAL; + + new_key_string = kstrdup(key_string, GFP_KERNEL); + if (!new_key_string) + return -ENOMEM; + + key = request_key(key_string[0] == 'l' ? &key_type_logon : &key_type_user, + key_desc + 1, NULL); + if (IS_ERR(key)) { + kzfree(new_key_string); + return PTR_ERR(key); + } + + down_read(&key->sem); + + ukp = user_key_payload_locked(key); + if (!ukp) { + up_read(&key->sem); + key_put(key); + kzfree(new_key_string); + return -EKEYREVOKED; + } + + if (cc->key_size != ukp->datalen) { + up_read(&key->sem); + key_put(key); + kzfree(new_key_string); + return -EINVAL; + } + + memcpy(cc->key, ukp->data, cc->key_size); + + up_read(&key->sem); + key_put(key); + + /* clear the flag since following operations may invalidate previously valid key */ + clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); + + ret = crypt_setkey(cc); + + /* wipe the kernel key payload copy in each case */ + memset(cc->key, 0, cc->key_size * sizeof(u8)); + + if (!ret) { + set_bit(DM_CRYPT_KEY_VALID, &cc->flags); + kzfree(cc->key_string); + cc->key_string = new_key_string; + } else + kzfree(new_key_string); + + return ret; +} + +static int get_key_size(char **key_string) +{ + char *colon, dummy; + int ret; + + if (*key_string[0] != ':') + return strlen(*key_string) >> 1; + + /* look for next ':' in key string */ + colon = strpbrk(*key_string + 1, ":"); + if (!colon) + return -EINVAL; + + if (sscanf(*key_string + 1, "%u%c", &ret, &dummy) != 2 || dummy != ':') + return -EINVAL; + + *key_string = colon; + + /* remaining key string should be :<logon|user>:<key_desc> */ + + return ret; +} + +#else + +static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string) +{ + return -EINVAL; +} + +static int get_key_size(char **key_string) +{ + return (*key_string[0] == ':') ? -EINVAL : strlen(*key_string) >> 1; +} + +#endif + static int crypt_set_key(struct crypt_config *cc, char *key) { int r = -EINVAL; int key_string_len = strlen(key); - /* The key size may not be changed. */ - if (cc->key_size != (key_string_len >> 1)) - goto out; - /* Hyphen (which gives a key_size of zero) means there is no key. */ if (!cc->key_size && strcmp(key, "-")) goto out; - if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) + /* ':' means the key is in kernel keyring, short-circuit normal key processing */ + if (key[0] == ':') { + r = crypt_set_keyring_key(cc, key + 1); goto out; + } - set_bit(DM_CRYPT_KEY_VALID, &cc->flags); + /* clear the flag since following operations may invalidate previously valid key */ + clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); - r = crypt_setkey_allcpus(cc); + /* wipe references to any kernel keyring key */ + kzfree(cc->key_string); + cc->key_string = NULL; + + if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) + goto out; + + r = crypt_setkey(cc); + if (!r) + set_bit(DM_CRYPT_KEY_VALID, &cc->flags); out: /* Hex key string not needed after here, so wipe it. */ @@ -1521,8 +1651,10 @@ static int crypt_wipe_key(struct crypt_config *cc) { clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); memset(&cc->key, 0, cc->key_size * sizeof(u8)); + kzfree(cc->key_string); + cc->key_string = NULL; - return crypt_setkey_allcpus(cc); + return crypt_setkey(cc); } static void crypt_dtr(struct dm_target *ti) @@ -1558,6 +1690,7 @@ static void crypt_dtr(struct dm_target *ti) kzfree(cc->cipher); kzfree(cc->cipher_string); + kzfree(cc->key_string); /* Must zero key material before freeing */ kzfree(cc); @@ -1726,12 +1859,13 @@ bad_mem: /* * Construct an encryption mapping: - * <cipher> <key> <iv_offset> <dev_path> <start> + * <cipher> [<key>|:<key_size>:<user|logon>:<key_description>] <iv_offset> <dev_path> <start> */ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct crypt_config *cc; - unsigned int key_size, opt_params; + int key_size; + unsigned int opt_params; unsigned long long tmpll; int ret; size_t iv_size_padding; @@ -1748,7 +1882,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -EINVAL; } - key_size = strlen(argv[1]) >> 1; + key_size = get_key_size(&argv[1]); + if (key_size < 0) { + ti->error = "Cannot parse key size"; + return -EINVAL; + } cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); if (!cc) { @@ -1955,10 +2093,13 @@ static void crypt_status(struct dm_target *ti, status_type_t type, case STATUSTYPE_TABLE: DMEMIT("%s ", cc->cipher_string); - if (cc->key_size > 0) - for (i = 0; i < cc->key_size; i++) - DMEMIT("%02x", cc->key[i]); - else + if (cc->key_size > 0) { + if (cc->key_string) + DMEMIT(":%u:%s", cc->key_size, cc->key_string); + else + for (i = 0; i < cc->key_size; i++) + DMEMIT("%02x", cc->key[i]); + } else DMEMIT("-"); DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset, @@ -2014,7 +2155,7 @@ static void crypt_resume(struct dm_target *ti) static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) { struct crypt_config *cc = ti->private; - int ret = -EINVAL; + int key_size, ret = -EINVAL; if (argc < 2) goto error; @@ -2025,6 +2166,13 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv) return -EINVAL; } if (argc == 3 && !strcasecmp(argv[1], "set")) { + /* The key size may not be changed. */ + key_size = get_key_size(&argv[2]); + if (key_size < 0 || cc->key_size != key_size) { + memset(argv[2], '0', strlen(argv[2])); + return -EINVAL; + } + ret = crypt_set_key(cc, argv[2]); if (ret) return ret; @@ -2068,7 +2216,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type crypt_target = { .name = "crypt", - .version = {1, 14, 1}, + .version = {1, 15, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index bf2b2676cb8a..9fab33b113c4 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1379,7 +1379,7 @@ static void stop_worker(struct era *era) static int dev_is_congested(struct dm_dev *dev, int bdi_bits) { struct request_queue *q = bdev_get_queue(dev->bdev); - return bdi_congested(&q->backing_dev_info, bdi_bits); + return bdi_congested(q->backing_dev_info, bdi_bits); } static int era_is_congested(struct dm_target_callbacks *cb, int bdi_bits) diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 6a2e8dd44a1b..13305a182611 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -36,7 +36,8 @@ struct flakey_c { }; enum feature_flag_bits { - DROP_WRITES + DROP_WRITES, + ERROR_WRITES }; struct per_bio_data { @@ -76,6 +77,25 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, if (test_and_set_bit(DROP_WRITES, &fc->flags)) { ti->error = "Feature drop_writes duplicated"; return -EINVAL; + } else if (test_bit(ERROR_WRITES, &fc->flags)) { + ti->error = "Feature drop_writes conflicts with feature error_writes"; + return -EINVAL; + } + + continue; + } + + /* + * error_writes + */ + if (!strcasecmp(arg_name, "error_writes")) { + if (test_and_set_bit(ERROR_WRITES, &fc->flags)) { + ti->error = "Feature error_writes duplicated"; + return -EINVAL; + + } else if (test_bit(DROP_WRITES, &fc->flags)) { + ti->error = "Feature error_writes conflicts with feature drop_writes"; + return -EINVAL; } continue; @@ -135,6 +155,10 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) { ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set"; return -EINVAL; + + } else if (test_bit(ERROR_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) { + ti->error = "error_writes is incompatible with corrupt_bio_byte with the WRITE flag set"; + return -EINVAL; } return 0; @@ -200,11 +224,13 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (!(fc->up_interval + fc->down_interval)) { ti->error = "Total (up + down) interval is zero"; + r = -EINVAL; goto bad; } if (fc->up_interval + fc->down_interval < fc->up_interval) { ti->error = "Interval overflow"; + r = -EINVAL; goto bad; } @@ -289,22 +315,27 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) pb->bio_submitted = true; /* - * Error reads if neither corrupt_bio_byte or drop_writes are set. + * Error reads if neither corrupt_bio_byte or drop_writes or error_writes are set. * Otherwise, flakey_end_io() will decide if the reads should be modified. */ if (bio_data_dir(bio) == READ) { - if (!fc->corrupt_bio_byte && !test_bit(DROP_WRITES, &fc->flags)) + if (!fc->corrupt_bio_byte && !test_bit(DROP_WRITES, &fc->flags) && + !test_bit(ERROR_WRITES, &fc->flags)) return -EIO; goto map_bio; } /* - * Drop writes? + * Drop or error writes? */ if (test_bit(DROP_WRITES, &fc->flags)) { bio_endio(bio); return DM_MAPIO_SUBMITTED; } + else if (test_bit(ERROR_WRITES, &fc->flags)) { + bio_io_error(bio); + return DM_MAPIO_SUBMITTED; + } /* * Corrupt matching writes. @@ -340,10 +371,11 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, int error) */ corrupt_bio_data(bio, fc); - } else if (!test_bit(DROP_WRITES, &fc->flags)) { + } else if (!test_bit(DROP_WRITES, &fc->flags) && + !test_bit(ERROR_WRITES, &fc->flags)) { /* * Error read during the down_interval if drop_writes - * wasn't configured. + * and error_writes were not configured. */ return -EIO; } @@ -357,7 +389,7 @@ static void flakey_status(struct dm_target *ti, status_type_t type, { unsigned sz = 0; struct flakey_c *fc = ti->private; - unsigned drop_writes; + unsigned drop_writes, error_writes; switch (type) { case STATUSTYPE_INFO: @@ -370,10 +402,13 @@ static void flakey_status(struct dm_target *ti, status_type_t type, fc->down_interval); drop_writes = test_bit(DROP_WRITES, &fc->flags); - DMEMIT("%u ", drop_writes + (fc->corrupt_bio_byte > 0) * 5); + error_writes = test_bit(ERROR_WRITES, &fc->flags); + DMEMIT("%u ", drop_writes + error_writes + (fc->corrupt_bio_byte > 0) * 5); if (drop_writes) DMEMIT("drop_writes "); + else if (error_writes) + DMEMIT("error_writes "); if (fc->corrupt_bio_byte) DMEMIT("corrupt_bio_byte %u %c %u %u ", @@ -410,7 +445,7 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_ static struct target_type flakey_target = { .name = "flakey", - .version = {1, 3, 1}, + .version = {1, 4, 0}, .module = THIS_MODULE, .ctr = flakey_ctr, .dtr = flakey_dtr, diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 0bf1a12e35fe..03940bf36f6c 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -162,7 +162,10 @@ struct dpages { struct page **p, unsigned long *len, unsigned *offset); void (*next_page)(struct dpages *dp); - unsigned context_u; + union { + unsigned context_u; + struct bvec_iter context_bi; + }; void *context_ptr; void *vma_invalidate_address; @@ -204,25 +207,36 @@ static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offse static void bio_get_page(struct dpages *dp, struct page **p, unsigned long *len, unsigned *offset) { - struct bio_vec *bvec = dp->context_ptr; - *p = bvec->bv_page; - *len = bvec->bv_len - dp->context_u; - *offset = bvec->bv_offset + dp->context_u; + struct bio_vec bvec = bvec_iter_bvec((struct bio_vec *)dp->context_ptr, + dp->context_bi); + + *p = bvec.bv_page; + *len = bvec.bv_len; + *offset = bvec.bv_offset; + + /* avoid figuring it out again in bio_next_page() */ + dp->context_bi.bi_sector = (sector_t)bvec.bv_len; } static void bio_next_page(struct dpages *dp) { - struct bio_vec *bvec = dp->context_ptr; - dp->context_ptr = bvec + 1; - dp->context_u = 0; + unsigned int len = (unsigned int)dp->context_bi.bi_sector; + + bvec_iter_advance((struct bio_vec *)dp->context_ptr, + &dp->context_bi, len); } static void bio_dp_init(struct dpages *dp, struct bio *bio) { dp->get_page = bio_get_page; dp->next_page = bio_next_page; - dp->context_ptr = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); - dp->context_u = bio->bi_iter.bi_bvec_done; + + /* + * We just use bvec iterator to retrieve pages, so it is ok to + * access the bvec table directly here + */ + dp->context_ptr = bio->bi_io_vec; + dp->context_bi = bio->bi_iter; } /* diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 966eb4b61aed..4da6fc6b1ffd 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/miscdevice.h> +#include <linux/sched/mm.h> #include <linux/init.h> #include <linux/wait.h> #include <linux/slab.h> @@ -17,7 +18,7 @@ #include <linux/hdreg.h> #include <linux/compat.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define DM_MSG_PREFIX "ioctl" #define DM_DRIVER_EMAIL "dm-devel@redhat.com" @@ -1697,7 +1698,7 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern { struct dm_ioctl *dmi; int secure_data; - const size_t minimum_data_size = sizeof(*param_kernel) - sizeof(param_kernel->data); + const size_t minimum_data_size = offsetof(struct dm_ioctl, data); if (copy_from_user(param_kernel, user, minimum_data_size)) return -EFAULT; diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 07fc1ad42ec5..33e71ea6cc14 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -308,7 +308,7 @@ static int flush_header(struct log_c *lc) }; lc->io_req.bi_op = REQ_OP_WRITE; - lc->io_req.bi_op_flags = WRITE_FLUSH; + lc->io_req.bi_op_flags = REQ_PREFLUSH; return dm_io(&lc->io_req, 1, &null_location, NULL); } diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index e477af8596e2..7f223dbed49f 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -92,12 +92,6 @@ struct multipath { unsigned queue_mode; - /* - * We must use a mempool of dm_mpath_io structs so that we - * can resubmit bios on error. - */ - mempool_t *mpio_pool; - struct mutex work_mutex; struct work_struct trigger_event; @@ -115,8 +109,6 @@ struct dm_mpath_io { typedef int (*action_fn) (struct pgpath *pgpath); -static struct kmem_cache *_mpio_cache; - static struct workqueue_struct *kmultipathd, *kmpath_handlerd; static void trigger_event(struct work_struct *work); static void activate_path(struct work_struct *work); @@ -209,7 +201,6 @@ static struct multipath *alloc_multipath(struct dm_target *ti) init_waitqueue_head(&m->pg_init_wait); mutex_init(&m->work_mutex); - m->mpio_pool = NULL; m->queue_mode = DM_TYPE_NONE; m->ti = ti; @@ -229,16 +220,7 @@ static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) m->queue_mode = DM_TYPE_MQ_REQUEST_BASED; else m->queue_mode = DM_TYPE_REQUEST_BASED; - } - - if (m->queue_mode == DM_TYPE_REQUEST_BASED) { - unsigned min_ios = dm_get_reserved_rq_based_ios(); - - m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache); - if (!m->mpio_pool) - return -ENOMEM; - } - else if (m->queue_mode == DM_TYPE_BIO_BASED) { + } else if (m->queue_mode == DM_TYPE_BIO_BASED) { INIT_WORK(&m->process_queued_bios, process_queued_bios); /* * bio-based doesn't support any direct scsi_dh management; @@ -263,7 +245,6 @@ static void free_multipath(struct multipath *m) kfree(m->hw_handler_name); kfree(m->hw_handler_params); - mempool_destroy(m->mpio_pool); kfree(m); } @@ -272,38 +253,6 @@ static struct dm_mpath_io *get_mpio(union map_info *info) return info->ptr; } -static struct dm_mpath_io *set_mpio(struct multipath *m, union map_info *info) -{ - struct dm_mpath_io *mpio; - - if (!m->mpio_pool) { - /* Use blk-mq pdu memory requested via per_io_data_size */ - mpio = get_mpio(info); - memset(mpio, 0, sizeof(*mpio)); - return mpio; - } - - mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); - if (!mpio) - return NULL; - - memset(mpio, 0, sizeof(*mpio)); - info->ptr = mpio; - - return mpio; -} - -static void clear_request_fn_mpio(struct multipath *m, union map_info *info) -{ - /* Only needed for non blk-mq (.request_fn) multipath */ - if (m->mpio_pool) { - struct dm_mpath_io *mpio = info->ptr; - - info->ptr = NULL; - mempool_free(mpio, m->mpio_pool); - } -} - static size_t multipath_per_bio_data_size(void) { return sizeof(struct dm_mpath_io) + sizeof(struct dm_bio_details); @@ -372,16 +321,13 @@ static int __pg_init_all_paths(struct multipath *m) return atomic_read(&m->pg_init_in_progress); } -static int pg_init_all_paths(struct multipath *m) +static void pg_init_all_paths(struct multipath *m) { - int r; unsigned long flags; spin_lock_irqsave(&m->lock, flags); - r = __pg_init_all_paths(m); + __pg_init_all_paths(m); spin_unlock_irqrestore(&m->lock, flags); - - return r; } static void __switch_pg(struct multipath *m, struct priority_group *pg) @@ -430,7 +376,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) unsigned long flags; struct priority_group *pg; struct pgpath *pgpath; - bool bypassed = true; + unsigned bypassed = 1; if (!atomic_read(&m->nr_valid_paths)) { clear_bit(MPATHF_QUEUE_IO, &m->flags); @@ -469,7 +415,7 @@ check_current_pg: */ do { list_for_each_entry(pg, &m->priority_groups, list) { - if (pg->bypassed == bypassed) + if (pg->bypassed == !!bypassed) continue; pgpath = choose_path_in_pg(m, pg, nr_bytes); if (!IS_ERR_OR_NULL(pgpath)) { @@ -533,16 +479,17 @@ static bool must_push_back_bio(struct multipath *m) /* * Map cloned requests (request-based multipath) */ -static int __multipath_map(struct dm_target *ti, struct request *clone, - union map_info *map_context, - struct request *rq, struct request **__clone) +static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, + union map_info *map_context, + struct request **__clone) { struct multipath *m = ti->private; int r = DM_MAPIO_REQUEUE; - size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq); + size_t nr_bytes = blk_rq_bytes(rq); struct pgpath *pgpath; struct block_device *bdev; - struct dm_mpath_io *mpio; + struct dm_mpath_io *mpio = get_mpio(map_context); + struct request *clone; /* Do we need to select a new pgpath? */ pgpath = lockless_dereference(m->current_pgpath); @@ -559,41 +506,23 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, return r; } - mpio = set_mpio(m, map_context); - if (!mpio) - /* ENOMEM, requeue */ - return r; - + memset(mpio, 0, sizeof(*mpio)); mpio->pgpath = pgpath; mpio->nr_bytes = nr_bytes; bdev = pgpath->path.dev->bdev; - if (clone) { - /* - * Old request-based interface: allocated clone is passed in. - * Used by: .request_fn stacked on .request_fn path(s). - */ - clone->q = bdev_get_queue(bdev); - clone->rq_disk = bdev->bd_disk; - clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; - } else { - /* - * blk-mq request-based interface; used by both: - * .request_fn stacked on blk-mq path(s) and - * blk-mq stacked on blk-mq path(s). - */ - *__clone = blk_mq_alloc_request(bdev_get_queue(bdev), - rq_data_dir(rq), BLK_MQ_REQ_NOWAIT); - if (IS_ERR(*__clone)) { - /* ENOMEM, requeue */ - clear_request_fn_mpio(m, map_context); - return r; - } - (*__clone)->bio = (*__clone)->biotail = NULL; - (*__clone)->rq_disk = bdev->bd_disk; - (*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT; + clone = blk_get_request(bdev_get_queue(bdev), + rq->cmd_flags | REQ_NOMERGE, + GFP_ATOMIC); + if (IS_ERR(clone)) { + /* EBUSY, ENODEV or EWOULDBLOCK: requeue */ + return r; } + clone->bio = clone->biotail = NULL; + clone->rq_disk = bdev->bd_disk; + clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; + *__clone = clone; if (pgpath->pg->ps.type->start_io) pgpath->pg->ps.type->start_io(&pgpath->pg->ps, @@ -602,22 +531,9 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, return DM_MAPIO_REMAPPED; } -static int multipath_map(struct dm_target *ti, struct request *clone, - union map_info *map_context) -{ - return __multipath_map(ti, clone, map_context, NULL, NULL); -} - -static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, - union map_info *map_context, - struct request **clone) -{ - return __multipath_map(ti, NULL, map_context, rq, clone); -} - static void multipath_release_clone(struct request *clone) { - blk_mq_free_request(clone); + blk_put_request(clone); } /* @@ -852,18 +768,22 @@ retain: attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL); if (attached_handler_name) { /* + * Clear any hw_handler_params associated with a + * handler that isn't already attached. + */ + if (m->hw_handler_name && strcmp(attached_handler_name, m->hw_handler_name)) { + kfree(m->hw_handler_params); + m->hw_handler_params = NULL; + } + + /* * Reset hw_handler_name to match the attached handler - * and clear any hw_handler_params associated with the - * ignored handler. * * NB. This modifies the table line to show the actual * handler instead of the original table passed in. */ kfree(m->hw_handler_name); m->hw_handler_name = attached_handler_name; - - kfree(m->hw_handler_params); - m->hw_handler_params = NULL; } } @@ -1002,6 +922,8 @@ static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m) } m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL); + if (!m->hw_handler_name) + return -EINVAL; if (hw_argc > 1) { char *p; @@ -1183,7 +1105,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->num_write_same_bios = 1; if (m->queue_mode == DM_TYPE_BIO_BASED) ti->per_io_data_size = multipath_per_bio_data_size(); - else if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED) + else ti->per_io_data_size = sizeof(struct dm_mpath_io); return 0; @@ -1362,7 +1284,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) char dummy; if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || - (pgnum > m->nr_priority_groups)) { + !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) { DMWARN("invalid PG number supplied to switch_pg_num"); return -EINVAL; } @@ -1394,7 +1316,7 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed) char dummy; if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || - (pgnum > m->nr_priority_groups)) { + !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) { DMWARN("invalid PG number supplied to bypass_pg"); return -EINVAL; } @@ -1606,7 +1528,6 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, if (ps->type->end_io) ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); } - clear_request_fn_mpio(m, map_context); return r; } @@ -2056,7 +1977,6 @@ static struct target_type multipath_target = { .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, - .map_rq = multipath_map, .clone_and_map_rq = multipath_clone_and_map, .release_clone_rq = multipath_release_clone, .rq_end_io = multipath_end_io, @@ -2076,11 +1996,6 @@ static int __init dm_multipath_init(void) { int r; - /* allocate a slab for the dm_mpath_ios */ - _mpio_cache = KMEM_CACHE(dm_mpath_io, 0); - if (!_mpio_cache) - return -ENOMEM; - r = dm_register_target(&multipath_target); if (r < 0) { DMERR("request-based register failed %d", r); @@ -2116,8 +2031,6 @@ bad_alloc_kmpath_handlerd: bad_alloc_kmultipathd: dm_unregister_target(&multipath_target); bad_register_target: - kmem_cache_destroy(_mpio_cache); - return r; } @@ -2127,7 +2040,6 @@ static void __exit dm_multipath_exit(void) destroy_workqueue(kmultipathd); dm_unregister_target(&multipath_target); - kmem_cache_destroy(_mpio_cache); } module_init(dm_multipath_init); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 6d53810963f7..f8564d63982f 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -24,6 +24,11 @@ */ #define MIN_FREE_RESHAPE_SPACE to_sector(4*4096) +/* + * Minimum journal space 4 MiB in sectors. + */ +#define MIN_RAID456_JOURNAL_SPACE (4*2048) + static bool devices_handle_discard_safely = false; /* @@ -73,6 +78,9 @@ struct raid_dev { #define __CTR_FLAG_DATA_OFFSET 13 /* 2 */ /* Only with reshapable raid4/5/6/10! */ #define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */ +/* New for v1.10.0 */ +#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6! */ + /* * Flags for rs->ctr_flags field. */ @@ -91,6 +99,9 @@ struct raid_dev { #define CTR_FLAG_DELTA_DISKS (1 << __CTR_FLAG_DELTA_DISKS) #define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET) #define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS) +#define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV) + +#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET) /* * Definitions of various constructor flags to @@ -160,22 +171,22 @@ struct raid_dev { CTR_FLAG_DAEMON_SLEEP | \ CTR_FLAG_MIN_RECOVERY_RATE | \ CTR_FLAG_MAX_RECOVERY_RATE | \ - CTR_FLAG_MAX_WRITE_BEHIND | \ CTR_FLAG_STRIPE_CACHE | \ CTR_FLAG_REGION_SIZE | \ CTR_FLAG_DELTA_DISKS | \ - CTR_FLAG_DATA_OFFSET) + CTR_FLAG_DATA_OFFSET | \ + CTR_FLAG_JOURNAL_DEV) #define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \ CTR_FLAG_REBUILD | \ CTR_FLAG_DAEMON_SLEEP | \ CTR_FLAG_MIN_RECOVERY_RATE | \ CTR_FLAG_MAX_RECOVERY_RATE | \ - CTR_FLAG_MAX_WRITE_BEHIND | \ CTR_FLAG_STRIPE_CACHE | \ CTR_FLAG_REGION_SIZE | \ CTR_FLAG_DELTA_DISKS | \ - CTR_FLAG_DATA_OFFSET) + CTR_FLAG_DATA_OFFSET | \ + CTR_FLAG_JOURNAL_DEV) /* ...valid options definitions per raid level */ /* @@ -224,6 +235,12 @@ struct raid_set { struct raid_type *raid_type; struct dm_target_callbacks callbacks; + /* Optional raid4/5/6 journal device */ + struct journal_dev { + struct dm_dev *dev; + struct md_rdev rdev; + } journal_dev; + struct raid_dev dev[0]; }; @@ -308,6 +325,7 @@ static struct arg_name_flag { { CTR_FLAG_DATA_OFFSET, "data_offset"}, { CTR_FLAG_DELTA_DISKS, "delta_disks"}, { CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"}, + { CTR_FLAG_JOURNAL_DEV, "journal_dev" }, }; /* Return argument name string for given @flag */ @@ -372,7 +390,7 @@ static bool rs_is_reshapable(struct raid_set *rs) /* Return true, if raid set in @rs is recovering */ static bool rs_is_recovering(struct raid_set *rs) { - return rs->md.recovery_cp < rs->dev[0].rdev.sectors; + return rs->md.recovery_cp < rs->md.dev_sectors; } /* Return true, if raid set in @rs is reshaping */ @@ -629,7 +647,8 @@ static void rs_set_capacity(struct raid_set *rs) * is unintended in case of out-of-place reshaping */ rdev_for_each(rdev, mddev) - rdev->sectors = mddev->dev_sectors; + if (!test_bit(Journal, &rdev->flags)) + rdev->sectors = mddev->dev_sectors; set_capacity(gendisk, mddev->array_sectors); revalidate_disk(gendisk); @@ -715,6 +734,11 @@ static void raid_set_free(struct raid_set *rs) { int i; + if (rs->journal_dev.dev) { + md_rdev_clear(&rs->journal_dev.rdev); + dm_put_device(rs->ti, rs->journal_dev.dev); + } + for (i = 0; i < rs->raid_disks; i++) { if (rs->dev[i].meta_dev) dm_put_device(rs->ti, rs->dev[i].meta_dev); @@ -762,10 +786,11 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as) rs->dev[i].data_dev = NULL; /* - * There are no offsets, since there is a separate device - * for data and metadata. + * There are no offsets initially. + * Out of place reshape will set them accordingly. */ rs->dev[i].rdev.data_offset = 0; + rs->dev[i].rdev.new_data_offset = 0; rs->dev[i].rdev.mddev = &rs->md; arg = dm_shift_arg(as); @@ -823,6 +848,9 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as) rebuild++; } + if (rs->journal_dev.dev) + list_add_tail(&rs->journal_dev.rdev.same_set, &rs->md.disks); + if (metadata_available) { rs->md.external = 0; rs->md.persistent = 1; @@ -1028,6 +1056,8 @@ too_many: * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) * [stripe_cache <sectors>] Stripe cache size for higher RAIDs * [region_size <sectors>] Defines granularity of bitmap + * [journal_dev <dev>] raid4/5/6 journaling deviice + * (i.e. write hole closing log) * * RAID10-only options: * [raid10_copies <# copies>] Number of copies. (Default: 2) @@ -1135,7 +1165,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, /* * Parameters that take a string value are checked here. */ - + /* "raid10_format {near|offset|far} */ if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT))) { if (test_and_set_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) { rs->ti->error = "Only one 'raid10_format' argument pair allowed"; @@ -1153,6 +1183,41 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, continue; } + /* "journal_dev dev" */ + if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) { + int r; + struct md_rdev *jdev; + + if (test_and_set_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) { + rs->ti->error = "Only one raid4/5/6 set journaling device allowed"; + return -EINVAL; + } + if (!rt_is_raid456(rt)) { + rs->ti->error = "'journal_dev' is an invalid parameter for this RAID type"; + return -EINVAL; + } + r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table), + &rs->journal_dev.dev); + if (r) { + rs->ti->error = "raid4/5/6 journal device lookup failure"; + return r; + } + jdev = &rs->journal_dev.rdev; + md_rdev_init(jdev); + jdev->mddev = &rs->md; + jdev->bdev = rs->journal_dev.dev->bdev; + jdev->sectors = to_sector(i_size_read(jdev->bdev->bd_inode)); + if (jdev->sectors < MIN_RAID456_JOURNAL_SPACE) { + rs->ti->error = "No space for raid4/5/6 journal"; + return -ENOSPC; + } + set_bit(Journal, &jdev->flags); + continue; + } + + /* + * Parameters with number values from here on. + */ if (kstrtoint(arg, 10, &value) < 0) { rs->ti->error = "Bad numerical argument given in raid params"; return -EINVAL; @@ -1427,6 +1492,25 @@ static unsigned int rs_data_stripes(struct raid_set *rs) return rs->raid_disks - rs->raid_type->parity_devs; } +/* + * Retrieve rdev->sectors from any valid raid device of @rs + * to allow userpace to pass in arbitray "- -" device tupples. + */ +static sector_t __rdev_sectors(struct raid_set *rs) +{ + int i; + + for (i = 0; i < rs->md.raid_disks; i++) { + struct md_rdev *rdev = &rs->dev[i].rdev; + + if (!test_bit(Journal, &rdev->flags) && + rdev->bdev && rdev->sectors) + return rdev->sectors; + } + + BUG(); /* Constructor ensures we got some. */ +} + /* Calculate the sectors per device and per array used for @rs */ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) { @@ -1470,7 +1554,8 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) array_sectors = (data_stripes + delta_disks) * dev_sectors; rdev_for_each(rdev, mddev) - rdev->sectors = dev_sectors; + if (!test_bit(Journal, &rdev->flags)) + rdev->sectors = dev_sectors; mddev->array_sectors = array_sectors; mddev->dev_sectors = dev_sectors; @@ -1512,9 +1597,9 @@ static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) else if (dev_sectors == MaxSector) /* Prevent recovery */ __rs_setup_recovery(rs, MaxSector); - else if (rs->dev[0].rdev.sectors < dev_sectors) + else if (__rdev_sectors(rs) < dev_sectors) /* Grown raid set */ - __rs_setup_recovery(rs, rs->dev[0].rdev.sectors); + __rs_setup_recovery(rs, __rdev_sectors(rs)); else __rs_setup_recovery(rs, MaxSector); } @@ -1853,18 +1938,21 @@ static int rs_check_reshape(struct raid_set *rs) return -EPERM; } -static int read_disk_sb(struct md_rdev *rdev, int size) +static int read_disk_sb(struct md_rdev *rdev, int size, bool force_reload) { BUG_ON(!rdev->sb_page); - if (rdev->sb_loaded) + if (rdev->sb_loaded && !force_reload) return 0; + rdev->sb_loaded = 0; + if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) { DMERR("Failed to read superblock of device at position %d", rdev->raid_disk); md_error(rdev->mddev, rdev); - return -EINVAL; + set_bit(Faulty, &rdev->flags); + return -EIO; } rdev->sb_loaded = 1; @@ -1992,7 +2080,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) return -EINVAL; } - r = read_disk_sb(rdev, rdev->sb_size); + r = read_disk_sb(rdev, rdev->sb_size, false); if (r) return r; @@ -2011,7 +2099,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190); /* Force writing of superblocks to disk */ - set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &rdev->mddev->sb_flags); /* Any superblock is better than none, choose that if given */ return refdev ? 0 : 1; @@ -2050,16 +2138,17 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) mddev->reshape_position = MaxSector; + mddev->raid_disks = le32_to_cpu(sb->num_devices); + mddev->level = le32_to_cpu(sb->level); + mddev->layout = le32_to_cpu(sb->layout); + mddev->chunk_sectors = le32_to_cpu(sb->stripe_sectors); + /* * Reshaping is supported, e.g. reshape_position is valid * in superblock and superblock content is authoritative. */ if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190) { /* Superblock is authoritative wrt given raid set layout! */ - mddev->raid_disks = le32_to_cpu(sb->num_devices); - mddev->level = le32_to_cpu(sb->level); - mddev->layout = le32_to_cpu(sb->layout); - mddev->chunk_sectors = le32_to_cpu(sb->stripe_sectors); mddev->new_level = le32_to_cpu(sb->new_level); mddev->new_layout = le32_to_cpu(sb->new_layout); mddev->new_chunk_sectors = le32_to_cpu(sb->new_stripe_sectors); @@ -2087,38 +2176,44 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) /* * No takeover/reshaping, because we don't have the extended v1.9.0 metadata */ - if (le32_to_cpu(sb->level) != mddev->new_level) { - DMERR("Reshaping/takeover raid sets not yet supported. (raid level/stripes/size change)"); - return -EINVAL; - } - if (le32_to_cpu(sb->layout) != mddev->new_layout) { - DMERR("Reshaping raid sets not yet supported. (raid layout change)"); - DMERR(" 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout); - DMERR(" Old layout: %s w/ %d copies", - raid10_md_layout_to_format(le32_to_cpu(sb->layout)), - raid10_md_layout_to_copies(le32_to_cpu(sb->layout))); - DMERR(" New layout: %s w/ %d copies", - raid10_md_layout_to_format(mddev->layout), - raid10_md_layout_to_copies(mddev->layout)); - return -EINVAL; - } - if (le32_to_cpu(sb->stripe_sectors) != mddev->new_chunk_sectors) { - DMERR("Reshaping raid sets not yet supported. (stripe sectors change)"); - return -EINVAL; - } + struct raid_type *rt_cur = get_raid_type_by_ll(mddev->level, mddev->layout); + struct raid_type *rt_new = get_raid_type_by_ll(mddev->new_level, mddev->new_layout); - /* We can only change the number of devices in raid1 with old (i.e. pre 1.0.7) metadata */ - if (!rt_is_raid1(rs->raid_type) && - (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { - DMERR("Reshaping raid sets not yet supported. (device count change from %u to %u)", - sb->num_devices, mddev->raid_disks); + if (rs_takeover_requested(rs)) { + if (rt_cur && rt_new) + DMERR("Takeover raid sets from %s to %s not yet supported by metadata. (raid level change)", + rt_cur->name, rt_new->name); + else + DMERR("Takeover raid sets not yet supported by metadata. (raid level change)"); + return -EINVAL; + } else if (rs_reshape_requested(rs)) { + DMERR("Reshaping raid sets not yet supported by metadata. (raid layout change keeping level)"); + if (mddev->layout != mddev->new_layout) { + if (rt_cur && rt_new) + DMERR(" current layout %s vs new layout %s", + rt_cur->name, rt_new->name); + else + DMERR(" current layout 0x%X vs new layout 0x%X", + le32_to_cpu(sb->layout), mddev->new_layout); + } + if (mddev->chunk_sectors != mddev->new_chunk_sectors) + DMERR(" current stripe sectors %u vs new stripe sectors %u", + mddev->chunk_sectors, mddev->new_chunk_sectors); + if (rs->delta_disks) + DMERR(" current %u disks vs new %u disks", + mddev->raid_disks, mddev->raid_disks + rs->delta_disks); + if (rs_is_raid10(rs)) { + DMERR(" Old layout: %s w/ %u copies", + raid10_md_layout_to_format(mddev->layout), + raid10_md_layout_to_copies(mddev->layout)); + DMERR(" New layout: %s w/ %u copies", + raid10_md_layout_to_format(mddev->new_layout), + raid10_md_layout_to_copies(mddev->new_layout)); + } return -EINVAL; } DMINFO("Discovered old metadata format; upgrading to extended metadata format"); - - /* Table line is checked vs. authoritative superblock */ - rs_set_new(rs); } if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) @@ -2141,6 +2236,9 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) */ d = 0; rdev_for_each(r, mddev) { + if (test_bit(Journal, &rdev->flags)) + continue; + if (test_bit(FirstUse, &r->flags)) new_devs++; @@ -2196,7 +2294,8 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) */ sb_retrieve_failed_devices(sb, failed_devices); rdev_for_each(r, mddev) { - if (!r->sb_page) + if (test_bit(Journal, &rdev->flags) || + !r->sb_page) continue; sb2 = page_address(r->sb_page); sb2->failed_devices = 0; @@ -2211,7 +2310,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) continue; if (role != r->raid_disk) { - if (__is_raid10_near(mddev->layout)) { + if (rs_is_raid10(rs) && __is_raid10_near(mddev->layout)) { if (mddev->raid_disks % __raid10_near_copies(mddev->layout) || rs->raid_disks % rs->raid10_copies) { rs->ti->error = @@ -2248,7 +2347,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev) struct mddev *mddev = &rs->md; struct dm_raid_superblock *sb; - if (rs_is_raid0(rs) || !rdev->sb_page) + if (rs_is_raid0(rs) || !rdev->sb_page || rdev->raid_disk < 0) return 0; sb = page_address(rdev->sb_page); @@ -2273,7 +2372,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev) /* Enable bitmap creation for RAID levels != 0 */ mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096); - rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; + mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; if (!test_and_clear_bit(FirstUse, &rdev->flags)) { /* Retrieve device size stored in superblock to be prepared for shrink */ @@ -2311,21 +2410,22 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev) static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) { int r; - struct raid_dev *dev; - struct md_rdev *rdev, *tmp, *freshest; + struct md_rdev *rdev, *freshest; struct mddev *mddev = &rs->md; freshest = NULL; - rdev_for_each_safe(rdev, tmp, mddev) { + rdev_for_each(rdev, mddev) { + if (test_bit(Journal, &rdev->flags)) + continue; + /* * Skipping super_load due to CTR_FLAG_SYNC will cause * the array to undergo initialization again as * though it were new. This is the intended effect * of the "sync" directive. * - * When reshaping capability is added, we must ensure - * that the "sync" directive is disallowed during the - * reshape. + * With reshaping capability added, we must ensure that + * that the "sync" directive is disallowed during the reshape. */ if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) continue; @@ -2342,6 +2442,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) case 0: break; default: + /* This is a failure to read the superblock from the metadata device. */ /* * We have to keep any raid0 data/metadata device pairs or * the MD raid0 personality will fail to start the array. @@ -2349,33 +2450,16 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) if (rs_is_raid0(rs)) continue; - dev = container_of(rdev, struct raid_dev, rdev); - if (dev->meta_dev) - dm_put_device(ti, dev->meta_dev); - - dev->meta_dev = NULL; - rdev->meta_bdev = NULL; - - if (rdev->sb_page) - put_page(rdev->sb_page); - - rdev->sb_page = NULL; - - rdev->sb_loaded = 0; - /* - * We might be able to salvage the data device - * even though the meta device has failed. For - * now, we behave as though '- -' had been - * set for this device in the table. + * We keep the dm_devs to be able to emit the device tuple + * properly on the table line in raid_status() (rather than + * mistakenly acting as if '- -' got passed into the constructor). + * + * The rdev has to stay on the same_set list to allow for + * the attempt to restore faulty devices on second resume. */ - if (dev->data_dev) - dm_put_device(ti, dev->data_dev); - - dev->data_dev = NULL; - rdev->bdev = NULL; - - list_del(&rdev->same_set); + rdev->raid_disk = rdev->saved_raid_disk = -1; + break; } } @@ -2396,7 +2480,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) return -EINVAL; rdev_for_each(rdev, mddev) - if ((rdev != freshest) && super_validate(rs, rdev)) + if (!test_bit(Journal, &rdev->flags) && + rdev != freshest && + super_validate(rs, rdev)) return -EINVAL; return 0; } @@ -2483,10 +2569,12 @@ static int rs_adjust_data_offsets(struct raid_set *rs) return -ENOSPC; } out: - /* Adjust data offsets on all rdevs */ + /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */ rdev_for_each(rdev, &rs->md) { - rdev->data_offset = data_offset; - rdev->new_data_offset = new_data_offset; + if (!test_bit(Journal, &rdev->flags)) { + rdev->data_offset = data_offset; + rdev->new_data_offset = new_data_offset; + } } return 0; @@ -2499,8 +2587,10 @@ static void __reorder_raid_disk_indexes(struct raid_set *rs) struct md_rdev *rdev; rdev_for_each(rdev, &rs->md) { - rdev->raid_disk = i++; - rdev->saved_raid_disk = rdev->new_raid_disk = -1; + if (!test_bit(Journal, &rdev->flags)) { + rdev->raid_disk = i++; + rdev->saved_raid_disk = rdev->new_raid_disk = -1; + } } } @@ -2840,7 +2930,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) goto bad; - calculated_dev_sectors = rs->dev[0].rdev.sectors; + calculated_dev_sectors = rs->md.dev_sectors; /* * Backup any new raid set level, layout, ... @@ -2853,7 +2943,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) goto bad; - resize = calculated_dev_sectors != rs->dev[0].rdev.sectors; + resize = calculated_dev_sectors != __rdev_sectors(rs); INIT_WORK(&rs->md.event_work, do_table_event); ti->private = rs; @@ -2897,6 +2987,13 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } + /* We can't takeover a journaled raid4/5/6 */ + if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) { + ti->error = "Can't takeover a journaled raid4/5/6 set"; + r = -EPERM; + goto bad; + } + /* * If a takeover is needed, userspace sets any additional * devices to rebuild and we can check for a valid request here. @@ -2919,6 +3016,18 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) rs_set_new(rs); } else if (rs_reshape_requested(rs)) { /* + * No need to check for 'ongoing' takeover here, because takeover + * is an instant operation as oposed to an ongoing reshape. + */ + + /* We can't reshape a journaled raid4/5/6 */ + if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) { + ti->error = "Can't reshape a journaled raid4/5/6 set"; + r = -EPERM; + goto bad; + } + + /* * We can only prepare for a reshape here, because the * raid set needs to run to provide the repective reshape * check functions via its MD personality instance. @@ -2994,6 +3103,9 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) } } + /* Disable/enable discard support on raid set. */ + configure_discard_support(rs); + mddev_unlock(&rs->md); return 0; @@ -3063,18 +3175,23 @@ static const char *decipher_sync_action(struct mddev *mddev) } /* - * Return status string @rdev + * Return status string for @rdev * * Status characters: * - * 'D' = Dead/Failed device + * 'D' = Dead/Failed raid set component or raid4/5/6 journal device * 'a' = Alive but not in-sync - * 'A' = Alive and in-sync + * 'A' = Alive and in-sync raid set component or alive raid4/5/6 journal device + * '-' = Non-existing device (i.e. uspace passed '- -' into the ctr) */ static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync) { - if (test_bit(Faulty, &rdev->flags)) + if (!rdev->bdev) + return "-"; + else if (test_bit(Faulty, &rdev->flags)) return "D"; + else if (test_bit(Journal, &rdev->flags)) + return "A"; else if (!array_in_sync || !test_bit(In_sync, &rdev->flags)) return "a"; else @@ -3143,7 +3260,8 @@ static sector_t rs_get_progress(struct raid_set *rs, * being initialized. */ rdev_for_each(rdev, mddev) - if (!test_bit(In_sync, &rdev->flags)) + if (!test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags)) *array_in_sync = true; #if 0 r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */ @@ -3175,7 +3293,6 @@ static void raid_status(struct dm_target *ti, status_type_t type, sector_t progress, resync_max_sectors, resync_mismatches; const char *sync_action; struct raid_type *rt; - struct md_rdev *rdev; switch (type) { case STATUSTYPE_INFO: @@ -3196,9 +3313,9 @@ static void raid_status(struct dm_target *ti, status_type_t type, atomic64_read(&mddev->resync_mismatches) : 0; sync_action = decipher_sync_action(&rs->md); - /* HM FIXME: do we want another state char for raid0? It shows 'D' or 'A' now */ - rdev_for_each(rdev, mddev) - DMEMIT(__raid_dev_status(rdev, array_in_sync)); + /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */ + for (i = 0; i < rs->raid_disks; i++) + DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync)); /* * In-sync/Reshape ratio: @@ -3244,6 +3361,12 @@ static void raid_status(struct dm_target *ti, status_type_t type, * so retrieving it from the first raid disk is sufficient. */ DMEMIT(" %llu", (unsigned long long) rs->dev[0].rdev.data_offset); + + /* + * v1.10.0+: + */ + DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? + __raid_dev_status(&rs->journal_dev.rdev, 0) : "-"); break; case STATUSTYPE_TABLE: @@ -3257,7 +3380,8 @@ static void raid_status(struct dm_target *ti, status_type_t type, raid_param_cnt += rebuild_disks * 2 + write_mostly_params + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) + - hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2; + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 + + (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0); /* Emit table line */ DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors); if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) @@ -3304,6 +3428,9 @@ static void raid_status(struct dm_target *ti, status_type_t type, if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE), mddev->sync_speed_min); + if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) + DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV), + __get_dev_name(rs->journal_dev.dev)); DMEMIT(" %d", rs->raid_disks); for (i = 0; i < rs->raid_disks; i++) DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev), @@ -3337,12 +3464,15 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv) else if (!strcasecmp(argv[0], "recover")) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); else { - if (!strcasecmp(argv[0], "check")) + if (!strcasecmp(argv[0], "check")) { set_bit(MD_RECOVERY_CHECK, &mddev->recovery); - else if (!!strcasecmp(argv[0], "repair")) + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + } else if (!strcasecmp(argv[0], "repair")) { + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + } else return -EINVAL; - set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); } if (mddev->ro == 2) { /* A write to sync_action is enough to justify @@ -3419,11 +3549,14 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs) memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices)); - for (i = 0; i < rs->md.raid_disks; i++) { + for (i = 0; i < mddev->raid_disks; i++) { r = &rs->dev[i].rdev; - if (test_bit(Faulty, &r->flags) && r->sb_page && - sync_page_io(r, 0, r->sb_size, r->sb_page, - REQ_OP_READ, 0, true)) { + /* HM FIXME: enhance journal device recovery processing */ + if (test_bit(Journal, &r->flags)) + continue; + + if (test_bit(Faulty, &r->flags) && + r->meta_bdev && !read_disk_sb(r, r->sb_size, true)) { DMINFO("Faulty %s device #%d has readable super block." " Attempting to revive it.", rs->raid_type->name, i); @@ -3437,22 +3570,26 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs) * '>= 0' - meaning we must call this function * ourselves. */ - if ((r->raid_disk >= 0) && - (mddev->pers->hot_remove_disk(mddev, r) != 0)) - /* Failed to revive this device, try next */ - continue; - - r->raid_disk = i; - r->saved_raid_disk = i; flags = r->flags; + clear_bit(In_sync, &r->flags); /* Mandatory for hot remove. */ + if (r->raid_disk >= 0) { + if (mddev->pers->hot_remove_disk(mddev, r)) { + /* Failed to revive this device, try next */ + r->flags = flags; + continue; + } + } else + r->raid_disk = r->saved_raid_disk = i; + clear_bit(Faulty, &r->flags); clear_bit(WriteErrorSeen, &r->flags); - clear_bit(In_sync, &r->flags); + if (mddev->pers->hot_add_disk(mddev, r)) { - r->raid_disk = -1; - r->saved_raid_disk = -1; + /* Failed to revive this device, try next */ + r->raid_disk = r->saved_raid_disk = -1; r->flags = flags; } else { + clear_bit(In_sync, &r->flags); r->recovery_offset = 0; set_bit(i, (void *) cleared_failed_devices); cleared = true; @@ -3465,6 +3602,9 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs) uint64_t failed_devices[DISKS_ARRAY_ELEMS]; rdev_for_each(r, &rs->md) { + if (test_bit(Journal, &r->flags)) + continue; + sb = page_address(r->sb_page); sb_retrieve_failed_devices(sb, failed_devices); @@ -3497,7 +3637,7 @@ static void rs_update_sbs(struct raid_set *rs) struct mddev *mddev = &rs->md; int ro = mddev->ro; - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); mddev->ro = 0; md_update_sb(mddev, 1); mddev->ro = ro; @@ -3580,12 +3720,6 @@ static int raid_preresume(struct dm_target *ti) if (test_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags)) rs_update_sbs(rs); - /* - * Disable/enable discard support on raid set after any - * conversion, because devices can have been added - */ - configure_discard_support(rs); - /* Load the bitmap from disk unless raid0 */ r = __load_dirty_region_bitmap(rs); if (r) @@ -3641,7 +3775,15 @@ static void raid_resume(struct dm_target *ti) mddev->ro = 0; mddev->in_sync = 0; - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + /* + * Keep the RAID set frozen if reshape/rebuild flags are set. + * The RAID set is unfrozen once the next table load/resume, + * which clears the reshape/rebuild flags, occurs. + * This ensures that the constructor for the inactive table + * retrieves an up-to-date reshape_position. + */ + if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (mddev->suspended) mddev_resume(mddev); @@ -3649,7 +3791,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 9, 1}, + .version = {1, 10, 1}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 9a8b71067c6e..2ddc2d20e62d 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -260,7 +260,7 @@ static int mirror_flush(struct dm_target *ti) struct mirror *m; struct dm_io_request io_req = { .bi_op = REQ_OP_WRITE, - .bi_op_flags = WRITE_FLUSH, + .bi_op_flags = REQ_PREFLUSH, .mem.type = DM_IO_KMEM, .mem.ptr.addr = NULL, .client = ms->io_client, @@ -656,7 +656,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio) struct mirror *m; struct dm_io_request io_req = { .bi_op = REQ_OP_WRITE, - .bi_op_flags = bio->bi_opf & WRITE_FLUSH_FUA, + .bi_op_flags = bio->bi_opf & (REQ_FUA | REQ_PREFLUSH), .mem.type = DM_IO_BIO, .mem.ptr.bio = bio, .notify.fn = write_callback, diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index 6c25213ab38c..bdbb7e6e8212 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c @@ -17,8 +17,8 @@ #include <linux/module.h> #define DM_MSG_PREFIX "multipath round-robin" -#define RR_MIN_IO 1000 -#define RR_VERSION "1.1.0" +#define RR_MIN_IO 1 +#define RR_VERSION "1.2.0" /*----------------------------------------------------------------- * Path-handling code, paths are held in lists @@ -47,44 +47,19 @@ struct selector { struct list_head valid_paths; struct list_head invalid_paths; spinlock_t lock; - struct dm_path * __percpu *current_path; - struct percpu_counter repeat_count; }; -static void set_percpu_current_path(struct selector *s, struct dm_path *path) -{ - int cpu; - - for_each_possible_cpu(cpu) - *per_cpu_ptr(s->current_path, cpu) = path; -} - static struct selector *alloc_selector(void) { struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return NULL; - - INIT_LIST_HEAD(&s->valid_paths); - INIT_LIST_HEAD(&s->invalid_paths); - spin_lock_init(&s->lock); - - s->current_path = alloc_percpu(struct dm_path *); - if (!s->current_path) - goto out_current_path; - set_percpu_current_path(s, NULL); - - if (percpu_counter_init(&s->repeat_count, 0, GFP_KERNEL)) - goto out_repeat_count; + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->invalid_paths); + spin_lock_init(&s->lock); + } return s; - -out_repeat_count: - free_percpu(s->current_path); -out_current_path: - kfree(s); - return NULL;; } static int rr_create(struct path_selector *ps, unsigned argc, char **argv) @@ -105,8 +80,6 @@ static void rr_destroy(struct path_selector *ps) free_paths(&s->valid_paths); free_paths(&s->invalid_paths); - free_percpu(s->current_path); - percpu_counter_destroy(&s->repeat_count); kfree(s); ps->context = NULL; } @@ -157,6 +130,11 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path, return -EINVAL; } + if (repeat_count > 1) { + DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead"); + repeat_count = 1; + } + /* allocate the path */ pi = kmalloc(sizeof(*pi), GFP_KERNEL); if (!pi) { @@ -183,9 +161,6 @@ static void rr_fail_path(struct path_selector *ps, struct dm_path *p) struct path_info *pi = p->pscontext; spin_lock_irqsave(&s->lock, flags); - if (p == *this_cpu_ptr(s->current_path)) - set_percpu_current_path(s, NULL); - list_move(&pi->list, &s->invalid_paths); spin_unlock_irqrestore(&s->lock, flags); } @@ -208,29 +183,15 @@ static struct dm_path *rr_select_path(struct path_selector *ps, size_t nr_bytes) unsigned long flags; struct selector *s = ps->context; struct path_info *pi = NULL; - struct dm_path *current_path = NULL; - - local_irq_save(flags); - current_path = *this_cpu_ptr(s->current_path); - if (current_path) { - percpu_counter_dec(&s->repeat_count); - if (percpu_counter_read_positive(&s->repeat_count) > 0) { - local_irq_restore(flags); - return current_path; - } - } - spin_lock(&s->lock); + spin_lock_irqsave(&s->lock, flags); if (!list_empty(&s->valid_paths)) { pi = list_entry(s->valid_paths.next, struct path_info, list); list_move_tail(&pi->list, &s->valid_paths); - percpu_counter_set(&s->repeat_count, pi->repeat_count); - set_percpu_current_path(s, pi->path); - current_path = pi->path; } spin_unlock_irqrestore(&s->lock, flags); - return current_path; + return pi ? pi->path : NULL; } static struct path_selector_type rr_ps = { diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 1d0d2adc050a..28955b94d2b2 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -23,11 +23,7 @@ static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH; #define RESERVED_REQUEST_BASED_IOS 256 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; -#ifdef CONFIG_DM_MQ_DEFAULT -static bool use_blk_mq = true; -#else -static bool use_blk_mq = false; -#endif +static bool use_blk_mq = IS_ENABLED(CONFIG_DM_MQ_DEFAULT); bool dm_use_blk_mq_default(void) { @@ -75,12 +71,6 @@ static void dm_old_start_queue(struct request_queue *q) static void dm_mq_start_queue(struct request_queue *q) { - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - queue_flag_clear(QUEUE_FLAG_STOPPED, q); - spin_unlock_irqrestore(q->queue_lock, flags); - blk_mq_start_stopped_hw_queues(q, true); blk_mq_kick_requeue_list(q); } @@ -105,20 +95,10 @@ static void dm_old_stop_queue(struct request_queue *q) static void dm_mq_stop_queue(struct request_queue *q) { - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - if (blk_queue_stopped(q)) { - spin_unlock_irqrestore(q->queue_lock, flags); + if (blk_mq_queue_stopped(q)) return; - } - queue_flag_set(QUEUE_FLAG_STOPPED, q); - spin_unlock_irqrestore(q->queue_lock, flags); - - /* Avoid that requeuing could restart the queue. */ - blk_mq_cancel_requeue_work(q); - blk_mq_stop_hw_queues(q); + blk_mq_quiesce_queue(q); } void dm_stop_queue(struct request_queue *q) @@ -129,28 +109,6 @@ void dm_stop_queue(struct request_queue *q) dm_mq_stop_queue(q); } -static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md, - gfp_t gfp_mask) -{ - return mempool_alloc(md->io_pool, gfp_mask); -} - -static void free_old_rq_tio(struct dm_rq_target_io *tio) -{ - mempool_free(tio, tio->md->io_pool); -} - -static struct request *alloc_old_clone_request(struct mapped_device *md, - gfp_t gfp_mask) -{ - return mempool_alloc(md->rq_pool, gfp_mask); -} - -static void free_old_clone_request(struct mapped_device *md, struct request *rq) -{ - mempool_free(rq, md->rq_pool); -} - /* * Partial completion handling for request-based dm */ @@ -205,7 +163,7 @@ static void end_clone_bio(struct bio *clone) static struct dm_rq_target_io *tio_from_request(struct request *rq) { - return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special); + return blk_mq_rq_to_pdu(rq); } static void rq_end_stats(struct mapped_device *md, struct request *orig) @@ -226,6 +184,9 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig) */ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) { + struct request_queue *q = md->queue; + unsigned long flags; + atomic_dec(&md->pending[rw]); /* nudge anyone waiting on suspend queue */ @@ -238,8 +199,11 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) * back into ->request_fn() could deadlock attempting to grab the * queue lock again. */ - if (!md->queue->mq_ops && run_queue) - blk_run_queue_async(md->queue); + if (!q->mq_ops && run_queue) { + spin_lock_irqsave(q->queue_lock, flags); + blk_run_queue_async(q); + spin_unlock_irqrestore(q->queue_lock, flags); + } /* * dm_put() must be at the end of this function. See the comment above @@ -247,31 +211,6 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) dm_put(md); } -static void free_rq_clone(struct request *clone) -{ - struct dm_rq_target_io *tio = clone->end_io_data; - struct mapped_device *md = tio->md; - - blk_rq_unprep_clone(clone); - - /* - * It is possible for a clone_old_rq() allocated clone to - * get passed in -- it may not yet have a request_queue. - * This is known to occur if the error target replaces - * a multipath target that has a request_fn queue stacked - * on blk-mq queue(s). - */ - if (clone->q && clone->q->mq_ops) - /* stacked on blk-mq queue(s) */ - tio->ti->type->release_clone_rq(clone); - else if (!md->queue->mq_ops) - /* request_fn queue stacked on request_fn queue(s) */ - free_old_clone_request(md, clone); - - if (!md->queue->mq_ops) - free_old_rq_tio(tio); -} - /* * Complete the clone and the original request. * Must be called without clone's queue lock held, @@ -284,20 +223,9 @@ static void dm_end_request(struct request *clone, int error) struct mapped_device *md = tio->md; struct request *rq = tio->orig; - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { - rq->errors = clone->errors; - rq->resid_len = clone->resid_len; - - if (rq->sense) - /* - * We are using the sense buffer of the original - * request. - * So setting the length of the sense data is enough. - */ - rq->sense_len = clone->sense_len; - } + blk_rq_unprep_clone(clone); + tio->ti->type->release_clone_rq(clone); - free_rq_clone(clone); rq_end_stats(md, rq); if (!rq->q->mq_ops) blk_end_request_all(rq, error); @@ -306,22 +234,6 @@ static void dm_end_request(struct request *clone, int error) rq_completed(md, rw, true); } -static void dm_unprep_request(struct request *rq) -{ - struct dm_rq_target_io *tio = tio_from_request(rq); - struct request *clone = tio->clone; - - if (!rq->q->mq_ops) { - rq->special = NULL; - rq->cmd_flags &= ~REQ_DONTPREP; - } - - if (clone) - free_rq_clone(clone); - else if (!tio->md->queue->mq_ops) - free_old_rq_tio(tio); -} - /* * Requeue the original request of a clone. */ @@ -338,12 +250,7 @@ static void dm_old_requeue_request(struct request *rq) static void __dm_mq_kick_requeue_list(struct request_queue *q, unsigned long msecs) { - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - if (!blk_queue_stopped(q)) - blk_mq_delay_kick_requeue_list(q, msecs); - spin_unlock_irqrestore(q->queue_lock, flags); + blk_mq_delay_kick_requeue_list(q, msecs); } void dm_mq_kick_requeue_list(struct mapped_device *md) @@ -354,7 +261,7 @@ EXPORT_SYMBOL(dm_mq_kick_requeue_list); static void dm_mq_delay_requeue_request(struct request *rq, unsigned long msecs) { - blk_mq_requeue_request(rq); + blk_mq_requeue_request(rq, false); __dm_mq_kick_requeue_list(rq->q, msecs); } @@ -365,7 +272,10 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_ int rw = rq_data_dir(rq); rq_end_stats(md, rq); - dm_unprep_request(rq); + if (tio->clone) { + blk_rq_unprep_clone(tio->clone); + tio->ti->type->release_clone_rq(tio->clone); + } if (!rq->q->mq_ops) dm_old_requeue_request(rq); @@ -418,20 +328,19 @@ static void dm_softirq_done(struct request *rq) int rw; if (!clone) { - rq_end_stats(tio->md, rq); + struct mapped_device *md = tio->md; + + rq_end_stats(md, rq); rw = rq_data_dir(rq); - if (!rq->q->mq_ops) { + if (!rq->q->mq_ops) blk_end_request_all(rq, tio->error); - rq_completed(tio->md, rw, false); - free_old_rq_tio(tio); - } else { + else blk_mq_end_request(rq, tio->error); - rq_completed(tio->md, rw, false); - } + rq_completed(md, rw, false); return; } - if (rq->cmd_flags & REQ_FAILED) + if (rq->rq_flags & RQF_FAILED) mapped = false; dm_done(clone, tio->error, mapped); @@ -460,7 +369,7 @@ static void dm_complete_request(struct request *rq, int error) */ static void dm_kill_unmapped_request(struct request *rq, int error) { - rq->cmd_flags |= REQ_FAILED; + rq->rq_flags |= RQF_FAILED; dm_complete_request(rq, error); } @@ -471,16 +380,6 @@ static void end_clone_request(struct request *clone, int error) { struct dm_rq_target_io *tio = clone->end_io_data; - if (!clone->q->mq_ops) { - /* - * For just cleaning up the information of the queue in which - * the clone was dispatched. - * The clone is *NOT* freed actually here because it is alloced - * from dm own mempool (REQ_ALLOCED isn't set). - */ - __blk_put_request(clone->q, clone); - } - /* * Actual request completion is done in a softirq context which doesn't * hold the clone's queue lock. Otherwise, deadlock could occur because: @@ -497,7 +396,7 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq) int r; if (blk_queue_io_stat(clone->q)) - clone->cmd_flags |= REQ_IO_STAT; + clone->rq_flags |= RQF_IO_STAT; clone->start_time = jiffies; r = blk_insert_cloned_request(clone->q, clone); @@ -530,9 +429,6 @@ static int setup_clone(struct request *clone, struct request *rq, if (r) return r; - clone->cmd = rq->cmd; - clone->cmd_len = rq->cmd_len; - clone->sense = rq->sense; clone->end_io = end_clone_request; clone->end_io_data = tio; @@ -541,28 +437,6 @@ static int setup_clone(struct request *clone, struct request *rq, return 0; } -static struct request *clone_old_rq(struct request *rq, struct mapped_device *md, - struct dm_rq_target_io *tio, gfp_t gfp_mask) -{ - /* - * Create clone for use with .request_fn request_queue - */ - struct request *clone; - - clone = alloc_old_clone_request(md, gfp_mask); - if (!clone) - return NULL; - - blk_rq_init(NULL, clone); - if (setup_clone(clone, rq, tio, gfp_mask)) { - /* -ENOMEM */ - free_old_clone_request(md, clone); - return NULL; - } - - return clone; -} - static void map_tio_request(struct kthread_work *work); static void init_tio(struct dm_rq_target_io *tio, struct request *rq, @@ -584,60 +458,6 @@ static void init_tio(struct dm_rq_target_io *tio, struct request *rq, kthread_init_work(&tio->work, map_tio_request); } -static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq, - struct mapped_device *md, - gfp_t gfp_mask) -{ - struct dm_rq_target_io *tio; - int srcu_idx; - struct dm_table *table; - - tio = alloc_old_rq_tio(md, gfp_mask); - if (!tio) - return NULL; - - init_tio(tio, rq, md); - - table = dm_get_live_table(md, &srcu_idx); - /* - * Must clone a request if this .request_fn DM device - * is stacked on .request_fn device(s). - */ - if (!dm_table_all_blk_mq_devices(table)) { - if (!clone_old_rq(rq, md, tio, gfp_mask)) { - dm_put_live_table(md, srcu_idx); - free_old_rq_tio(tio); - return NULL; - } - } - dm_put_live_table(md, srcu_idx); - - return tio; -} - -/* - * Called with the queue lock held. - */ -static int dm_old_prep_fn(struct request_queue *q, struct request *rq) -{ - struct mapped_device *md = q->queuedata; - struct dm_rq_target_io *tio; - - if (unlikely(rq->special)) { - DMWARN("Already has something in rq->special."); - return BLKPREP_KILL; - } - - tio = dm_old_prep_tio(rq, md, GFP_ATOMIC); - if (!tio) - return BLKPREP_DEFER; - - rq->special = tio; - rq->cmd_flags |= REQ_DONTPREP; - - return BLKPREP_OK; -} - /* * Returns: * DM_MAPIO_* : the request has been processed as indicated @@ -652,31 +472,18 @@ static int map_request(struct dm_rq_target_io *tio) struct request *rq = tio->orig; struct request *clone = NULL; - if (tio->clone) { - clone = tio->clone; - r = ti->type->map_rq(ti, clone, &tio->info); - if (r == DM_MAPIO_DELAY_REQUEUE) - return DM_MAPIO_REQUEUE; /* .request_fn requeue is always immediate */ - } else { - r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); - if (r < 0) { - /* The target wants to complete the I/O */ - dm_kill_unmapped_request(rq, r); - return r; - } - if (r == DM_MAPIO_REMAPPED && - setup_clone(clone, rq, tio, GFP_ATOMIC)) { - /* -ENOMEM */ - ti->type->release_clone_rq(clone); - return DM_MAPIO_REQUEUE; - } - } - + r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); switch (r) { case DM_MAPIO_SUBMITTED: /* The target has taken the I/O to submit by itself later */ break; case DM_MAPIO_REMAPPED: + if (setup_clone(clone, rq, tio, GFP_ATOMIC)) { + /* -ENOMEM */ + ti->type->release_clone_rq(clone); + return DM_MAPIO_REQUEUE; + } + /* The target has remapped the I/O so dispatch it */ trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), blk_rq_pos(rq)); @@ -735,6 +542,29 @@ static void dm_start_request(struct mapped_device *md, struct request *orig) dm_get(md); } +static int __dm_rq_init_rq(struct mapped_device *md, struct request *rq) +{ + struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); + + /* + * Must initialize md member of tio, otherwise it won't + * be available in dm_mq_queue_rq. + */ + tio->md = md; + + if (md->init_tio_pdu) { + /* target-specific per-io data is immediately after the tio */ + tio->info.ptr = tio + 1; + } + + return 0; +} + +static int dm_rq_init_rq(struct request_queue *q, struct request *rq, gfp_t gfp) +{ + return __dm_rq_init_rq(q->rq_alloc_data, rq); +} + static void map_tio_request(struct kthread_work *work) { struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work); @@ -798,6 +628,10 @@ static void dm_old_request_fn(struct request_queue *q) int srcu_idx; struct dm_table *map = dm_get_live_table(md, &srcu_idx); + if (unlikely(!map)) { + dm_put_live_table(md, srcu_idx); + return; + } ti = dm_table_find_target(map, pos); dm_put_live_table(md, srcu_idx); } @@ -819,7 +653,7 @@ static void dm_old_request_fn(struct request_queue *q) pos = blk_rq_pos(rq); if ((dm_old_request_peeked_before_merge_deadline(md) && - md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 && + md_in_flight(md) && rq->bio && !bio_multiple_segments(rq->bio) && md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) || (ti->type->busy && ti->type->busy(ti))) { blk_delay_queue(q, 10); @@ -829,6 +663,7 @@ static void dm_old_request_fn(struct request_queue *q) dm_start_request(md, rq); tio = tio_from_request(rq); + init_tio(tio, rq, md); /* Establish tio->ti before queuing work (map_tio_request) */ tio->ti = ti; kthread_queue_work(&md->kworker, &tio->work); @@ -839,10 +674,23 @@ static void dm_old_request_fn(struct request_queue *q) /* * Fully initialize a .request_fn request-based queue. */ -int dm_old_init_request_queue(struct mapped_device *md) +int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t) { + struct dm_target *immutable_tgt; + /* Fully initialize the queue */ - if (!blk_init_allocated_queue(md->queue, dm_old_request_fn, NULL)) + md->queue->cmd_size = sizeof(struct dm_rq_target_io); + md->queue->rq_alloc_data = md; + md->queue->request_fn = dm_old_request_fn; + md->queue->init_rq_fn = dm_rq_init_rq; + + immutable_tgt = dm_table_get_immutable_target(t); + if (immutable_tgt && immutable_tgt->per_io_data_size) { + /* any target-specific per-io data is immediately after the tio */ + md->queue->cmd_size += immutable_tgt->per_io_data_size; + md->init_tio_pdu = true; + } + if (blk_init_allocated_queue(md->queue) < 0) return -EINVAL; /* disable dm_old_request_fn's merge heuristic by default */ @@ -850,7 +698,6 @@ int dm_old_init_request_queue(struct mapped_device *md) dm_init_normal_md_queue(md); blk_queue_softirq_done(md->queue, dm_softirq_done); - blk_queue_prep_rq(md->queue, dm_old_prep_fn); /* Initialize the request-based DM worker thread */ kthread_init_worker(&md->kworker); @@ -871,21 +718,7 @@ static int dm_mq_init_request(void *data, struct request *rq, unsigned int hctx_idx, unsigned int request_idx, unsigned int numa_node) { - struct mapped_device *md = data; - struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); - - /* - * Must initialize md member of tio, otherwise it won't - * be available in dm_mq_queue_rq. - */ - tio->md = md; - - if (md->init_tio_pdu) { - /* target-specific per-io data is immediately after the tio */ - tio->info.ptr = tio + 1; - } - - return 0; + return __dm_rq_init_rq(data, rq); } static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, @@ -904,17 +737,6 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, dm_put_live_table(md, srcu_idx); } - /* - * On suspend dm_stop_queue() handles stopping the blk-mq - * request_queue BUT: even though the hw_queues are marked - * BLK_MQ_S_STOPPED at that point there is still a race that - * is allowing block/blk-mq.c to call ->queue_rq against a - * hctx that it really shouldn't. The following check guards - * against this rarity (albeit _not_ race-free). - */ - if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) - return BLK_MQ_RQ_QUEUE_BUSY; - if (ti->type->busy && ti->type->busy(ti)) return BLK_MQ_RQ_QUEUE_BUSY; diff --git a/drivers/md/dm-rq.h b/drivers/md/dm-rq.h index 4da06cae7bad..f0020d21b95f 100644 --- a/drivers/md/dm-rq.h +++ b/drivers/md/dm-rq.h @@ -48,7 +48,7 @@ struct dm_rq_clone_bio_info { bool dm_use_blk_mq_default(void); bool dm_use_blk_mq(struct mapped_device *md); -int dm_old_init_request_queue(struct mapped_device *md); +int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t); int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t); void dm_mq_cleanup_mapped_device(struct mapped_device *md); diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index b8cf956b577b..b93476c3ba3f 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -741,7 +741,7 @@ static void persistent_commit_exception(struct dm_exception_store *store, /* * Commit exceptions to disk. */ - if (ps->valid && area_io(ps, REQ_OP_WRITE, WRITE_FLUSH_FUA)) + if (ps->valid && area_io(ps, REQ_OP_WRITE, REQ_PREFLUSH | REQ_FUA)) ps->valid = 0; /* @@ -818,7 +818,7 @@ static int persistent_commit_merge(struct dm_exception_store *store, for (i = 0; i < nr_merged; i++) clear_exception(ps, ps->current_committed - 1 - i); - r = area_io(ps, REQ_OP_WRITE, WRITE_FLUSH_FUA); + r = area_io(ps, REQ_OP_WRITE, REQ_PREFLUSH | REQ_FUA); if (r < 0) return r; diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 38b05f23b96c..0250e7e521ab 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -175,6 +175,7 @@ static void dm_stat_free(struct rcu_head *head) int cpu; struct dm_stat *s = container_of(head, struct dm_stat, rcu_head); + kfree(s->histogram_boundaries); kfree(s->program_id); kfree(s->aux_data); for_each_possible_cpu(cpu) { diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index c4b53b332607..3ad16d9c9d5a 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -871,7 +871,7 @@ static int dm_table_determine_type(struct dm_table *t) { unsigned i; unsigned bio_based = 0, request_based = 0, hybrid = 0; - bool verify_blk_mq = false; + unsigned sq_count = 0, mq_count = 0; struct dm_target *tgt; struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); @@ -924,12 +924,6 @@ static int dm_table_determine_type(struct dm_table *t) BUG_ON(!request_based); /* No targets in this table */ - if (list_empty(devices) && __table_type_request_based(live_md_type)) { - /* inherit live MD type */ - t->type = live_md_type; - return 0; - } - /* * The only way to establish DM_TYPE_MQ_REQUEST_BASED is by * having a compatible target use dm_table_set_type. @@ -948,6 +942,19 @@ verify_rq_based: return -EINVAL; } + if (list_empty(devices)) { + int srcu_idx; + struct dm_table *live_table = dm_get_live_table(t->md, &srcu_idx); + + /* inherit live table's type and all_blk_mq */ + if (live_table) { + t->type = live_table->type; + t->all_blk_mq = live_table->all_blk_mq; + } + dm_put_live_table(t->md, srcu_idx); + return 0; + } + /* Non-request-stackable devices can't be used for request-based dm */ list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); @@ -959,19 +966,19 @@ verify_rq_based: } if (q->mq_ops) - verify_blk_mq = true; + mq_count++; + else + sq_count++; } + if (sq_count && mq_count) { + DMERR("table load rejected: not all devices are blk-mq request-stackable"); + return -EINVAL; + } + t->all_blk_mq = mq_count > 0; - if (verify_blk_mq) { - /* verify _all_ devices in the table are blk-mq devices */ - list_for_each_entry(dd, devices, list) - if (!bdev_get_queue(dd->dm_dev->bdev)->mq_ops) { - DMERR("table load rejected: not all devices" - " are blk-mq request-stackable"); - return -EINVAL; - } - - t->all_blk_mq = true; + if (t->type == DM_TYPE_MQ_REQUEST_BASED && !t->all_blk_mq) { + DMERR("table load rejected: all devices are not blk-mq request-stackable"); + return -EINVAL; } return 0; @@ -1743,7 +1750,7 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) char b[BDEVNAME_SIZE]; if (likely(q)) - r |= bdi_congested(&q->backing_dev_info, bdi_bits); + r |= bdi_congested(q->backing_dev_info, bdi_bits); else DMWARN_LIMIT("%s: any_congested: nonexistent device %s", dm_device_name(t->md), diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 710ae28fd618..43d3445b121d 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -131,12 +131,6 @@ static int io_err_map(struct dm_target *tt, struct bio *bio) return -EIO; } -static int io_err_map_rq(struct dm_target *ti, struct request *clone, - union map_info *map_context) -{ - return -EIO; -} - static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq, union map_info *map_context, struct request **clone) @@ -161,7 +155,6 @@ static struct target_type error_target = { .ctr = io_err_ctr, .dtr = io_err_dtr, .map = io_err_map, - .map_rq = io_err_map_rq, .clone_and_map_rq = io_err_clone_and_map_rq, .release_clone_rq = io_err_release_clone_rq, .direct_access = io_err_direct_access, diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index d1c05c12a9db..2b266a2b5035 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -699,7 +699,7 @@ static void remap_to_origin(struct thin_c *tc, struct bio *bio) static int bio_triggers_commit(struct thin_c *tc, struct bio *bio) { - return (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) && + return op_is_flush(bio->bi_opf) && dm_thin_changed_this_transaction(tc->td); } @@ -870,8 +870,7 @@ static void __inc_remap_and_issue_cell(void *context, struct bio *bio; while ((bio = bio_list_pop(&cell->bios))) { - if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) || - bio_op(bio) == REQ_OP_DISCARD) + if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) bio_list_add(&info->defer_bios, bio); else { inc_all_io_entry(info->tc->pool, bio); @@ -1716,9 +1715,8 @@ static void __remap_and_issue_shared_cell(void *context, struct bio *bio; while ((bio = bio_list_pop(&cell->bios))) { - if ((bio_data_dir(bio) == WRITE) || - (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) || - bio_op(bio) == REQ_OP_DISCARD)) + if (bio_data_dir(bio) == WRITE || op_is_flush(bio->bi_opf) || + bio_op(bio) == REQ_OP_DISCARD) bio_list_add(&info->defer_bios, bio); else { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));; @@ -2635,8 +2633,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } - if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) || - bio_op(bio) == REQ_OP_DISCARD) { + if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) { thin_defer_bio_with_throttle(tc, bio); return DM_MAPIO_SUBMITTED; } @@ -2714,7 +2711,7 @@ static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) return 1; q = bdev_get_queue(pt->data_dev->bdev); - return bdi_congested(&q->backing_dev_info, bdi_bits); + return bdi_congested(q->backing_dev_info, bdi_bits); } static void requeue_bios(struct pool *pool) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 0aba34a7b3b3..7335d8a3fc47 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -868,7 +868,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev); if (r) { - ti->error = "Data device lookup failed"; + ti->error = "Hash device lookup failed"; goto bad; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ef7bf1dd6900..f4ffd1eb8f44 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -12,6 +12,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> +#include <linux/sched/signal.h> #include <linux/blkpg.h> #include <linux/bio.h> #include <linux/mempool.h> @@ -91,7 +92,6 @@ static int dm_numa_node = DM_NUMA_NODE; */ struct dm_md_mempools { mempool_t *io_pool; - mempool_t *rq_pool; struct bio_set *bs; }; @@ -466,13 +466,16 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, if (r > 0) { /* - * Target determined this ioctl is being issued against - * a logical partition of the parent bdev; so extra - * validation is needed. + * Target determined this ioctl is being issued against a + * subset of the parent bdev; require extra privileges. */ - r = scsi_verify_blk_ioctl(NULL, cmd); - if (r) + if (!capable(CAP_SYS_RAWIO)) { + DMWARN_LIMIT( + "%s: sending ioctl %x to DM device without required privilege.", + current->comm, cmd); + r = -ENOIOCTLCMD; goto out; + } } r = __blkdev_driver_ioctl(bdev, mode, cmd, arg); @@ -972,10 +975,61 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) } EXPORT_SYMBOL_GPL(dm_accept_partial_bio); +/* + * Flush current->bio_list when the target map method blocks. + * This fixes deadlocks in snapshot and possibly in other targets. + */ +struct dm_offload { + struct blk_plug plug; + struct blk_plug_cb cb; +}; + +static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule) +{ + struct dm_offload *o = container_of(cb, struct dm_offload, cb); + struct bio_list list; + struct bio *bio; + + INIT_LIST_HEAD(&o->cb.list); + + if (unlikely(!current->bio_list)) + return; + + list = *current->bio_list; + bio_list_init(current->bio_list); + + while ((bio = bio_list_pop(&list))) { + struct bio_set *bs = bio->bi_pool; + if (unlikely(!bs) || bs == fs_bio_set) { + bio_list_add(current->bio_list, bio); + continue; + } + + spin_lock(&bs->rescue_lock); + bio_list_add(&bs->rescue_list, bio); + queue_work(bs->rescue_workqueue, &bs->rescue_work); + spin_unlock(&bs->rescue_lock); + } +} + +static void dm_offload_start(struct dm_offload *o) +{ + blk_start_plug(&o->plug); + o->cb.callback = flush_current_bio_list; + list_add(&o->cb.list, ¤t->plug->cb_list); +} + +static void dm_offload_end(struct dm_offload *o) +{ + list_del(&o->cb.list); + blk_finish_plug(&o->plug); +} + static void __map_bio(struct dm_target_io *tio) { int r; sector_t sector; + struct dm_offload o; struct bio *clone = &tio->clone; struct dm_target *ti = tio->ti; @@ -988,7 +1042,11 @@ static void __map_bio(struct dm_target_io *tio) */ atomic_inc(&tio->io->io_count); sector = clone->bi_iter.bi_sector; + + dm_offload_start(&o); r = ti->type->map(ti, clone); + dm_offload_end(&o); + if (r == DM_MAPIO_REMAPPED) { /* the bio has been remapped so dispatch it */ @@ -1314,7 +1372,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits) * With request-based DM we only need to check the * top-level queue for congestion. */ - r = md->queue->backing_dev_info.wb.state & bdi_bits; + r = md->queue->backing_dev_info->wb.state & bdi_bits; } else { map = dm_get_live_table_fast(md); if (map) @@ -1397,7 +1455,7 @@ void dm_init_md_queue(struct mapped_device *md) * - must do so here (in alloc_dev callchain) before queue is used */ md->queue->queuedata = md; - md->queue->backing_dev_info.congested_data = md; + md->queue->backing_dev_info->congested_data = md; } void dm_init_normal_md_queue(struct mapped_device *md) @@ -1408,7 +1466,7 @@ void dm_init_normal_md_queue(struct mapped_device *md) /* * Initialize aspects of queue that aren't relevant for blk-mq */ - md->queue->backing_dev_info.congested_fn = dm_any_congested; + md->queue->backing_dev_info->congested_fn = dm_any_congested; blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); } @@ -1419,7 +1477,6 @@ static void cleanup_mapped_device(struct mapped_device *md) if (md->kworker_task) kthread_stop(md->kworker_task); mempool_destroy(md->io_pool); - mempool_destroy(md->rq_pool); if (md->bs) bioset_free(md->bs); @@ -1525,9 +1582,9 @@ static struct mapped_device *alloc_dev(int minor) if (!md->bdev) goto bad; - bio_init(&md->flush_bio); + bio_init(&md->flush_bio, NULL, 0); md->flush_bio.bi_bdev = md->bdev; - bio_set_op_attrs(&md->flush_bio, REQ_OP_WRITE, WRITE_FLUSH); + md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; dm_stats_init(&md->stats); @@ -1595,12 +1652,10 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t) goto out; } - BUG_ON(!p || md->io_pool || md->rq_pool || md->bs); + BUG_ON(!p || md->io_pool || md->bs); md->io_pool = p->io_pool; p->io_pool = NULL; - md->rq_pool = p->rq_pool; - p->rq_pool = NULL; md->bs = p->bs; p->bs = NULL; @@ -1777,7 +1832,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) switch (type) { case DM_TYPE_REQUEST_BASED: - r = dm_old_init_request_queue(md); + r = dm_old_init_request_queue(md, t); if (r) { DMERR("Cannot initialize queue for request-based mapped device"); return r; @@ -1886,9 +1941,7 @@ static void __dm_destroy(struct mapped_device *md, bool wait) set_bit(DMF_FREEING, &md->flags); spin_unlock(&_minor_lock); - spin_lock_irq(q->queue_lock); - queue_flag_set(QUEUE_FLAG_DYING, q); - spin_unlock_irq(q->queue_lock); + blk_set_queue_dying(q); if (dm_request_based(md) && md->kworker_task) kthread_flush_worker(&md->kworker); @@ -2495,7 +2548,6 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t unsigned integrity, unsigned per_io_data_size) { struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); - struct kmem_cache *cachep = NULL; unsigned int pool_size = 0; unsigned int front_pad; @@ -2505,20 +2557,16 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t switch (type) { case DM_TYPE_BIO_BASED: case DM_TYPE_DAX_BIO_BASED: - cachep = _io_cache; pool_size = dm_get_reserved_bio_based_ios(); front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); + + pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache); + if (!pools->io_pool) + goto out; break; case DM_TYPE_REQUEST_BASED: - cachep = _rq_tio_cache; - pool_size = dm_get_reserved_rq_based_ios(); - pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache); - if (!pools->rq_pool) - goto out; - /* fall through to setup remaining rq-based pools */ case DM_TYPE_MQ_REQUEST_BASED: - if (!pool_size) - pool_size = dm_get_reserved_rq_based_ios(); + pool_size = dm_get_reserved_rq_based_ios(); front_pad = offsetof(struct dm_rq_clone_bio_info, clone); /* per_io_data_size is used for blk-mq pdu at queue allocation */ break; @@ -2526,12 +2574,6 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t BUG(); } - if (cachep) { - pools->io_pool = mempool_create_slab_pool(pool_size, cachep); - if (!pools->io_pool) - goto out; - } - pools->bs = bioset_create_nobvec(pool_size, front_pad); if (!pools->bs) goto out; @@ -2553,7 +2595,6 @@ void dm_free_md_mempools(struct dm_md_mempools *pools) return; mempool_destroy(pools->io_pool); - mempool_destroy(pools->rq_pool); if (pools->bs) bioset_free(pools->bs); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index f0aad08b9654..f298b01f7ab3 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -95,8 +95,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); /* * To check whether the target type is request-based or not (bio-based). */ -#define dm_target_request_based(t) (((t)->type->map_rq != NULL) || \ - ((t)->type->clone_and_map_rq != NULL)) +#define dm_target_request_based(t) ((t)->type->clone_and_map_rq != NULL) /* * To check whether the target type is a hybrid (capable of being diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 685aa2d77e25..b0536cfd8e17 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -214,7 +214,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio) } } if (failit) { - struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev); + struct bio *b = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); b->bi_bdev = conf->rdev->bdev; b->bi_private = bio; diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 86f5d435901d..3e38e0207a3e 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -21,6 +21,7 @@ #include <linux/seq_file.h> #include <linux/module.h> #include <linux/slab.h> +#include <trace/events/block.h> #include "md.h" #include "linear.h" @@ -52,18 +53,26 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) return conf->disks + lo; } +/* + * In linear_congested() conf->raid_disks is used as a copy of + * mddev->raid_disks to iterate conf->disks[], because conf->raid_disks + * and conf->disks[] are created in linear_conf(), they are always + * consitent with each other, but mddev->raid_disks does not. + */ static int linear_congested(struct mddev *mddev, int bits) { struct linear_conf *conf; int i, ret = 0; - conf = mddev->private; + rcu_read_lock(); + conf = rcu_dereference(mddev->private); - for (i = 0; i < mddev->raid_disks && !ret ; i++) { + for (i = 0; i < conf->raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= bdi_congested(q->backing_dev_info, bits); } + rcu_read_unlock(); return ret; } @@ -101,8 +110,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) sector_t sectors; if (j < 0 || j >= raid_disks || disk->rdev) { - printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n", - mdname(mddev)); + pr_warn("md/linear:%s: disk numbering problem. Aborting!\n", + mdname(mddev)); goto out; } @@ -123,8 +132,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) discard_supported = true; } if (cnt != raid_disks) { - printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n", - mdname(mddev)); + pr_warn("md/linear:%s: not enough drives present. Aborting!\n", + mdname(mddev)); goto out; } @@ -143,6 +152,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) conf->disks[i-1].end_sector + conf->disks[i].rdev->sectors; + /* + * conf->raid_disks is copy of mddev->raid_disks. The reason to + * keep a copy of mddev->raid_disks in struct linear_conf is, + * mddev->raid_disks may not be consistent with pointers number of + * conf->disks[] when it is updated in linear_add() and used to + * iterate old conf->disks[] earray in linear_congested(). + * Here conf->raid_disks is always consitent with number of + * pointers in conf->disks[] array, and mddev->private is updated + * with rcu_assign_pointer() in linear_addr(), such race can be + * avoided. + */ + conf->raid_disks = raid_disks; + return conf; out: @@ -195,15 +217,24 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) if (!newconf) return -ENOMEM; + /* newconf->raid_disks already keeps a copy of * the increased + * value of mddev->raid_disks, WARN_ONCE() is just used to make + * sure of this. It is possible that oldconf is still referenced + * in linear_congested(), therefore kfree_rcu() is used to free + * oldconf until no one uses it anymore. + */ mddev_suspend(mddev); - oldconf = mddev->private; + oldconf = rcu_dereference_protected(mddev->private, + lockdep_is_held(&mddev->reconfig_mutex)); mddev->raid_disks++; - mddev->private = newconf; + WARN_ONCE(mddev->raid_disks != newconf->raid_disks, + "copied raid_disks doesn't match mddev->raid_disks"); + rcu_assign_pointer(mddev->private, newconf); md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); mddev_resume(mddev); revalidate_disk(mddev->gendisk); - kfree(oldconf); + kfree_rcu(oldconf, rcu); return 0; } @@ -227,22 +258,22 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) } do { - tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector); + sector_t bio_sector = bio->bi_iter.bi_sector; + tmp_dev = which_dev(mddev, bio_sector); start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; end_sector = tmp_dev->end_sector; data_offset = tmp_dev->rdev->data_offset; bio->bi_bdev = tmp_dev->rdev->bdev; - if (unlikely(bio->bi_iter.bi_sector >= end_sector || - bio->bi_iter.bi_sector < start_sector)) + if (unlikely(bio_sector >= end_sector || + bio_sector < start_sector)) goto out_of_bounds; if (unlikely(bio_end_sector(bio) > end_sector)) { /* This bio crosses a device boundary, so we have to * split it. */ - split = bio_split(bio, end_sector - - bio->bi_iter.bi_sector, + split = bio_split(bio, end_sector - bio_sector, GFP_NOIO, fs_bio_set); bio_chain(split, bio); } else { @@ -256,15 +287,19 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { /* Just ignore it */ bio_endio(split); - } else + } else { + if (mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(split->bi_bdev), + split, disk_devt(mddev->gendisk), + bio_sector); + mddev_check_writesame(mddev, split); generic_make_request(split); + } } while (split != bio); return; out_of_bounds: - printk(KERN_ERR - "md/linear:%s: make_request: Sector %llu out of bounds on " - "dev %s: %llu sectors, offset %llu\n", + pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %s: %llu sectors, offset %llu\n", mdname(mddev), (unsigned long long)bio->bi_iter.bi_sector, bdevname(tmp_dev->rdev->bdev, b), @@ -275,7 +310,6 @@ out_of_bounds: static void linear_status (struct seq_file *seq, struct mddev *mddev) { - seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); } diff --git a/drivers/md/linear.h b/drivers/md/linear.h index b685ddd7d7f7..8d392e6098b3 100644 --- a/drivers/md/linear.h +++ b/drivers/md/linear.h @@ -10,6 +10,7 @@ struct linear_conf { struct rcu_head rcu; sector_t array_sectors; + int raid_disks; /* a copy of mddev->raid_disks */ struct dev_info disks[0]; }; #endif diff --git a/drivers/md/md.c b/drivers/md/md.c index 2089d46b0eb8..548d1b8014f8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -30,8 +30,21 @@ You should have received a copy of the GNU General Public License (for example /usr/src/linux/COPYING); if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + Errors, Warnings, etc. + Please use: + pr_crit() for error conditions that risk data loss + pr_err() for error conditions that are unexpected, like an IO error + or internal inconsistency + pr_warn() for error conditions that could have been predicated, like + adding a device to an array when it has incompatible metadata + pr_info() for every interesting, very rare events, like an array starting + or stopping, or resync starting or stopping + pr_debug() for everything else. + */ +#include <linux/sched/signal.h> #include <linux/kthread.h> #include <linux/blkdev.h> #include <linux/badblocks.h> @@ -52,6 +65,7 @@ #include <linux/raid/md_p.h> #include <linux/raid/md_u.h> #include <linux/slab.h> +#include <trace/events/block.h> #include "md.h" #include "bitmap.h" #include "md-cluster.h" @@ -177,16 +191,6 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, } EXPORT_SYMBOL_GPL(bio_alloc_mddev); -struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, - struct mddev *mddev) -{ - if (!mddev || !mddev->bio_set) - return bio_clone(bio, gfp_mask); - - return bio_clone_bioset(bio, gfp_mask, mddev->bio_set); -} -EXPORT_SYMBOL_GPL(bio_clone_mddev); - /* * We have a system wide 'event count' that is incremented * on any 'interesting' event, and readers of /proc/mdstat @@ -394,7 +398,7 @@ static void submit_flushes(struct work_struct *ws) bi->bi_end_io = md_end_flush; bi->bi_private = rdev; bi->bi_bdev = rdev->bdev; - bio_set_op_attrs(bi, REQ_OP_WRITE, WRITE_FLUSH); + bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; atomic_inc(&mddev->flush_pending); submit_bio(bi); rcu_read_lock(); @@ -684,11 +688,8 @@ static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) static int alloc_disk_sb(struct md_rdev *rdev) { rdev->sb_page = alloc_page(GFP_KERNEL); - if (!rdev->sb_page) { - printk(KERN_ALERT "md: out of memory.\n"); + if (!rdev->sb_page) return -ENOMEM; - } - return 0; } @@ -715,9 +716,15 @@ static void super_written(struct bio *bio) struct mddev *mddev = rdev->mddev; if (bio->bi_error) { - printk("md: super_written gets error=%d\n", bio->bi_error); + pr_err("md: super_written gets error=%d\n", bio->bi_error); md_error(mddev, rdev); - } + if (!test_bit(Faulty, &rdev->flags) + && (bio->bi_opf & MD_FAILFAST)) { + set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags); + set_bit(LastDev, &rdev->flags); + } + } else + clear_bit(LastDev, &rdev->flags); if (atomic_dec_and_test(&mddev->pending_writes)) wake_up(&mddev->sb_wait); @@ -734,7 +741,13 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, * if zero is reached. * If an error occurred, call md_error */ - struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); + struct bio *bio; + int ff = 0; + + if (test_bit(Faulty, &rdev->flags)) + return; + + bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); atomic_inc(&rdev->nr_pending); @@ -743,16 +756,24 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, bio_add_page(bio, page, size, 0); bio->bi_private = rdev; bio->bi_end_io = super_written; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH_FUA); + + if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && + test_bit(FailFast, &rdev->flags) && + !test_bit(LastDev, &rdev->flags)) + ff = MD_FAILFAST; + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA | ff; atomic_inc(&mddev->pending_writes); submit_bio(bio); } -void md_super_wait(struct mddev *mddev) +int md_super_wait(struct mddev *mddev) { /* wait for all superblock writes that were scheduled to complete */ wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); + if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) + return -EAGAIN; + return 0; } int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, @@ -795,8 +816,8 @@ static int read_disk_sb(struct md_rdev *rdev, int size) return 0; fail: - printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", - bdevname(rdev->bdev,b)); + pr_err("md: disabled device %s, could not read superblock.\n", + bdevname(rdev->bdev,b)); return -EINVAL; } @@ -818,7 +839,6 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) if (!tmp1 || !tmp2) { ret = 0; - printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n"); goto abort; } @@ -932,7 +952,7 @@ int md_check_no_bitmap(struct mddev *mddev) { if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) return 0; - printk(KERN_ERR "%s: bitmaps are not supported for %s\n", + pr_warn("%s: bitmaps are not supported for %s\n", mdname(mddev), mddev->pers->name); return 1; } @@ -956,7 +976,8 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor rdev->sb_start = calc_dev_sboffset(rdev); ret = read_disk_sb(rdev, MD_SB_BYTES); - if (ret) return ret; + if (ret) + return ret; ret = -EINVAL; @@ -964,17 +985,15 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor sb = page_address(rdev->sb_page); if (sb->md_magic != MD_SB_MAGIC) { - printk(KERN_ERR "md: invalid raid superblock magic on %s\n", - b); + pr_warn("md: invalid raid superblock magic on %s\n", b); goto abort; } if (sb->major_version != 0 || sb->minor_version < 90 || sb->minor_version > 91) { - printk(KERN_WARNING "Bad version number %d.%d on %s\n", - sb->major_version, sb->minor_version, - b); + pr_warn("Bad version number %d.%d on %s\n", + sb->major_version, sb->minor_version, b); goto abort; } @@ -982,8 +1001,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor goto abort; if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { - printk(KERN_WARNING "md: invalid superblock checksum on %s\n", - b); + pr_warn("md: invalid superblock checksum on %s\n", b); goto abort; } @@ -1004,14 +1022,13 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor __u64 ev1, ev2; mdp_super_t *refsb = page_address(refdev->sb_page); if (!uuid_equal(refsb, sb)) { - printk(KERN_WARNING "md: %s has different UUID to %s\n", + pr_warn("md: %s has different UUID to %s\n", b, bdevname(refdev->bdev,b2)); goto abort; } if (!sb_equal(refsb, sb)) { - printk(KERN_WARNING "md: %s has same UUID" - " but different superblock to %s\n", - b, bdevname(refdev->bdev, b2)); + pr_warn("md: %s has same UUID but different superblock to %s\n", + b, bdevname(refdev->bdev, b2)); goto abort; } ev1 = md_event(sb); @@ -1158,6 +1175,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) } if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) set_bit(WriteMostly, &rdev->flags); + if (desc->state & (1<<MD_DISK_FAILFAST)) + set_bit(FailFast, &rdev->flags); } else /* MULTIPATH are always insync */ set_bit(In_sync, &rdev->flags); return 0; @@ -1283,6 +1302,8 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) } if (test_bit(WriteMostly, &rdev2->flags)) d->state |= (1<<MD_DISK_WRITEMOSTLY); + if (test_bit(FailFast, &rdev2->flags)) + d->state |= (1<<MD_DISK_FAILFAST); } /* now set the "removed" and "faulty" bits on any missing devices */ for (i=0 ; i < mddev->raid_disks ; i++) { @@ -1324,9 +1345,10 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) num_sectors = (sector_t)(2ULL << 32) - 2; - md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, + do { + md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, rdev->sb_page); - md_super_wait(rdev->mddev); + } while (md_super_wait(rdev->mddev) < 0); return num_sectors; } @@ -1413,13 +1435,13 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ return -EINVAL; if (calc_sb_1_csum(sb) != sb->sb_csum) { - printk("md: invalid superblock checksum on %s\n", + pr_warn("md: invalid superblock checksum on %s\n", bdevname(rdev->bdev,b)); return -EINVAL; } if (le64_to_cpu(sb->data_size) < 10) { - printk("md: data_size too small on %s\n", - bdevname(rdev->bdev,b)); + pr_warn("md: data_size too small on %s\n", + bdevname(rdev->bdev,b)); return -EINVAL; } if (sb->pad0 || @@ -1503,8 +1525,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ sb->level != refsb->level || sb->layout != refsb->layout || sb->chunksize != refsb->chunksize) { - printk(KERN_WARNING "md: %s has strangely different" - " superblock to %s\n", + pr_warn("md: %s has strangely different superblock to %s\n", bdevname(rdev->bdev,b), bdevname(refdev->bdev,b2)); return -EINVAL; @@ -1646,8 +1667,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) case MD_DISK_ROLE_JOURNAL: /* journal device */ if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { /* journal device without journal feature */ - printk(KERN_WARNING - "md: journal device provided without journal feature, ignoring the device\n"); + pr_warn("md: journal device provided without journal feature, ignoring the device\n"); return -EINVAL; } set_bit(Journal, &rdev->flags); @@ -1669,6 +1689,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) } if (sb->devflags & WriteMostly1) set_bit(WriteMostly, &rdev->flags); + if (sb->devflags & FailFast1) + set_bit(FailFast, &rdev->flags); if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) set_bit(Replacement, &rdev->flags); } else /* MULTIPATH are always insync */ @@ -1707,6 +1729,10 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->chunksize = cpu_to_le32(mddev->chunk_sectors); sb->level = cpu_to_le32(mddev->level); sb->layout = cpu_to_le32(mddev->layout); + if (test_bit(FailFast, &rdev->flags)) + sb->devflags |= FailFast1; + else + sb->devflags &= ~FailFast1; if (test_bit(WriteMostly, &rdev->flags)) sb->devflags |= WriteMostly1; @@ -1863,9 +1889,10 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) sb->data_size = cpu_to_le64(num_sectors); sb->super_offset = rdev->sb_start; sb->sb_csum = calc_sb_1_csum(sb); - md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, - rdev->sb_page); - md_super_wait(rdev->mddev); + do { + md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, + rdev->sb_page); + } while (md_super_wait(rdev->mddev) < 0); return num_sectors; } @@ -2004,9 +2031,9 @@ int md_integrity_register(struct mddev *mddev) blk_integrity_register(mddev->gendisk, bdev_get_integrity(reference->bdev)); - printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); + pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { - printk(KERN_ERR "md: failed to create integrity pool for %s\n", + pr_err("md: failed to create integrity pool for %s\n", mdname(mddev)); return -EINVAL; } @@ -2034,8 +2061,8 @@ int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) return 0; if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { - printk(KERN_NOTICE "%s: incompatible integrity profile for %s\n", - mdname(mddev), bdevname(rdev->bdev, name)); + pr_err("%s: incompatible integrity profile for %s\n", + mdname(mddev), bdevname(rdev->bdev, name)); return -ENXIO; } @@ -2089,15 +2116,15 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) rcu_read_unlock(); if (!test_bit(Journal, &rdev->flags) && mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { - printk(KERN_WARNING "md: %s: array is limited to %d devices\n", - mdname(mddev), mddev->max_disks); + pr_warn("md: %s: array is limited to %d devices\n", + mdname(mddev), mddev->max_disks); return -EBUSY; } bdevname(rdev->bdev,b); strreplace(b, '/', '!'); rdev->mddev = mddev; - printk(KERN_INFO "md: bind<%s>\n", b); + pr_debug("md: bind<%s>\n", b); if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; @@ -2116,8 +2143,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) return 0; fail: - printk(KERN_WARNING "md: failed to register dev-%s for %s\n", - b, mdname(mddev)); + pr_warn("md: failed to register dev-%s for %s\n", + b, mdname(mddev)); return err; } @@ -2134,7 +2161,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); - printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); + pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b)); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); @@ -2164,8 +2191,7 @@ static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, shared ? (struct md_rdev *)lock_rdev : rdev); if (IS_ERR(bdev)) { - printk(KERN_ERR "md: could not open %s.\n", - __bdevname(dev, b)); + pr_warn("md: could not open %s.\n", __bdevname(dev, b)); return PTR_ERR(bdev); } rdev->bdev = bdev; @@ -2185,8 +2211,7 @@ static void export_rdev(struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; - printk(KERN_INFO "md: export_rdev(%s)\n", - bdevname(rdev->bdev,b)); + pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b)); md_rdev_clear(rdev); #ifndef MODULE if (test_bit(AutoDetected, &rdev->flags)) @@ -2288,24 +2313,24 @@ void md_update_sb(struct mddev *mddev, int force_change) if (mddev->ro) { if (force_change) - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); return; } repeat: if (mddev_is_clustered(mddev)) { - if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) + if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) force_change = 1; - if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) + if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) nospares = 1; ret = md_cluster_ops->metadata_update_start(mddev); /* Has someone else has updated the sb */ if (!does_sb_need_changing(mddev)) { if (ret == 0) md_cluster_ops->metadata_update_cancel(mddev); - bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING), - BIT(MD_CHANGE_DEVS) | - BIT(MD_CHANGE_CLEAN)); + bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), + BIT(MD_SB_CHANGE_DEVS) | + BIT(MD_SB_CHANGE_CLEAN)); return; } } @@ -2321,10 +2346,10 @@ repeat: } if (!mddev->persistent) { - clear_bit(MD_CHANGE_CLEAN, &mddev->flags); - clear_bit(MD_CHANGE_DEVS, &mddev->flags); + clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (!mddev->external) { - clear_bit(MD_CHANGE_PENDING, &mddev->flags); + clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) { rdev->badblocks.changed = 0; @@ -2344,9 +2369,9 @@ repeat: mddev->utime = ktime_get_real_seconds(); - if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) + if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) force_change = 1; - if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) + if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags)) /* just a clean<-> dirty transition, possibly leave spares alone, * though if events isn't the right even/odd, we will have to do * spares after all @@ -2402,6 +2427,9 @@ repeat: pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", mdname(mddev), mddev->in_sync); + if (mddev->queue) + blk_add_trace_msg(mddev->queue, "md md_update_sb"); +rewrite: bitmap_update_sb(mddev->bitmap); rdev_for_each(rdev, mddev) { char b[BDEVNAME_SIZE]; @@ -2433,15 +2461,16 @@ repeat: /* only need to write one superblock... */ break; } - md_super_wait(mddev); - /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ + if (md_super_wait(mddev) < 0) + goto rewrite; + /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */ if (mddev_is_clustered(mddev) && ret == 0) md_cluster_ops->metadata_update_finish(mddev); if (mddev->in_sync != sync_req || - !bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING), - BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_CLEAN))) + !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING), + BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN))) /* have to write it out again */ goto repeat; wake_up(&mddev->sb_wait); @@ -2485,7 +2514,7 @@ static int add_bound_rdev(struct md_rdev *rdev) } sysfs_notify_dirent_safe(rdev->sysfs_state); - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (mddev->degraded) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -2523,51 +2552,41 @@ struct rdev_sysfs_entry { static ssize_t state_show(struct md_rdev *rdev, char *page) { - char *sep = ""; + char *sep = ","; size_t len = 0; unsigned long flags = ACCESS_ONCE(rdev->flags); if (test_bit(Faulty, &flags) || - rdev->badblocks.unacked_exist) { - len+= sprintf(page+len, "%sfaulty",sep); - sep = ","; - } - if (test_bit(In_sync, &flags)) { - len += sprintf(page+len, "%sin_sync",sep); - sep = ","; - } - if (test_bit(Journal, &flags)) { - len += sprintf(page+len, "%sjournal",sep); - sep = ","; - } - if (test_bit(WriteMostly, &flags)) { - len += sprintf(page+len, "%swrite_mostly",sep); - sep = ","; - } + (!test_bit(ExternalBbl, &flags) && + rdev->badblocks.unacked_exist)) + len += sprintf(page+len, "faulty%s", sep); + if (test_bit(In_sync, &flags)) + len += sprintf(page+len, "in_sync%s", sep); + if (test_bit(Journal, &flags)) + len += sprintf(page+len, "journal%s", sep); + if (test_bit(WriteMostly, &flags)) + len += sprintf(page+len, "write_mostly%s", sep); if (test_bit(Blocked, &flags) || (rdev->badblocks.unacked_exist - && !test_bit(Faulty, &flags))) { - len += sprintf(page+len, "%sblocked", sep); - sep = ","; - } + && !test_bit(Faulty, &flags))) + len += sprintf(page+len, "blocked%s", sep); if (!test_bit(Faulty, &flags) && !test_bit(Journal, &flags) && - !test_bit(In_sync, &flags)) { - len += sprintf(page+len, "%sspare", sep); - sep = ","; - } - if (test_bit(WriteErrorSeen, &flags)) { - len += sprintf(page+len, "%swrite_error", sep); - sep = ","; - } - if (test_bit(WantReplacement, &flags)) { - len += sprintf(page+len, "%swant_replacement", sep); - sep = ","; - } - if (test_bit(Replacement, &flags)) { - len += sprintf(page+len, "%sreplacement", sep); - sep = ","; - } + !test_bit(In_sync, &flags)) + len += sprintf(page+len, "spare%s", sep); + if (test_bit(WriteErrorSeen, &flags)) + len += sprintf(page+len, "write_error%s", sep); + if (test_bit(WantReplacement, &flags)) + len += sprintf(page+len, "want_replacement%s", sep); + if (test_bit(Replacement, &flags)) + len += sprintf(page+len, "replacement%s", sep); + if (test_bit(ExternalBbl, &flags)) + len += sprintf(page+len, "external_bbl%s", sep); + if (test_bit(FailFast, &flags)) + len += sprintf(page+len, "failfast%s", sep); + + if (len) + len -= strlen(sep); return len+sprintf(page+len, "\n"); } @@ -2587,6 +2606,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) * so that it gets rebuilt based on bitmap * write_error - sets WriteErrorSeen * -write_error - clears WriteErrorSeen + * {,-}failfast - set/clear FailFast */ int err = -EINVAL; if (cmd_match(buf, "faulty") && rdev->mddev->pers) { @@ -2610,8 +2630,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) if (err == 0) { md_kick_rdev_from_array(rdev); - if (mddev->pers) - md_update_sb(mddev, 1); + if (mddev->pers) { + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + md_wakeup_thread(mddev->thread); + } md_new_event(mddev); } } @@ -2626,6 +2648,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) err = 0; } else if (cmd_match(buf, "-blocked")) { if (!test_bit(Faulty, &rdev->flags) && + !test_bit(ExternalBbl, &rdev->flags) && rdev->badblocks.unacked_exist) { /* metadata handler doesn't understand badblocks, * so we need to fail the device @@ -2642,6 +2665,12 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { set_bit(In_sync, &rdev->flags); err = 0; + } else if (cmd_match(buf, "failfast")) { + set_bit(FailFast, &rdev->flags); + err = 0; + } else if (cmd_match(buf, "-failfast")) { + clear_bit(FailFast, &rdev->flags); + err = 0; } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags)) { if (rdev->mddev->pers == NULL) { @@ -2708,6 +2737,13 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } } else err = -EBUSY; + } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) { + set_bit(ExternalBbl, &rdev->flags); + rdev->badblocks.shift = 0; + err = 0; + } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) { + clear_bit(ExternalBbl, &rdev->flags); + err = 0; } if (!err) sysfs_notify_dirent_safe(rdev->sysfs_state); @@ -3211,10 +3247,8 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe sector_t size; rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); - if (!rdev) { - printk(KERN_ERR "md: could not alloc mem for new device!\n"); + if (!rdev) return ERR_PTR(-ENOMEM); - } err = md_rdev_init(rdev); if (err) @@ -3231,8 +3265,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; if (!size) { - printk(KERN_WARNING - "md: %s has zero or unknown size, marking faulty!\n", + pr_warn("md: %s has zero or unknown size, marking faulty!\n", bdevname(rdev->bdev,b)); err = -EINVAL; goto abort_free; @@ -3242,16 +3275,13 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe err = super_types[super_format]. load_super(rdev, NULL, super_minor); if (err == -EINVAL) { - printk(KERN_WARNING - "md: %s does not have a valid v%d.%d " - "superblock, not importing!\n", + pr_warn("md: %s does not have a valid v%d.%d superblock, not importing!\n", bdevname(rdev->bdev,b), - super_format, super_minor); + super_format, super_minor); goto abort_free; } if (err < 0) { - printk(KERN_WARNING - "md: could not read %s's sb, not importing!\n", + pr_warn("md: could not read %s's sb, not importing!\n", bdevname(rdev->bdev,b)); goto abort_free; } @@ -3287,9 +3317,7 @@ static void analyze_sbs(struct mddev *mddev) case 0: break; default: - printk( KERN_ERR \ - "md: fatal superblock inconsistency in %s" - " -- removing from array\n", + pr_warn("md: fatal superblock inconsistency in %s -- removing from array\n", bdevname(rdev->bdev,b)); md_kick_rdev_from_array(rdev); } @@ -3302,18 +3330,16 @@ static void analyze_sbs(struct mddev *mddev) if (mddev->max_disks && (rdev->desc_nr >= mddev->max_disks || i > mddev->max_disks)) { - printk(KERN_WARNING - "md: %s: %s: only %d devices permitted\n", - mdname(mddev), bdevname(rdev->bdev, b), - mddev->max_disks); + pr_warn("md: %s: %s: only %d devices permitted\n", + mdname(mddev), bdevname(rdev->bdev, b), + mddev->max_disks); md_kick_rdev_from_array(rdev); continue; } if (rdev != freshest) { if (super_types[mddev->major_version]. validate_super(mddev, rdev)) { - printk(KERN_WARNING "md: kicking non-fresh %s" - " from array!\n", + pr_warn("md: kicking non-fresh %s from array!\n", bdevname(rdev->bdev,b)); md_kick_rdev_from_array(rdev); continue; @@ -3384,7 +3410,7 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) unsigned long msec; if (mddev_is_clustered(mddev)) { - pr_info("md: Safemode is disabled for clustered mode\n"); + pr_warn("md: Safemode is disabled for clustered mode\n"); return -EINVAL; } @@ -3472,8 +3498,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) rv = -EINVAL; if (!mddev->pers->quiesce) { - printk(KERN_WARNING "md: %s: %s does not support online personality change\n", - mdname(mddev), mddev->pers->name); + pr_warn("md: %s: %s does not support online personality change\n", + mdname(mddev), mddev->pers->name); goto out_unlock; } @@ -3491,7 +3517,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) pers = find_pers(level, clevel); if (!pers || !try_module_get(pers->owner)) { spin_unlock(&pers_lock); - printk(KERN_WARNING "md: personality %s not loaded\n", clevel); + pr_warn("md: personality %s not loaded\n", clevel); rv = -EINVAL; goto out_unlock; } @@ -3505,8 +3531,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) } if (!pers->takeover) { module_put(pers->owner); - printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", - mdname(mddev), clevel); + pr_warn("md: %s: %s does not support personality takeover\n", + mdname(mddev), clevel); rv = -EINVAL; goto out_unlock; } @@ -3526,8 +3552,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->delta_disks = 0; mddev->reshape_backwards = 0; module_put(pers->owner); - printk(KERN_WARNING "md: %s: %s would not accept array\n", - mdname(mddev), clevel); + pr_warn("md: %s: %s would not accept array\n", + mdname(mddev), clevel); rv = PTR_ERR(priv); goto out_unlock; } @@ -3570,9 +3596,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) pers->sync_request != NULL) { /* need to add the md_redundancy_group */ if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) - printk(KERN_WARNING - "md: cannot register extra attributes for %s\n", - mdname(mddev)); + pr_warn("md: cannot register extra attributes for %s\n", + mdname(mddev)); mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); } if (oldpers->sync_request != NULL && @@ -3603,9 +3628,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) clear_bit(In_sync, &rdev->flags); else { if (sysfs_link_rdev(mddev, rdev)) - printk(KERN_WARNING "md: cannot register rd%d" - " for %s after level change\n", - rdev->raid_disk, mdname(mddev)); + pr_warn("md: cannot register rd%d for %s after level change\n", + rdev->raid_disk, mdname(mddev)); } } @@ -3618,7 +3642,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) } blk_set_stacking_limits(&mddev->queue->limits); pers->run(mddev); - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); mddev_resume(mddev); if (!mddev->thread) md_update_sb(mddev, 1); @@ -3813,7 +3837,7 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len) if (!err) { mddev->recovery_cp = n; if (mddev->pers) - set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); } mddev_unlock(mddev); return err ?: len; @@ -3887,7 +3911,7 @@ array_state_show(struct mddev *mddev, char *page) st = read_auto; break; case 0: - if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) + if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) st = write_pending; else if (mddev->in_sync) st = clean; @@ -3925,7 +3949,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) spin_lock(&mddev->lock); if (st == active) { restart_array(mddev); - clear_bit(MD_CHANGE_PENDING, &mddev->flags); + clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); + md_wakeup_thread(mddev->thread); wake_up(&mddev->sb_wait); err = 0; } else /* st == clean */ { @@ -3935,7 +3960,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) mddev->in_sync = 1; if (mddev->safemode == 1) mddev->safemode = 0; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); } err = 0; } else @@ -4001,7 +4026,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) mddev->in_sync = 1; if (mddev->safemode == 1) mddev->safemode = 0; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); } err = 0; } else @@ -4015,7 +4040,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) err = restart_array(mddev); if (err) break; - clear_bit(MD_CHANGE_PENDING, &mddev->flags); + clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); wake_up(&mddev->sb_wait); err = 0; } else { @@ -5071,13 +5096,13 @@ static int md_alloc(dev_t dev, char *name) /* This isn't possible, but as kobject_init_and_add is marked * __must_check, we must do something with the result */ - printk(KERN_WARNING "md: cannot register %s/md - name in use\n", - disk->disk_name); + pr_debug("md: cannot register %s/md - name in use\n", + disk->disk_name); error = 0; } if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &md_bitmap_group)) - printk(KERN_DEBUG "pointless warning\n"); + pr_debug("pointless warning\n"); mutex_unlock(&mddev->open_mutex); abort: mutex_unlock(&disks_mutex); @@ -5179,34 +5204,37 @@ int md_run(struct mddev *mddev) if (mddev->dev_sectors && rdev->data_offset + mddev->dev_sectors > rdev->sb_start) { - printk("md: %s: data overlaps metadata\n", - mdname(mddev)); + pr_warn("md: %s: data overlaps metadata\n", + mdname(mddev)); return -EINVAL; } } else { if (rdev->sb_start + rdev->sb_size/512 > rdev->data_offset) { - printk("md: %s: metadata overlaps data\n", - mdname(mddev)); + pr_warn("md: %s: metadata overlaps data\n", + mdname(mddev)); return -EINVAL; } } sysfs_notify_dirent_safe(rdev->sysfs_state); } - if (mddev->bio_set == NULL) + if (mddev->bio_set == NULL) { mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); + if (!mddev->bio_set) + return -ENOMEM; + } spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); if (!pers || !try_module_get(pers->owner)) { spin_unlock(&pers_lock); if (mddev->level != LEVEL_NONE) - printk(KERN_WARNING "md: personality for level %d is not loaded!\n", - mddev->level); + pr_warn("md: personality for level %d is not loaded!\n", + mddev->level); else - printk(KERN_WARNING "md: personality for level %s is not loaded!\n", - mddev->clevel); + pr_warn("md: personality for level %s is not loaded!\n", + mddev->clevel); return -EINVAL; } spin_unlock(&pers_lock); @@ -5236,21 +5264,16 @@ int md_run(struct mddev *mddev) if (rdev < rdev2 && rdev->bdev->bd_contains == rdev2->bdev->bd_contains) { - printk(KERN_WARNING - "%s: WARNING: %s appears to be" - " on the same physical disk as" - " %s.\n", - mdname(mddev), - bdevname(rdev->bdev,b), - bdevname(rdev2->bdev,b2)); + pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n", + mdname(mddev), + bdevname(rdev->bdev,b), + bdevname(rdev2->bdev,b2)); warned = 1; } } if (warned) - printk(KERN_WARNING - "True protection against single-disk" - " failure might be compromised.\n"); + pr_warn("True protection against single-disk failure might be compromised.\n"); } mddev->recovery = 0; @@ -5262,16 +5285,21 @@ int md_run(struct mddev *mddev) if (start_readonly && mddev->ro == 0) mddev->ro = 2; /* read-only, but switch on first write */ + /* + * NOTE: some pers->run(), for example r5l_recovery_log(), wakes + * up mddev->thread. It is important to initialize critical + * resources for mddev->thread BEFORE calling pers->run(). + */ err = pers->run(mddev); if (err) - printk(KERN_ERR "md: pers->run() failed ...\n"); + pr_warn("md: pers->run() failed ...\n"); else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { - WARN_ONCE(!mddev->external_size, "%s: default size too small," - " but 'external_size' not in effect?\n", __func__); - printk(KERN_ERR - "md: invalid array_size %llu > default size %llu\n", - (unsigned long long)mddev->array_sectors / 2, - (unsigned long long)pers->size(mddev, 0, 0) / 2); + WARN_ONCE(!mddev->external_size, + "%s: default size too small, but 'external_size' not in effect?\n", + __func__); + pr_warn("md: invalid array_size %llu > default size %llu\n", + (unsigned long long)mddev->array_sectors / 2, + (unsigned long long)pers->size(mddev, 0, 0) / 2); err = -EINVAL; } if (err == 0 && pers->sync_request && @@ -5281,8 +5309,8 @@ int md_run(struct mddev *mddev) bitmap = bitmap_create(mddev, -1); if (IS_ERR(bitmap)) { err = PTR_ERR(bitmap); - printk(KERN_ERR "%s: failed to create bitmap (%d)\n", - mdname(mddev), err); + pr_warn("%s: failed to create bitmap (%d)\n", + mdname(mddev), err); } else mddev->bitmap = bitmap; @@ -5312,15 +5340,14 @@ int md_run(struct mddev *mddev) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mddev->queue); else queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, mddev->queue); - mddev->queue->backing_dev_info.congested_data = mddev; - mddev->queue->backing_dev_info.congested_fn = md_congested; + mddev->queue->backing_dev_info->congested_data = mddev; + mddev->queue->backing_dev_info->congested_fn = md_congested; } if (pers->sync_request) { if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &md_redundancy_group)) - printk(KERN_WARNING - "md: cannot register extra attributes for %s\n", - mdname(mddev)); + pr_warn("md: cannot register extra attributes for %s\n", + mdname(mddev)); mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); } else if (mddev->ro == 2) /* auto-readonly not meaningful */ mddev->ro = 0; @@ -5350,7 +5377,7 @@ int md_run(struct mddev *mddev) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - if (mddev->flags & MD_UPDATE_SB_FLAGS) + if (mddev->sb_flags) md_update_sb(mddev, 0); md_new_event(mddev); @@ -5421,8 +5448,7 @@ static int restart_array(struct mddev *mddev) mddev->safemode = 0; mddev->ro = 0; set_disk_ro(disk, 0); - printk(KERN_INFO "md: %s switched to read-write mode.\n", - mdname(mddev)); + pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); /* Kick recovery or resync if necessary */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -5446,6 +5472,7 @@ static void md_clean(struct mddev *mddev) mddev->level = LEVEL_NONE; mddev->clevel[0] = 0; mddev->flags = 0; + mddev->sb_flags = 0; mddev->ro = 0; mddev->metadata_type[0] = 0; mddev->chunk_sectors = 0; @@ -5490,12 +5517,15 @@ static void __md_stop_writes(struct mddev *mddev) del_timer_sync(&mddev->safemode_timer); + if (mddev->pers && mddev->pers->quiesce) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } bitmap_flush(mddev); - md_super_wait(mddev); if (mddev->ro == 0 && ((!mddev->in_sync && !mddev_is_clustered(mddev)) || - (mddev->flags & MD_UPDATE_SB_FLAGS))) { + mddev->sb_flags)) { /* mark array as shutdown cleanly */ if (!mddev_is_clustered(mddev)) mddev->in_sync = 1; @@ -5516,8 +5546,8 @@ static void mddev_detach(struct mddev *mddev) struct bitmap *bitmap = mddev->bitmap; /* wait for behind writes to complete */ if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { - printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n", - mdname(mddev)); + pr_debug("md:%s: behind writes in progress - waiting to stop.\n", + mdname(mddev)); /* need to kick something here to make sure I/O goes? */ wait_event(bitmap->behind_wait, atomic_read(&bitmap->behind_writes) == 0); @@ -5578,20 +5608,20 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) * which will now never happen */ wake_up_process(mddev->sync_thread->tsk); - if (mddev->external && test_bit(MD_CHANGE_PENDING, &mddev->flags)) + if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) return -EBUSY; mddev_unlock(mddev); wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); wait_event(mddev->sb_wait, - !test_bit(MD_CHANGE_PENDING, &mddev->flags)); + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); mddev_lock_nointr(mddev); mutex_lock(&mddev->open_mutex); if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || mddev->sync_thread || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { - printk("md: %s still in use.\n",mdname(mddev)); + pr_warn("md: %s still in use.\n",mdname(mddev)); if (did_freeze) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -5653,7 +5683,7 @@ static int do_md_stop(struct mddev *mddev, int mode, mddev->sysfs_active || mddev->sync_thread || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { - printk("md: %s still in use.\n",mdname(mddev)); + pr_warn("md: %s still in use.\n",mdname(mddev)); mutex_unlock(&mddev->open_mutex); if (did_freeze) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); @@ -5668,7 +5698,7 @@ static int do_md_stop(struct mddev *mddev, int mode, __md_stop_writes(mddev); __md_stop(mddev); - mddev->queue->backing_dev_info.congested_fn = NULL; + mddev->queue->backing_dev_info->congested_fn = NULL; /* tell userspace to handle 'inactive' */ sysfs_notify_dirent_safe(mddev->sysfs_state); @@ -5690,7 +5720,7 @@ static int do_md_stop(struct mddev *mddev, int mode, * Free resources if final stop */ if (mode == 0) { - printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); + pr_info("md: %s stopped.\n", mdname(mddev)); bitmap_destroy(mddev); if (mddev->bitmap_info.file) { @@ -5722,17 +5752,17 @@ static void autorun_array(struct mddev *mddev) if (list_empty(&mddev->disks)) return; - printk(KERN_INFO "md: running: "); + pr_info("md: running: "); rdev_for_each(rdev, mddev) { char b[BDEVNAME_SIZE]; - printk("<%s>", bdevname(rdev->bdev,b)); + pr_cont("<%s>", bdevname(rdev->bdev,b)); } - printk("\n"); + pr_cont("\n"); err = do_md_run(mddev); if (err) { - printk(KERN_WARNING "md: do_md_run() returned %d\n", err); + pr_warn("md: do_md_run() returned %d\n", err); do_md_stop(mddev, 0, NULL); } } @@ -5755,7 +5785,7 @@ static void autorun_devices(int part) struct mddev *mddev; char b[BDEVNAME_SIZE]; - printk(KERN_INFO "md: autorun ...\n"); + pr_info("md: autorun ...\n"); while (!list_empty(&pending_raid_disks)) { int unit; dev_t dev; @@ -5763,13 +5793,12 @@ static void autorun_devices(int part) rdev0 = list_entry(pending_raid_disks.next, struct md_rdev, same_set); - printk(KERN_INFO "md: considering %s ...\n", - bdevname(rdev0->bdev,b)); + pr_debug("md: considering %s ...\n", bdevname(rdev0->bdev,b)); INIT_LIST_HEAD(&candidates); rdev_for_each_list(rdev, tmp, &pending_raid_disks) if (super_90_load(rdev, rdev0, 0) >= 0) { - printk(KERN_INFO "md: adding %s ...\n", - bdevname(rdev->bdev,b)); + pr_debug("md: adding %s ...\n", + bdevname(rdev->bdev,b)); list_move(&rdev->same_set, &candidates); } /* @@ -5786,8 +5815,8 @@ static void autorun_devices(int part) unit = MINOR(dev); } if (rdev0->preferred_minor != unit) { - printk(KERN_INFO "md: unit number in %s is bad: %d\n", - bdevname(rdev0->bdev, b), rdev0->preferred_minor); + pr_warn("md: unit number in %s is bad: %d\n", + bdevname(rdev0->bdev, b), rdev0->preferred_minor); break; } @@ -5796,21 +5825,17 @@ static void autorun_devices(int part) if (!mddev || !mddev->gendisk) { if (mddev) mddev_put(mddev); - printk(KERN_ERR - "md: cannot allocate memory for md drive.\n"); break; } if (mddev_lock(mddev)) - printk(KERN_WARNING "md: %s locked, cannot run\n", - mdname(mddev)); + pr_warn("md: %s locked, cannot run\n", mdname(mddev)); else if (mddev->raid_disks || mddev->major_version || !list_empty(&mddev->disks)) { - printk(KERN_WARNING - "md: %s already running, cannot run %s\n", + pr_warn("md: %s already running, cannot run %s\n", mdname(mddev), bdevname(rdev0->bdev,b)); mddev_unlock(mddev); } else { - printk(KERN_INFO "md: created %s\n", mdname(mddev)); + pr_debug("md: created %s\n", mdname(mddev)); mddev->persistent = 1; rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); @@ -5829,7 +5854,7 @@ static void autorun_devices(int part) } mddev_put(mddev); } - printk(KERN_INFO "md: ... autorun DONE.\n"); + pr_info("md: ... autorun DONE.\n"); } #endif /* !MODULE */ @@ -5964,6 +5989,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg) info.state |= (1<<MD_DISK_JOURNAL); if (test_bit(WriteMostly, &rdev->flags)) info.state |= (1<<MD_DISK_WRITEMOSTLY); + if (test_bit(FailFast, &rdev->flags)) + info.state |= (1<<MD_DISK_FAILFAST); } else { info.major = info.minor = 0; info.raid_disk = -1; @@ -5985,8 +6012,8 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) if (mddev_is_clustered(mddev) && !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) { - pr_err("%s: Cannot add to clustered mddev.\n", - mdname(mddev)); + pr_warn("%s: Cannot add to clustered mddev.\n", + mdname(mddev)); return -EINVAL; } @@ -5998,8 +6025,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) /* expecting a device which has a superblock */ rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); if (IS_ERR(rdev)) { - printk(KERN_WARNING - "md: md_import_device returned %ld\n", + pr_warn("md: md_import_device returned %ld\n", PTR_ERR(rdev)); return PTR_ERR(rdev); } @@ -6010,8 +6036,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) err = super_types[mddev->major_version] .load_super(rdev, rdev0, mddev->minor_version); if (err < 0) { - printk(KERN_WARNING - "md: %s has different UUID to %s\n", + pr_warn("md: %s has different UUID to %s\n", bdevname(rdev->bdev,b), bdevname(rdev0->bdev,b2)); export_rdev(rdev); @@ -6032,9 +6057,8 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) if (mddev->pers) { int err; if (!mddev->pers->hot_add_disk) { - printk(KERN_WARNING - "%s: personality does not support diskops!\n", - mdname(mddev)); + pr_warn("%s: personality does not support diskops!\n", + mdname(mddev)); return -EINVAL; } if (mddev->persistent) @@ -6043,8 +6067,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) else rdev = md_import_device(dev, -1, -1); if (IS_ERR(rdev)) { - printk(KERN_WARNING - "md: md_import_device returned %ld\n", + pr_warn("md: md_import_device returned %ld\n", PTR_ERR(rdev)); return PTR_ERR(rdev); } @@ -6075,6 +6098,10 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) set_bit(WriteMostly, &rdev->flags); else clear_bit(WriteMostly, &rdev->flags); + if (info->state & (1<<MD_DISK_FAILFAST)) + set_bit(FailFast, &rdev->flags); + else + clear_bit(FailFast, &rdev->flags); if (info->state & (1<<MD_DISK_JOURNAL)) { struct md_rdev *rdev2; @@ -6140,8 +6167,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) * for major_version==0 superblocks */ if (mddev->major_version != 0) { - printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", - mdname(mddev)); + pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev)); return -EINVAL; } @@ -6149,8 +6175,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) int err; rdev = md_import_device(dev, -1, 0); if (IS_ERR(rdev)) { - printk(KERN_WARNING - "md: error, md_import_device() returned %ld\n", + pr_warn("md: error, md_import_device() returned %ld\n", PTR_ERR(rdev)); return PTR_ERR(rdev); } @@ -6166,9 +6191,11 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) if (info->state & (1<<MD_DISK_WRITEMOSTLY)) set_bit(WriteMostly, &rdev->flags); + if (info->state & (1<<MD_DISK_FAILFAST)) + set_bit(FailFast, &rdev->flags); if (!mddev->persistent) { - printk(KERN_INFO "md: nonpersistent superblock ...\n"); + pr_debug("md: nonpersistent superblock ...\n"); rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; } else rdev->sb_start = calc_dev_sboffset(rdev); @@ -6207,13 +6234,17 @@ kick_rdev: md_cluster_ops->remove_disk(mddev, rdev); md_kick_rdev_from_array(rdev); - md_update_sb(mddev, 1); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + if (mddev->thread) + md_wakeup_thread(mddev->thread); + else + md_update_sb(mddev, 1); md_new_event(mddev); return 0; busy: - printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", - bdevname(rdev->bdev,b), mdname(mddev)); + pr_debug("md: cannot remove active disk %s from %s ...\n", + bdevname(rdev->bdev,b), mdname(mddev)); return -EBUSY; } @@ -6227,22 +6258,19 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) return -ENODEV; if (mddev->major_version != 0) { - printk(KERN_WARNING "%s: HOT_ADD may only be used with" - " version-0 superblocks.\n", + pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n", mdname(mddev)); return -EINVAL; } if (!mddev->pers->hot_add_disk) { - printk(KERN_WARNING - "%s: personality does not support diskops!\n", + pr_warn("%s: personality does not support diskops!\n", mdname(mddev)); return -EINVAL; } rdev = md_import_device(dev, -1, 0); if (IS_ERR(rdev)) { - printk(KERN_WARNING - "md: error, md_import_device() returned %ld\n", + pr_warn("md: error, md_import_device() returned %ld\n", PTR_ERR(rdev)); return -EINVAL; } @@ -6255,8 +6283,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) rdev->sectors = rdev->sb_start; if (test_bit(Faulty, &rdev->flags)) { - printk(KERN_WARNING - "md: can not hot-add faulty %s disk to %s!\n", + pr_warn("md: can not hot-add faulty %s disk to %s!\n", bdevname(rdev->bdev,b), mdname(mddev)); err = -EINVAL; goto abort_export; @@ -6276,7 +6303,9 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) rdev->raid_disk = -1; - md_update_sb(mddev, 1); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + if (!mddev->thread) + md_update_sb(mddev, 1); /* * Kick recovery, maybe this spare has to be added to the * array immediately. @@ -6312,23 +6341,23 @@ static int set_bitmap_file(struct mddev *mddev, int fd) f = fget(fd); if (f == NULL) { - printk(KERN_ERR "%s: error: failed to get bitmap file\n", - mdname(mddev)); + pr_warn("%s: error: failed to get bitmap file\n", + mdname(mddev)); return -EBADF; } inode = f->f_mapping->host; if (!S_ISREG(inode->i_mode)) { - printk(KERN_ERR "%s: error: bitmap file must be a regular file\n", - mdname(mddev)); + pr_warn("%s: error: bitmap file must be a regular file\n", + mdname(mddev)); err = -EBADF; } else if (!(f->f_mode & FMODE_WRITE)) { - printk(KERN_ERR "%s: error: bitmap file must open for write\n", - mdname(mddev)); + pr_warn("%s: error: bitmap file must open for write\n", + mdname(mddev)); err = -EBADF; } else if (atomic_read(&inode->i_writecount) != 1) { - printk(KERN_ERR "%s: error: bitmap file is already in use\n", - mdname(mddev)); + pr_warn("%s: error: bitmap file is already in use\n", + mdname(mddev)); err = -EBUSY; } if (err) { @@ -6393,8 +6422,7 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) info->major_version >= ARRAY_SIZE(super_types) || super_types[info->major_version].name == NULL) { /* maybe try to auto-load a module? */ - printk(KERN_INFO - "md: superblock version %d not known\n", + pr_warn("md: superblock version %d not known\n", info->major_version); return -EINVAL; } @@ -6432,9 +6460,11 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->max_disks = MD_SB_DISKS; - if (mddev->persistent) + if (mddev->persistent) { mddev->flags = 0; - set_bit(MD_CHANGE_DEVS, &mddev->flags); + mddev->sb_flags = 0; + } + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); @@ -6660,8 +6690,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) if (mddev->bitmap_info.nodes) { /* hold PW on all the bitmap lock */ if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { - printk("md: can't change bitmap to none since the" - " array is in use by more than one node\n"); + pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n"); rv = -EPERM; md_cluster_ops->unlock_all_bitmaps(mddev); goto err; @@ -6829,7 +6858,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, /* need to ensure recovery thread has run */ wait_event_interruptible_timeout(mddev->sb_wait, !test_bit(MD_RECOVERY_NEEDED, - &mddev->flags), + &mddev->recovery), msecs_to_jiffies(5000)); if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) { /* Need to flush page cache, and ensure no-one else opens @@ -6847,9 +6876,8 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, } err = mddev_lock(mddev); if (err) { - printk(KERN_INFO - "md: ioctl lock interrupted, reason %d, cmd %d\n", - err, cmd); + pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n", + err, cmd); goto out; } @@ -6864,30 +6892,24 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, if (mddev->pers) { err = update_array_info(mddev, &info); if (err) { - printk(KERN_WARNING "md: couldn't update" - " array info. %d\n", err); + pr_warn("md: couldn't update array info. %d\n", err); goto unlock; } goto unlock; } if (!list_empty(&mddev->disks)) { - printk(KERN_WARNING - "md: array %s already has disks!\n", - mdname(mddev)); + pr_warn("md: array %s already has disks!\n", mdname(mddev)); err = -EBUSY; goto unlock; } if (mddev->raid_disks) { - printk(KERN_WARNING - "md: array %s already initialised!\n", - mdname(mddev)); + pr_warn("md: array %s already initialised!\n", mdname(mddev)); err = -EBUSY; goto unlock; } err = set_array_info(mddev, &info); if (err) { - printk(KERN_WARNING "md: couldn't set" - " array info. %d\n", err); + pr_warn("md: couldn't set array info. %d\n", err); goto unlock; } goto unlock; @@ -6987,11 +7009,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, /* If a device failed while we were read-only, we * need to make sure the metadata is updated now. */ - if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) { + if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) { mddev_unlock(mddev); wait_event(mddev->sb_wait, - !test_bit(MD_CHANGE_DEVS, &mddev->flags) && - !test_bit(MD_CHANGE_PENDING, &mddev->flags)); + !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) && + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); mddev_lock_nointr(mddev); } } else { @@ -7092,7 +7114,8 @@ static int md_open(struct block_device *bdev, fmode_t mode) if (test_bit(MD_CLOSING, &mddev->flags)) { mutex_unlock(&mddev->open_mutex); - return -ENODEV; + err = -ENODEV; + goto out; } err = 0; @@ -7101,6 +7124,8 @@ static int md_open(struct block_device *bdev, fmode_t mode) check_disk_change(bdev); out: + if (err) + mddev_put(mddev); return err; } @@ -7171,10 +7196,12 @@ static int md_thread(void *arg) wait_event_interruptible_timeout (thread->wqueue, test_bit(THREAD_WAKEUP, &thread->flags) - || kthread_should_stop(), + || kthread_should_stop() || kthread_should_park(), thread->timeout); clear_bit(THREAD_WAKEUP, &thread->flags); + if (kthread_should_park()) + kthread_parkme(); if (!kthread_should_stop()) thread->run(thread); } @@ -7588,8 +7615,8 @@ static const struct file_operations md_seq_fops = { int register_md_personality(struct md_personality *p) { - printk(KERN_INFO "md: %s personality registered for level %d\n", - p->name, p->level); + pr_debug("md: %s personality registered for level %d\n", + p->name, p->level); spin_lock(&pers_lock); list_add_tail(&p->list, &pers_list); spin_unlock(&pers_lock); @@ -7599,7 +7626,7 @@ EXPORT_SYMBOL(register_md_personality); int unregister_md_personality(struct md_personality *p) { - printk(KERN_INFO "md: %s personality unregistered\n", p->name); + pr_debug("md: %s personality unregistered\n", p->name); spin_lock(&pers_lock); list_del_init(&p->list); spin_unlock(&pers_lock); @@ -7639,7 +7666,7 @@ int md_setup_cluster(struct mddev *mddev, int nodes) spin_lock(&pers_lock); /* ensure module won't be unloaded */ if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { - pr_err("can't find md-cluster module or get it's reference.\n"); + pr_warn("can't find md-cluster module or get it's reference.\n"); spin_unlock(&pers_lock); return -ENOENT; } @@ -7741,8 +7768,8 @@ void md_write_start(struct mddev *mddev, struct bio *bi) spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); md_wakeup_thread(mddev->thread); did_change = 1; } @@ -7751,7 +7778,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi) if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); wait_event(mddev->sb_wait, - !test_bit(MD_CHANGE_PENDING, &mddev->flags)); + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); } EXPORT_SYMBOL(md_write_start); @@ -7772,7 +7799,7 @@ EXPORT_SYMBOL(md_write_end); * attempting a GFP_KERNEL allocation while holding the mddev lock. * Must be called with mddev_lock held. * - * In the ->external case MD_CHANGE_PENDING can not be cleared until mddev->lock + * In the ->external case MD_SB_CHANGE_PENDING can not be cleared until mddev->lock * is dropped, so return -EAGAIN after notifying userspace. */ int md_allow_write(struct mddev *mddev) @@ -7787,8 +7814,8 @@ int md_allow_write(struct mddev *mddev) spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); if (mddev->safemode_delay && mddev->safemode == 0) mddev->safemode = 1; @@ -7798,7 +7825,7 @@ int md_allow_write(struct mddev *mddev) } else spin_unlock(&mddev->lock); - if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) + if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) return -EAGAIN; else return 0; @@ -7914,11 +7941,9 @@ void md_do_sync(struct md_thread *thread) mddev2->curr_resync >= mddev->curr_resync) { if (mddev2_minor != mddev2->md_minor) { mddev2_minor = mddev2->md_minor; - printk(KERN_INFO "md: delaying %s of %s" - " until %s has finished (they" - " share one or more physical units)\n", - desc, mdname(mddev), - mdname(mddev2)); + pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n", + desc, mdname(mddev), + mdname(mddev2)); } mddev_put(mddev2); if (signal_pending(current)) @@ -7975,12 +8000,10 @@ void md_do_sync(struct md_thread *thread) } } - printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); - printk(KERN_INFO "md: minimum _guaranteed_ speed:" - " %d KB/sec/disk.\n", speed_min(mddev)); - printk(KERN_INFO "md: using maximum available idle IO bandwidth " - "(but not more than %d KB/sec) for %s.\n", - speed_max(mddev), desc); + pr_info("md: %s of RAID array %s\n", desc, mdname(mddev)); + pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev)); + pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n", + speed_max(mddev), desc); is_mddev_idle(mddev, 1); /* this initializes IO event counters */ @@ -7997,16 +8020,15 @@ void md_do_sync(struct md_thread *thread) * Tune reconstruction: */ window = 32*(PAGE_SIZE/512); - printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n", - window/2, (unsigned long long)max_sectors/2); + pr_debug("md: using %dk window, over a total of %lluk.\n", + window/2, (unsigned long long)max_sectors/2); atomic_set(&mddev->recovery_active, 0); last_check = 0; if (j>2) { - printk(KERN_INFO - "md: resuming %s of %s from checkpoint.\n", - desc, mdname(mddev)); + pr_debug("md: resuming %s of %s from checkpoint.\n", + desc, mdname(mddev)); mddev->curr_resync = j; } else mddev->curr_resync = 3; /* no longer delayed */ @@ -8038,7 +8060,7 @@ void md_do_sync(struct md_thread *thread) j > mddev->recovery_cp) mddev->recovery_cp = j; update_time = jiffies; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } @@ -8133,9 +8155,9 @@ void md_do_sync(struct md_thread *thread) } } } - printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc, - test_bit(MD_RECOVERY_INTR, &mddev->recovery) - ? "interrupted" : "done"); + pr_info("md: %s: %s %s.\n",mdname(mddev), desc, + test_bit(MD_RECOVERY_INTR, &mddev->recovery) + ? "interrupted" : "done"); /* * this also signals 'finished resyncing' to md_stop */ @@ -8155,9 +8177,8 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { if (mddev->curr_resync >= mddev->recovery_cp) { - printk(KERN_INFO - "md: checkpointing %s of %s.\n", - desc, mdname(mddev)); + pr_debug("md: checkpointing %s of %s.\n", + desc, mdname(mddev)); if (test_bit(MD_RECOVERY_ERROR, &mddev->recovery)) mddev->recovery_cp = @@ -8187,8 +8208,8 @@ void md_do_sync(struct md_thread *thread) /* set CHANGE_PENDING here since maybe another update is needed, * so other nodes are informed. It should be harmless for normal * raid */ - set_mask_bits(&mddev->flags, 0, - BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS)); + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { @@ -8288,12 +8309,12 @@ static int remove_and_add_spares(struct mddev *mddev, if (!test_bit(Journal, &rdev->flags)) spares++; md_new_event(mddev); - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); } } no_add: if (removed) - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); return spares; } @@ -8305,8 +8326,8 @@ static void md_start_sync(struct work_struct *ws) mddev, "resync"); if (!mddev->sync_thread) { - printk(KERN_ERR "%s: could not start resync thread...\n", - mdname(mddev)); + pr_warn("%s: could not start resync thread...\n", + mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); @@ -8356,8 +8377,8 @@ void md_check_recovery(struct mddev *mddev) if (signal_pending(current)) { if (mddev->pers->sync_request && !mddev->external) { - printk(KERN_INFO "md: %s in immediate safe mode\n", - mdname(mddev)); + pr_debug("md: %s in immediate safe mode\n", + mdname(mddev)); mddev->safemode = 2; } flush_signals(current); @@ -8366,7 +8387,7 @@ void md_check_recovery(struct mddev *mddev) if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return; if ( ! ( - (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) || + (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || test_bit(MD_RELOAD_SB, &mddev->flags) || @@ -8404,7 +8425,7 @@ void md_check_recovery(struct mddev *mddev) md_reap_sync_thread(mddev); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - clear_bit(MD_CHANGE_PENDING, &mddev->flags); + clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); goto unlock; } @@ -8432,7 +8453,7 @@ void md_check_recovery(struct mddev *mddev) mddev->recovery_cp == MaxSector) { mddev->in_sync = 1; did_change = 1; - set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); } if (mddev->safemode == 1) mddev->safemode = 0; @@ -8441,7 +8462,7 @@ void md_check_recovery(struct mddev *mddev) sysfs_notify_dirent_safe(mddev->sysfs_state); } - if (mddev->flags & MD_UPDATE_SB_FLAGS) + if (mddev->sb_flags) md_update_sb(mddev, 0); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && @@ -8537,7 +8558,7 @@ void md_reap_sync_thread(struct mddev *mddev) if (mddev->pers->spare_active(mddev)) { sysfs_notify(&mddev->kobj, NULL, "degraded"); - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); } } if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && @@ -8552,7 +8573,7 @@ void md_reap_sync_thread(struct mddev *mddev) rdev->saved_raid_disk = -1; md_update_sb(mddev, 1); - /* MD_CHANGE_PENDING should be cleared by md_update_sb, so we can + /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by * clustered raid */ if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags)) @@ -8614,9 +8635,12 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, rv = badblocks_set(&rdev->badblocks, s, sectors, 0); if (rv == 0) { /* Make sure they get written out promptly */ + if (test_bit(ExternalBbl, &rdev->flags)) + sysfs_notify(&rdev->kobj, NULL, + "unacknowledged_bad_blocks"); sysfs_notify_dirent_safe(rdev->sysfs_state); - set_mask_bits(&mddev->flags, 0, - BIT(MD_CHANGE_CLEAN) | BIT(MD_CHANGE_PENDING)); + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING)); md_wakeup_thread(rdev->mddev->thread); return 1; } else @@ -8627,12 +8651,15 @@ EXPORT_SYMBOL_GPL(rdev_set_badblocks); int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new) { + int rv; if (is_new) s += rdev->new_data_offset; else s += rdev->data_offset; - return badblocks_clear(&rdev->badblocks, - s, sectors); + rv = badblocks_clear(&rdev->badblocks, s, sectors); + if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags)) + sysfs_notify(&rdev->kobj, NULL, "bad_blocks"); + return rv; } EXPORT_SYMBOL_GPL(rdev_clear_badblocks); @@ -8749,7 +8776,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) rdev2->saved_raid_disk = role; ret = remove_and_add_spares(mddev, rdev2); pr_info("Activated spare: %s\n", - bdevname(rdev2->bdev,b)); + bdevname(rdev2->bdev,b)); /* wakeup mddev->thread here, so array could * perform resync with the new activated disk */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -8785,15 +8812,18 @@ static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) * variable in case we err in the future */ rdev->sb_page = NULL; - alloc_disk_sb(rdev); - ClearPageUptodate(rdev->sb_page); - rdev->sb_loaded = 0; - err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version); - + err = alloc_disk_sb(rdev); + if (err == 0) { + ClearPageUptodate(rdev->sb_page); + rdev->sb_loaded = 0; + err = super_types[mddev->major_version]. + load_super(rdev, NULL, mddev->minor_version); + } if (err < 0) { pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", __func__, __LINE__, rdev->desc_nr, err); - put_page(rdev->sb_page); + if (rdev->sb_page) + put_page(rdev->sb_page); rdev->sb_page = swapout; rdev->sb_loaded = 1; return err; @@ -8871,9 +8901,6 @@ void md_autodetect_dev(dev_t dev) mutex_lock(&detected_devices_mutex); list_add_tail(&node_detected_dev->list, &all_detected_devices); mutex_unlock(&detected_devices_mutex); - } else { - printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed" - ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev)); } } @@ -8887,7 +8914,7 @@ static void autostart_arrays(int part) i_scanned = 0; i_passed = 0; - printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); + pr_info("md: Autodetecting RAID arrays.\n"); mutex_lock(&detected_devices_mutex); while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { @@ -8912,8 +8939,7 @@ static void autostart_arrays(int part) } mutex_unlock(&detected_devices_mutex); - printk(KERN_INFO "md: Scanned %d and added %d devices.\n", - i_scanned, i_passed); + pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed); autorun_devices(part); } @@ -8948,7 +8974,14 @@ static __exit void md_exit(void) for_each_mddev(mddev, tmp) { export_array(mddev); + mddev->ctime = 0; mddev->hold_active = 0; + /* + * for_each_mddev() will call mddev_put() at the end of each + * iteration. As the mddev is now fully clear, this will + * schedule the mddev for destruction by a workqueue, and the + * destroy_workqueue() below will wait for that to complete. + */ } destroy_workqueue(md_misc_wq); destroy_workqueue(md_wq); diff --git a/drivers/md/md.h b/drivers/md/md.h index 2b2041773e79..b8859cbf84b6 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -30,6 +30,16 @@ #define MaxSector (~(sector_t)0) /* + * These flags should really be called "NO_RETRY" rather than + * "FAILFAST" because they don't make any promise about time lapse, + * only about the number of retries, which will be zero. + * REQ_FAILFAST_DRIVER is not included because + * Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.") + * seems to suggest that the errors it avoids retrying should usually + * be retried. + */ +#define MD_FAILFAST (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT) +/* * MD's 'extended' device */ struct md_rdev { @@ -168,6 +178,19 @@ enum flag_bits { * so it is safe to remove without * another synchronize_rcu() call. */ + ExternalBbl, /* External metadata provides bad + * block management for a disk + */ + FailFast, /* Minimal retries should be attempted on + * this device, so use REQ_FAILFAST_DEV. + * Also don't try to repair failed reads. + * It is expects that no bad block log + * is present. + */ + LastDev, /* Seems to be the last working dev as + * it didn't fail, so don't use FailFast + * any more for metadata + */ }; static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, @@ -189,6 +212,32 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); struct md_cluster_info; +/* change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added */ +enum mddev_flags { + MD_ARRAY_FIRST_USE, /* First use of array, needs initialization */ + MD_CLOSING, /* If set, we are closing the array, do not open + * it then */ + MD_JOURNAL_CLEAN, /* A raid with journal is already clean */ + MD_HAS_JOURNAL, /* The raid array has journal feature set */ + MD_RELOAD_SB, /* Reload the superblock because another node + * updated it. + */ + MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node + * already took resync lock, need to + * release the lock */ + MD_FAILFAST_SUPPORTED, /* Using MD_FAILFAST on metadata writes is + * supported as calls to md_error() will + * never cause the array to become failed. + */ +}; + +enum mddev_sb_flags { + MD_SB_CHANGE_DEVS, /* Some device status has changed */ + MD_SB_CHANGE_CLEAN, /* transition to or from 'clean' */ + MD_SB_CHANGE_PENDING, /* switch from 'clean' to 'active' in progress */ + MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ +}; + struct mddev { void *private; struct md_personality *pers; @@ -196,21 +245,7 @@ struct mddev { int md_minor; struct list_head disks; unsigned long flags; -#define MD_CHANGE_DEVS 0 /* Some device status has changed */ -#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ -#define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */ -#define MD_UPDATE_SB_FLAGS (1 | 2 | 4) /* If these are set, md_update_sb needed */ -#define MD_ARRAY_FIRST_USE 3 /* First use of array, needs initialization */ -#define MD_CLOSING 4 /* If set, we are closing the array, do not open - * it then */ -#define MD_JOURNAL_CLEAN 5 /* A raid with journal is already clean */ -#define MD_HAS_JOURNAL 6 /* The raid array has journal feature set */ -#define MD_RELOAD_SB 7 /* Reload the superblock because another node - * updated it. - */ -#define MD_CLUSTER_RESYNC_LOCKED 8 /* cluster raid only, which means node - * already took resync lock, need to - * release the lock */ + unsigned long sb_flags; int suspended; atomic_t active_io; @@ -304,31 +339,6 @@ struct mddev { int parallel_resync; int ok_start_degraded; - /* recovery/resync flags - * NEEDED: we might need to start a resync/recover - * RUNNING: a thread is running, or about to be started - * SYNC: actually doing a resync, not a recovery - * RECOVER: doing recovery, or need to try it. - * INTR: resync needs to be aborted for some reason - * DONE: thread is done and is waiting to be reaped - * REQUEST: user-space has requested a sync (used with SYNC) - * CHECK: user-space request for check-only, no repair - * RESHAPE: A reshape is happening - * ERROR: sync-action interrupted because io-error - * - * If neither SYNC or RESHAPE are set, then it is a recovery. - */ -#define MD_RECOVERY_RUNNING 0 -#define MD_RECOVERY_SYNC 1 -#define MD_RECOVERY_RECOVER 2 -#define MD_RECOVERY_INTR 3 -#define MD_RECOVERY_DONE 4 -#define MD_RECOVERY_NEEDED 5 -#define MD_RECOVERY_REQUESTED 6 -#define MD_RECOVERY_CHECK 7 -#define MD_RECOVERY_RESHAPE 8 -#define MD_RECOVERY_FROZEN 9 -#define MD_RECOVERY_ERROR 10 unsigned long recovery; /* If a RAID personality determines that recovery (of a particular @@ -442,6 +452,23 @@ struct mddev { unsigned int good_device_nr; /* good device num within cluster raid */ }; +enum recovery_flags { + /* + * If neither SYNC or RESHAPE are set, then it is a recovery. + */ + MD_RECOVERY_RUNNING, /* a thread is running, or about to be started */ + MD_RECOVERY_SYNC, /* actually doing a resync, not a recovery */ + MD_RECOVERY_RECOVER, /* doing recovery, or need to try it. */ + MD_RECOVERY_INTR, /* resync needs to be aborted for some reason */ + MD_RECOVERY_DONE, /* thread is done and is waiting to be reaped */ + MD_RECOVERY_NEEDED, /* we might need to start a resync/recover */ + MD_RECOVERY_REQUESTED, /* user-space has requested a sync (used with SYNC) */ + MD_RECOVERY_CHECK, /* user-space request for check-only, no repair */ + MD_RECOVERY_RESHAPE, /* A reshape is happening */ + MD_RECOVERY_FROZEN, /* User request to abort, and not restart, any action */ + MD_RECOVERY_ERROR, /* sync-action interrupted because io-error */ +}; + static inline int __must_check mddev_lock(struct mddev *mddev) { return mutex_lock_interruptible(&mddev->reconfig_mutex); @@ -623,7 +650,7 @@ extern int mddev_congested(struct mddev *mddev, int bits); extern void md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, sector_t sector, int size, struct page *page); -extern void md_super_wait(struct mddev *mddev); +extern int md_super_wait(struct mddev *mddev); extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int op, int op_flags, bool metadata_op); @@ -646,8 +673,6 @@ extern void md_rdev_clear(struct md_rdev *rdev); extern void mddev_suspend(struct mddev *mddev); extern void mddev_resume(struct mddev *mddev); -extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, - struct mddev *mddev); extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev); @@ -676,4 +701,18 @@ static inline int mddev_is_clustered(struct mddev *mddev) { return mddev->cluster_info && mddev->bitmap_info.nodes > 1; } + +/* clear unsupported mddev_flags */ +static inline void mddev_clear_unsupported_flags(struct mddev *mddev, + unsigned long unsupported_flags) +{ + mddev->flags &= ~unsupported_flags; +} + +static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio) +{ + if (bio_op(bio) == REQ_OP_WRITE_SAME && + !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors) + mddev->queue->limits.max_write_same_sectors = 0; +} #endif /* _MD_MD_H */ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 673efbd6fc47..79a12b59250b 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -52,7 +52,7 @@ static int multipath_map (struct mpconf *conf) } rcu_read_unlock(); - printk(KERN_ERR "multipath_map(): no more operational IO paths?\n"); + pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n"); return (-1); } @@ -97,9 +97,9 @@ static void multipath_end_request(struct bio *bio) */ char b[BDEVNAME_SIZE]; md_error (mp_bh->mddev, rdev); - printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", - bdevname(rdev->bdev,b), - (unsigned long long)bio->bi_iter.bi_sector); + pr_info("multipath: %s: rescheduling sector %llu\n", + bdevname(rdev->bdev,b), + (unsigned long long)bio->bi_iter.bi_sector); multipath_reschedule_retry(mp_bh); } else multipath_end_bh_io(mp_bh, bio->bi_error); @@ -130,7 +130,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) } multipath = conf->multipaths + mp_bh->path; - bio_init(&mp_bh->bio); + bio_init(&mp_bh->bio, NULL, 0); __bio_clone_fast(&mp_bh->bio, bio); mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset; @@ -138,6 +138,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT; mp_bh->bio.bi_end_io = multipath_end_request; mp_bh->bio.bi_private = mp_bh; + mddev_check_writesame(mddev, &mp_bh->bio); generic_make_request(&mp_bh->bio); return; } @@ -169,7 +170,7 @@ static int multipath_congested(struct mddev *mddev, int bits) if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= bdi_congested(q->backing_dev_info, bits); /* Just like multipath_map, we just check the * first available device */ @@ -194,8 +195,7 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev) * first check if this is a queued request for a device * which has just failed. */ - printk(KERN_ALERT - "multipath: only one IO path left and IO error.\n"); + pr_warn("multipath: only one IO path left and IO error.\n"); /* leave it active... it's all we have */ return; } @@ -209,11 +209,9 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev) spin_unlock_irqrestore(&conf->device_lock, flags); } set_bit(Faulty, &rdev->flags); - set_bit(MD_CHANGE_DEVS, &mddev->flags); - printk(KERN_ALERT "multipath: IO failure on %s," - " disabling IO path.\n" - "multipath: Operation continuing" - " on %d IO paths.\n", + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + pr_err("multipath: IO failure on %s, disabling IO path.\n" + "multipath: Operation continuing on %d IO paths.\n", bdevname(rdev->bdev, b), conf->raid_disks - mddev->degraded); } @@ -223,21 +221,21 @@ static void print_multipath_conf (struct mpconf *conf) int i; struct multipath_info *tmp; - printk("MULTIPATH conf printout:\n"); + pr_debug("MULTIPATH conf printout:\n"); if (!conf) { - printk("(conf==NULL)\n"); + pr_debug("(conf==NULL)\n"); return; } - printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, - conf->raid_disks); + pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, + conf->raid_disks); for (i = 0; i < conf->raid_disks; i++) { char b[BDEVNAME_SIZE]; tmp = conf->multipaths + i; if (tmp->rdev) - printk(" disk%d, o:%d, dev:%s\n", - i,!test_bit(Faulty, &tmp->rdev->flags), - bdevname(tmp->rdev->bdev,b)); + pr_debug(" disk%d, o:%d, dev:%s\n", + i,!test_bit(Faulty, &tmp->rdev->flags), + bdevname(tmp->rdev->bdev,b)); } } @@ -292,8 +290,7 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev == p->rdev) { if (test_bit(In_sync, &rdev->flags) || atomic_read(&rdev->nr_pending)) { - printk(KERN_ERR "hot-remove-disk, slot %d is identified" - " but is still operational!\n", number); + pr_warn("hot-remove-disk, slot %d is identified but is still operational!\n", number); err = -EBUSY; goto abort; } @@ -346,16 +343,14 @@ static void multipathd(struct md_thread *thread) bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector; if ((mp_bh->path = multipath_map (conf))<0) { - printk(KERN_ALERT "multipath: %s: unrecoverable IO read" - " error for block %llu\n", - bdevname(bio->bi_bdev,b), - (unsigned long long)bio->bi_iter.bi_sector); + pr_err("multipath: %s: unrecoverable IO read error for block %llu\n", + bdevname(bio->bi_bdev,b), + (unsigned long long)bio->bi_iter.bi_sector); multipath_end_bh_io(mp_bh, -EIO); } else { - printk(KERN_ERR "multipath: %s: redirecting sector %llu" - " to another IO path\n", - bdevname(bio->bi_bdev,b), - (unsigned long long)bio->bi_iter.bi_sector); + pr_err("multipath: %s: redirecting sector %llu to another IO path\n", + bdevname(bio->bi_bdev,b), + (unsigned long long)bio->bi_iter.bi_sector); *bio = *(mp_bh->master_bio); bio->bi_iter.bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset; @@ -389,8 +384,8 @@ static int multipath_run (struct mddev *mddev) return -EINVAL; if (mddev->level != LEVEL_MULTIPATH) { - printk("multipath: %s: raid level not set to multipath IO (%d)\n", - mdname(mddev), mddev->level); + pr_warn("multipath: %s: raid level not set to multipath IO (%d)\n", + mdname(mddev), mddev->level); goto out; } /* @@ -401,21 +396,13 @@ static int multipath_run (struct mddev *mddev) conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); mddev->private = conf; - if (!conf) { - printk(KERN_ERR - "multipath: couldn't allocate memory for %s\n", - mdname(mddev)); + if (!conf) goto out; - } conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks, GFP_KERNEL); - if (!conf->multipaths) { - printk(KERN_ERR - "multipath: couldn't allocate memory for %s\n", - mdname(mddev)); + if (!conf->multipaths) goto out_free_conf; - } working_disks = 0; rdev_for_each(rdev, mddev) { @@ -439,7 +426,7 @@ static int multipath_run (struct mddev *mddev) INIT_LIST_HEAD(&conf->retry_list); if (!working_disks) { - printk(KERN_ERR "multipath: no operational IO paths for %s\n", + pr_warn("multipath: no operational IO paths for %s\n", mdname(mddev)); goto out_free_conf; } @@ -447,27 +434,17 @@ static int multipath_run (struct mddev *mddev) conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, sizeof(struct multipath_bh)); - if (conf->pool == NULL) { - printk(KERN_ERR - "multipath: couldn't allocate memory for %s\n", - mdname(mddev)); + if (conf->pool == NULL) goto out_free_conf; - } - { - mddev->thread = md_register_thread(multipathd, mddev, - "multipath"); - if (!mddev->thread) { - printk(KERN_ERR "multipath: couldn't allocate thread" - " for %s\n", mdname(mddev)); - goto out_free_conf; - } - } + mddev->thread = md_register_thread(multipathd, mddev, + "multipath"); + if (!mddev->thread) + goto out_free_conf; - printk(KERN_INFO - "multipath: array %s active with %d out of %d IO paths\n", + pr_info("multipath: array %s active with %d out of %d IO paths\n", mdname(mddev), conf->raid_disks - mddev->degraded, - mddev->raid_disks); + mddev->raid_disks); /* * Ok, everything is just fine now */ diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index e83047cbb2da..185dc60360b5 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -700,13 +700,11 @@ static int populate_ablock_with_values(struct dm_array_info *info, struct array_ { int r; unsigned i; - uint32_t nr_entries; struct dm_btree_value_type *vt = &info->value_type; BUG_ON(le32_to_cpu(ab->nr_entries)); BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); - nr_entries = le32_to_cpu(ab->nr_entries); for (i = 0; i < new_nr; i++) { r = fn(base + i, element_at(info, ab, i), context); if (r) @@ -978,6 +976,27 @@ int dm_array_cursor_next(struct dm_array_cursor *c) } EXPORT_SYMBOL_GPL(dm_array_cursor_next); +int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count) +{ + int r; + + do { + uint32_t remaining = le32_to_cpu(c->ab->nr_entries) - c->index; + + if (count < remaining) { + c->index += count; + return 0; + } + + count -= remaining; + r = dm_array_cursor_next(c); + + } while (!r); + + return r; +} +EXPORT_SYMBOL_GPL(dm_array_cursor_skip); + void dm_array_cursor_get_value(struct dm_array_cursor *c, void **value_le) { *value_le = element_at(c->info, c->ab, c->index); diff --git a/drivers/md/persistent-data/dm-array.h b/drivers/md/persistent-data/dm-array.h index 27ee49a55473..d7d2d579c662 100644 --- a/drivers/md/persistent-data/dm-array.h +++ b/drivers/md/persistent-data/dm-array.h @@ -207,6 +207,7 @@ void dm_array_cursor_end(struct dm_array_cursor *c); uint32_t dm_array_cursor_index(struct dm_array_cursor *c); int dm_array_cursor_next(struct dm_array_cursor *c); +int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count); /* * value_le is only valid while the cursor points at the current value. diff --git a/drivers/md/persistent-data/dm-bitset.c b/drivers/md/persistent-data/dm-bitset.c index 36f7cc2c7109..b7208d82e748 100644 --- a/drivers/md/persistent-data/dm-bitset.c +++ b/drivers/md/persistent-data/dm-bitset.c @@ -39,6 +39,48 @@ int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *root) } EXPORT_SYMBOL_GPL(dm_bitset_empty); +struct packer_context { + bit_value_fn fn; + unsigned nr_bits; + void *context; +}; + +static int pack_bits(uint32_t index, void *value, void *context) +{ + int r; + struct packer_context *p = context; + unsigned bit, nr = min(64u, p->nr_bits - (index * 64)); + uint64_t word = 0; + bool bv; + + for (bit = 0; bit < nr; bit++) { + r = p->fn(index * 64 + bit, &bv, p->context); + if (r) + return r; + + if (bv) + set_bit(bit, (unsigned long *) &word); + else + clear_bit(bit, (unsigned long *) &word); + } + + *((__le64 *) value) = cpu_to_le64(word); + + return 0; +} + +int dm_bitset_new(struct dm_disk_bitset *info, dm_block_t *root, + uint32_t size, bit_value_fn fn, void *context) +{ + struct packer_context p; + p.fn = fn; + p.nr_bits = size; + p.context = context; + + return dm_array_new(&info->array_info, root, dm_div_up(size, 64), pack_bits, &p); +} +EXPORT_SYMBOL_GPL(dm_bitset_new); + int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t root, uint32_t old_nr_entries, uint32_t new_nr_entries, bool default_value, dm_block_t *new_root) @@ -168,4 +210,108 @@ int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root, } EXPORT_SYMBOL_GPL(dm_bitset_test_bit); +static int cursor_next_array_entry(struct dm_bitset_cursor *c) +{ + int r; + __le64 *value; + + r = dm_array_cursor_next(&c->cursor); + if (r) + return r; + + dm_array_cursor_get_value(&c->cursor, (void **) &value); + c->array_index++; + c->bit_index = 0; + c->current_bits = le64_to_cpu(*value); + return 0; +} + +int dm_bitset_cursor_begin(struct dm_disk_bitset *info, + dm_block_t root, uint32_t nr_entries, + struct dm_bitset_cursor *c) +{ + int r; + __le64 *value; + + if (!nr_entries) + return -ENODATA; + + c->info = info; + c->entries_remaining = nr_entries; + + r = dm_array_cursor_begin(&info->array_info, root, &c->cursor); + if (r) + return r; + + dm_array_cursor_get_value(&c->cursor, (void **) &value); + c->array_index = 0; + c->bit_index = 0; + c->current_bits = le64_to_cpu(*value); + + return r; +} +EXPORT_SYMBOL_GPL(dm_bitset_cursor_begin); + +void dm_bitset_cursor_end(struct dm_bitset_cursor *c) +{ + return dm_array_cursor_end(&c->cursor); +} +EXPORT_SYMBOL_GPL(dm_bitset_cursor_end); + +int dm_bitset_cursor_next(struct dm_bitset_cursor *c) +{ + int r = 0; + + if (!c->entries_remaining) + return -ENODATA; + + c->entries_remaining--; + if (++c->bit_index > 63) + r = cursor_next_array_entry(c); + + return r; +} +EXPORT_SYMBOL_GPL(dm_bitset_cursor_next); + +int dm_bitset_cursor_skip(struct dm_bitset_cursor *c, uint32_t count) +{ + int r; + __le64 *value; + uint32_t nr_array_skip; + uint32_t remaining_in_word = 64 - c->bit_index; + + if (c->entries_remaining < count) + return -ENODATA; + + if (count < remaining_in_word) { + c->bit_index += count; + c->entries_remaining -= count; + return 0; + + } else { + c->entries_remaining -= remaining_in_word; + count -= remaining_in_word; + } + + nr_array_skip = (count / 64) + 1; + r = dm_array_cursor_skip(&c->cursor, nr_array_skip); + if (r) + return r; + + dm_array_cursor_get_value(&c->cursor, (void **) &value); + c->entries_remaining -= count; + c->array_index += nr_array_skip; + c->bit_index = count & 63; + c->current_bits = le64_to_cpu(*value); + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_cursor_skip); + +bool dm_bitset_cursor_get_value(struct dm_bitset_cursor *c) +{ + return test_bit(c->bit_index, (unsigned long *) &c->current_bits); +} +EXPORT_SYMBOL_GPL(dm_bitset_cursor_get_value); + /*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-bitset.h b/drivers/md/persistent-data/dm-bitset.h index c2287d672ef5..df888da04ee1 100644 --- a/drivers/md/persistent-data/dm-bitset.h +++ b/drivers/md/persistent-data/dm-bitset.h @@ -93,6 +93,22 @@ void dm_disk_bitset_init(struct dm_transaction_manager *tm, int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *new_root); /* + * Creates a new bitset populated with values provided by a callback + * function. This is more efficient than creating an empty bitset, + * resizing, and then setting values since that process incurs a lot of + * copying. + * + * info - describes the array + * root - the root block of the array on disk + * size - the number of entries in the array + * fn - the callback + * context - passed to the callback + */ +typedef int (*bit_value_fn)(uint32_t index, bool *value, void *context); +int dm_bitset_new(struct dm_disk_bitset *info, dm_block_t *root, + uint32_t size, bit_value_fn fn, void *context); + +/* * Resize the bitset. * * info - describes the bitset @@ -161,6 +177,29 @@ int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root, int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root, dm_block_t *new_root); +struct dm_bitset_cursor { + struct dm_disk_bitset *info; + struct dm_array_cursor cursor; + + uint32_t entries_remaining; + uint32_t array_index; + uint32_t bit_index; + uint64_t current_bits; +}; + +/* + * Make sure you've flush any dm_disk_bitset and updated the root before + * using this. + */ +int dm_bitset_cursor_begin(struct dm_disk_bitset *info, + dm_block_t root, uint32_t nr_entries, + struct dm_bitset_cursor *c); +void dm_bitset_cursor_end(struct dm_bitset_cursor *c); + +int dm_bitset_cursor_next(struct dm_bitset_cursor *c); +int dm_bitset_cursor_skip(struct dm_bitset_cursor *c, uint32_t count); +bool dm_bitset_cursor_get_value(struct dm_bitset_cursor *c); + /*----------------------------------------------------------------*/ #endif /* _LINUX_DM_BITSET_H */ diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 1e33dd51c21f..8589e0a14068 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -13,11 +13,14 @@ #include <linux/rwsem.h> #include <linux/device-mapper.h> #include <linux/stacktrace.h> +#include <linux/sched/task.h> #define DM_MSG_PREFIX "block manager" /*----------------------------------------------------------------*/ +#ifdef CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING + /* * This is a read/write semaphore with a couple of differences. * @@ -118,7 +121,7 @@ static int __check_holder(struct block_lock *lock) static void __wait(struct waiter *w) { for (;;) { - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); if (!w->task) break; @@ -126,7 +129,7 @@ static void __wait(struct waiter *w) schedule(); } - set_task_state(current, TASK_RUNNING); + set_current_state(TASK_RUNNING); } static void __wake_waiter(struct waiter *w) @@ -302,6 +305,18 @@ static void report_recursive_bug(dm_block_t b, int r) (unsigned long long) b); } +#else /* !CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING */ + +#define bl_init(x) do { } while (0) +#define bl_down_read(x) 0 +#define bl_down_read_nonblock(x) 0 +#define bl_up_read(x) do { } while (0) +#define bl_down_write(x) 0 +#define bl_up_write(x) do { } while (0) +#define report_recursive_bug(x, y) do { } while (0) + +#endif /* CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING */ + /*----------------------------------------------------------------*/ /* @@ -330,8 +345,11 @@ EXPORT_SYMBOL_GPL(dm_block_data); struct buffer_aux { struct dm_block_validator *validator; - struct block_lock lock; int write_locked; + +#ifdef CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING + struct block_lock lock; +#endif }; static void dm_block_manager_alloc_callback(struct dm_buffer *buf) @@ -445,7 +463,7 @@ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, int r; p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result); - if (IS_ERR(p)) + if (unlikely(IS_ERR(p))) return PTR_ERR(p); aux = dm_bufio_get_aux_data(to_buffer(*result)); @@ -481,7 +499,7 @@ int dm_bm_write_lock(struct dm_block_manager *bm, return -EPERM; p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result); - if (IS_ERR(p)) + if (unlikely(IS_ERR(p))) return PTR_ERR(p); aux = dm_bufio_get_aux_data(to_buffer(*result)); @@ -514,7 +532,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm, int r; p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result); - if (IS_ERR(p)) + if (unlikely(IS_ERR(p))) return PTR_ERR(p); if (unlikely(!p)) return -EWOULDBLOCK; @@ -550,7 +568,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, return -EPERM; p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result); - if (IS_ERR(p)) + if (unlikely(IS_ERR(p))) return PTR_ERR(p); memset(p, 0, dm_bm_block_size(bm)); diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 20a40329d84a..02e2ee0d8a00 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -272,7 +272,12 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root) int r; struct del_stack *s; - s = kmalloc(sizeof(*s), GFP_NOIO); + /* + * dm_btree_del() is called via an ioctl, as such should be + * considered an FS op. We can't recurse back into the FS, so we + * allocate GFP_NOFS. + */ + s = kmalloc(sizeof(*s), GFP_NOFS); if (!s) return -ENOMEM; s->info = info; @@ -1139,6 +1144,17 @@ int dm_btree_cursor_next(struct dm_btree_cursor *c) } EXPORT_SYMBOL_GPL(dm_btree_cursor_next); +int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count) +{ + int r = 0; + + while (count-- && !r) + r = dm_btree_cursor_next(c); + + return r; +} +EXPORT_SYMBOL_GPL(dm_btree_cursor_skip); + int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le) { if (c->depth) { diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h index db9bd26adf31..3dc5bb1a4748 100644 --- a/drivers/md/persistent-data/dm-btree.h +++ b/drivers/md/persistent-data/dm-btree.h @@ -209,6 +209,7 @@ int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root, bool prefetch_leaves, struct dm_btree_cursor *c); void dm_btree_cursor_end(struct dm_btree_cursor *c); int dm_btree_cursor_next(struct dm_btree_cursor *c); +int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count); int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le); #endif /* _LINUX_DM_BTREE_H */ diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index 306d2e4502c4..829b4ce057d8 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -464,7 +464,8 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, ll->nr_allocated--; le32_add_cpu(&ie_disk.nr_free, 1); ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit)); - } + } else + *ev = SM_NONE; return ll->save_ie(ll, index, &ie_disk); } @@ -547,7 +548,6 @@ static int metadata_ll_init_index(struct ll_disk *ll) if (r < 0) return r; - memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le)); ll->bitmap_root = dm_block_location(b); dm_tm_unlock(ll->tm, b); @@ -626,13 +626,19 @@ int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm, void *root_le, size_t len) { int r; - struct disk_sm_root *smr = root_le; + struct disk_sm_root smr; if (len < sizeof(struct disk_sm_root)) { DMERR("sm_metadata root too small"); return -ENOMEM; } + /* + * We don't know the alignment of the root_le buffer, so need to + * copy into a new structure. + */ + memcpy(&smr, root_le, sizeof(smr)); + r = sm_ll_init(ll, tm); if (r < 0) return r; @@ -644,10 +650,10 @@ int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm, ll->max_entries = metadata_ll_max_entries; ll->commit = metadata_ll_commit; - ll->nr_blocks = le64_to_cpu(smr->nr_blocks); - ll->nr_allocated = le64_to_cpu(smr->nr_allocated); - ll->bitmap_root = le64_to_cpu(smr->bitmap_root); - ll->ref_count_root = le64_to_cpu(smr->ref_count_root); + ll->nr_blocks = le64_to_cpu(smr.nr_blocks); + ll->nr_allocated = le64_to_cpu(smr.nr_allocated); + ll->bitmap_root = le64_to_cpu(smr.bitmap_root); + ll->ref_count_root = le64_to_cpu(smr.ref_count_root); return ll->open_index(ll); } diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 7e44005595c1..4aed69d9dd17 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -544,7 +544,7 @@ static int sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks); -static struct dm_space_map ops = { +static const struct dm_space_map ops = { .destroy = sm_metadata_destroy, .extend = sm_metadata_extend, .get_nr_blocks = sm_metadata_get_nr_blocks, @@ -671,7 +671,7 @@ static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where, return -EINVAL; } -static struct dm_space_map bootstrap_ops = { +static const struct dm_space_map bootstrap_ops = { .destroy = sm_bootstrap_destroy, .extend = sm_bootstrap_extend, .get_nr_blocks = sm_bootstrap_get_nr_blocks, @@ -775,17 +775,15 @@ int dm_sm_metadata_create(struct dm_space_map *sm, memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); r = sm_ll_new_metadata(&smm->ll, tm); + if (!r) { + if (nr_blocks > DM_SM_METADATA_MAX_BLOCKS) + nr_blocks = DM_SM_METADATA_MAX_BLOCKS; + r = sm_ll_extend(&smm->ll, nr_blocks); + } + memcpy(&smm->sm, &ops, sizeof(smm->sm)); if (r) return r; - if (nr_blocks > DM_SM_METADATA_MAX_BLOCKS) - nr_blocks = DM_SM_METADATA_MAX_BLOCKS; - r = sm_ll_extend(&smm->ll, nr_blocks); - if (r) - return r; - - memcpy(&smm->sm, &ops, sizeof(smm->sm)); - /* * Now we need to update the newly created data structures with the * allocated blocks that they were built from. diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 258986a2699d..93347ca7c7a6 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -21,10 +21,16 @@ #include <linux/seq_file.h> #include <linux/module.h> #include <linux/slab.h> +#include <trace/events/block.h> #include "md.h" #include "raid0.h" #include "raid5.h" +#define UNSUPPORTED_MDDEV_FLAGS \ + ((1L << MD_HAS_JOURNAL) | \ + (1L << MD_JOURNAL_CLEAN) | \ + (1L << MD_FAILFAST_SUPPORTED)) + static int raid0_congested(struct mddev *mddev, int bits) { struct r0conf *conf = mddev->private; @@ -35,7 +41,7 @@ static int raid0_congested(struct mddev *mddev, int bits) for (i = 0; i < raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(devlist[i]->bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= bdi_congested(q->backing_dev_info, bits); } return ret; } @@ -51,20 +57,21 @@ static void dump_zones(struct mddev *mddev) char b[BDEVNAME_SIZE]; struct r0conf *conf = mddev->private; int raid_disks = conf->strip_zone[0].nb_dev; - printk(KERN_INFO "md: RAID0 configuration for %s - %d zone%s\n", - mdname(mddev), - conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s"); + pr_debug("md: RAID0 configuration for %s - %d zone%s\n", + mdname(mddev), + conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s"); for (j = 0; j < conf->nr_strip_zones; j++) { - printk(KERN_INFO "md: zone%d=[", j); + char line[200]; + int len = 0; + for (k = 0; k < conf->strip_zone[j].nb_dev; k++) - printk(KERN_CONT "%s%s", k?"/":"", - bdevname(conf->devlist[j*raid_disks - + k]->bdev, b)); - printk(KERN_CONT "]\n"); + len += snprintf(line+len, 200-len, "%s%s", k?"/":"", + bdevname(conf->devlist[j*raid_disks + + k]->bdev, b)); + pr_debug("md: zone%d=[%s]\n", j, line); zone_size = conf->strip_zone[j].zone_end - zone_start; - printk(KERN_INFO " zone-offset=%10lluKB, " - "device-offset=%10lluKB, size=%10lluKB\n", + pr_debug(" zone-offset=%10lluKB, device-offset=%10lluKB, size=%10lluKB\n", (unsigned long long)zone_start>>1, (unsigned long long)conf->strip_zone[j].dev_start>>1, (unsigned long long)zone_size>>1); @@ -142,9 +149,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) * chunk size is a multiple of that sector size */ if ((mddev->chunk_sectors << 9) % blksize) { - printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size %d\n", - mdname(mddev), - mddev->chunk_sectors << 9, blksize); + pr_warn("md/raid0:%s: chunk_size of %d not multiple of block size %d\n", + mdname(mddev), + mddev->chunk_sectors << 9, blksize); err = -EINVAL; goto abort; } @@ -186,19 +193,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } if (j < 0) { - printk(KERN_ERR - "md/raid0:%s: remove inactive devices before converting to RAID0\n", - mdname(mddev)); + pr_warn("md/raid0:%s: remove inactive devices before converting to RAID0\n", + mdname(mddev)); goto abort; } if (j >= mddev->raid_disks) { - printk(KERN_ERR "md/raid0:%s: bad disk number %d - " - "aborting!\n", mdname(mddev), j); + pr_warn("md/raid0:%s: bad disk number %d - aborting!\n", + mdname(mddev), j); goto abort; } if (dev[j]) { - printk(KERN_ERR "md/raid0:%s: multiple devices for %d - " - "aborting!\n", mdname(mddev), j); + pr_warn("md/raid0:%s: multiple devices for %d - aborting!\n", + mdname(mddev), j); goto abort; } dev[j] = rdev1; @@ -208,8 +214,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) cnt++; } if (cnt != mddev->raid_disks) { - printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " - "aborting!\n", mdname(mddev), cnt, mddev->raid_disks); + pr_warn("md/raid0:%s: too few disks (%d of %d) - aborting!\n", + mdname(mddev), cnt, mddev->raid_disks); goto abort; } zone->nb_dev = cnt; @@ -357,8 +363,7 @@ static int raid0_run(struct mddev *mddev) int ret; if (mddev->chunk_sectors == 0) { - printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n", - mdname(mddev)); + pr_warn("md/raid0:%s: chunk size must be set.\n", mdname(mddev)); return -EINVAL; } if (md_check_no_bitmap(mddev)) @@ -399,9 +404,9 @@ static int raid0_run(struct mddev *mddev) /* calculate array device size */ md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); - printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n", - mdname(mddev), - (unsigned long long)mddev->array_sectors); + pr_debug("md/raid0:%s: md_size is %llu sectors.\n", + mdname(mddev), + (unsigned long long)mddev->array_sectors); if (mddev->queue) { /* calculate the max read-ahead size. @@ -415,8 +420,8 @@ static int raid0_run(struct mddev *mddev) */ int stripe = mddev->raid_disks * (mddev->chunk_sectors << 9) / PAGE_SIZE; - if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) - mddev->queue->backing_dev_info.ra_pages = 2* stripe; + if (mddev->queue->backing_dev_info->ra_pages < 2* stripe) + mddev->queue->backing_dev_info->ra_pages = 2* stripe; } dump_zones(mddev); @@ -464,7 +469,8 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) } do { - sector_t sector = bio->bi_iter.bi_sector; + sector_t bio_sector = bio->bi_iter.bi_sector; + sector_t sector = bio_sector; unsigned chunk_sects = mddev->chunk_sectors; unsigned sectors = chunk_sects - @@ -473,7 +479,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) : sector_div(sector, chunk_sects)); /* Restore due to sector_div */ - sector = bio->bi_iter.bi_sector; + sector = bio_sector; if (sectors < bio_sectors(bio)) { split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set); @@ -492,8 +498,14 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) { /* Just ignore it */ bio_endio(split); - } else + } else { + if (mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(split->bi_bdev), + split, disk_devt(mddev->gendisk), + bio_sector); + mddev_check_writesame(mddev, split); generic_make_request(split); + } } while (split != bio); } @@ -509,17 +521,17 @@ static void *raid0_takeover_raid45(struct mddev *mddev) struct r0conf *priv_conf; if (mddev->degraded != 1) { - printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n", - mdname(mddev), - mddev->degraded); + pr_warn("md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n", + mdname(mddev), + mddev->degraded); return ERR_PTR(-EINVAL); } rdev_for_each(rdev, mddev) { /* check slot number for a disk */ if (rdev->raid_disk == mddev->raid_disks-1) { - printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", - mdname(mddev)); + pr_warn("md/raid0:%s: raid5 must have missing parity disk!\n", + mdname(mddev)); return ERR_PTR(-EINVAL); } rdev->sectors = mddev->dev_sectors; @@ -533,8 +545,10 @@ static void *raid0_takeover_raid45(struct mddev *mddev) mddev->delta_disks = -1; /* make sure it will be not marked as dirty */ mddev->recovery_cp = MaxSector; + mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); create_strip_zones(mddev, &priv_conf); + return priv_conf; } @@ -549,19 +563,19 @@ static void *raid0_takeover_raid10(struct mddev *mddev) * - all mirrors must be already degraded */ if (mddev->layout != ((1 << 8) + 2)) { - printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n", - mdname(mddev), - mddev->layout); + pr_warn("md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n", + mdname(mddev), + mddev->layout); return ERR_PTR(-EINVAL); } if (mddev->raid_disks & 1) { - printk(KERN_ERR "md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n", - mdname(mddev)); + pr_warn("md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n", + mdname(mddev)); return ERR_PTR(-EINVAL); } if (mddev->degraded != (mddev->raid_disks>>1)) { - printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n", - mdname(mddev)); + pr_warn("md/raid0:%s: All mirrors must be already degraded!\n", + mdname(mddev)); return ERR_PTR(-EINVAL); } @@ -574,6 +588,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev) mddev->degraded = 0; /* make sure it will be not marked as dirty */ mddev->recovery_cp = MaxSector; + mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); create_strip_zones(mddev, &priv_conf); return priv_conf; @@ -588,7 +603,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev) * - (N - 1) mirror drives must be already faulty */ if ((mddev->raid_disks - 1) != mddev->degraded) { - printk(KERN_ERR "md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n", + pr_err("md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n", mdname(mddev)); return ERR_PTR(-EINVAL); } @@ -616,6 +631,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev) mddev->raid_disks = 1; /* make sure it will be not marked as dirty */ mddev->recovery_cp = MaxSector; + mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); create_strip_zones(mddev, &priv_conf); return priv_conf; @@ -631,8 +647,8 @@ static void *raid0_takeover(struct mddev *mddev) */ if (mddev->bitmap) { - printk(KERN_ERR "md/raid0: %s: cannot takeover array with bitmap\n", - mdname(mddev)); + pr_warn("md/raid0: %s: cannot takeover array with bitmap\n", + mdname(mddev)); return ERR_PTR(-EBUSY); } if (mddev->level == 4) @@ -642,8 +658,8 @@ static void *raid0_takeover(struct mddev *mddev) if (mddev->layout == ALGORITHM_PARITY_N) return raid0_takeover_raid45(mddev); - printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", - mdname(mddev), ALGORITHM_PARITY_N); + pr_warn("md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", + mdname(mddev), ALGORITHM_PARITY_N); } if (mddev->level == 10) @@ -652,7 +668,7 @@ static void *raid0_takeover(struct mddev *mddev) if (mddev->level == 1) return raid0_takeover_raid1(mddev); - printk(KERN_ERR "Takeover from raid%i to raid0 not supported\n", + pr_warn("Takeover from raid%i to raid0 not supported\n", mddev->level); return ERR_PTR(-EINVAL); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 29e2df5cd77b..fbc2d7851b49 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -37,10 +37,18 @@ #include <linux/module.h> #include <linux/seq_file.h> #include <linux/ratelimit.h> +#include <linux/sched/signal.h> + +#include <trace/events/block.h> + #include "md.h" #include "raid1.h" #include "bitmap.h" +#define UNSUPPORTED_MDDEV_FLAGS \ + ((1L << MD_HAS_JOURNAL) | \ + (1L << MD_JOURNAL_CLEAN)) + /* * Number of guaranteed r1bios in case of extreme VM load: */ @@ -66,9 +74,11 @@ */ static int max_queued_requests = 1024; -static void allow_barrier(struct r1conf *conf, sector_t start_next_window, - sector_t bi_sector); -static void lower_barrier(struct r1conf *conf); +static void allow_barrier(struct r1conf *conf, sector_t sector_nr); +static void lower_barrier(struct r1conf *conf, sector_t sector_nr); + +#define raid1_log(md, fmt, args...) \ + do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) { @@ -92,7 +102,6 @@ static void r1bio_pool_free(void *r1_bio, void *data) #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) -#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) { @@ -197,6 +206,7 @@ static void free_r1bio(struct r1bio *r1_bio) static void put_buf(struct r1bio *r1_bio) { struct r1conf *conf = r1_bio->mddev->private; + sector_t sect = r1_bio->sector; int i; for (i = 0; i < conf->raid_disks * 2; i++) { @@ -207,7 +217,7 @@ static void put_buf(struct r1bio *r1_bio) mempool_free(r1_bio, conf->r1buf_pool); - lower_barrier(conf); + lower_barrier(conf, sect); } static void reschedule_retry(struct r1bio *r1_bio) @@ -215,10 +225,12 @@ static void reschedule_retry(struct r1bio *r1_bio) unsigned long flags; struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; + int idx; + idx = sector_to_idx(r1_bio->sector); spin_lock_irqsave(&conf->device_lock, flags); list_add(&r1_bio->retry_list, &conf->retry_list); - conf->nr_queued ++; + atomic_inc(&conf->nr_queued[idx]); spin_unlock_irqrestore(&conf->device_lock, flags); wake_up(&conf->wait_barrier); @@ -235,7 +247,6 @@ static void call_bio_endio(struct r1bio *r1_bio) struct bio *bio = r1_bio->master_bio; int done; struct r1conf *conf = r1_bio->mddev->private; - sector_t start_next_window = r1_bio->start_next_window; sector_t bi_sector = bio->bi_iter.bi_sector; if (bio->bi_phys_segments) { @@ -261,7 +272,7 @@ static void call_bio_endio(struct r1bio *r1_bio) * Wake up any possible resync thread that waits for the device * to go idle. */ - allow_barrier(conf, start_next_window, bi_sector); + allow_barrier(conf, bi_sector); } } @@ -325,6 +336,11 @@ static void raid1_end_read_request(struct bio *bio) if (uptodate) set_bit(R1BIO_Uptodate, &r1_bio->state); + else if (test_bit(FailFast, &rdev->flags) && + test_bit(R1BIO_FailFast, &r1_bio->state)) + /* This was a fail-fast read so we definitely + * want to retry */ + ; else { /* If all other devices have failed, we want to return * the error upwards rather than fail the last device. @@ -347,13 +363,10 @@ static void raid1_end_read_request(struct bio *bio) * oops, read error: */ char b[BDEVNAME_SIZE]; - printk_ratelimited( - KERN_ERR "md/raid1:%s: %s: " - "rescheduling sector %llu\n", - mdname(conf->mddev), - bdevname(rdev->bdev, - b), - (unsigned long long)r1_bio->sector); + pr_err_ratelimited("md/raid1:%s: %s: rescheduling sector %llu\n", + mdname(conf->mddev), + bdevname(rdev->bdev, b), + (unsigned long long)r1_bio->sector); set_bit(R1BIO_ReadError, &r1_bio->state); reschedule_retry(r1_bio); /* don't drop the reference on read_disk yet */ @@ -416,7 +429,24 @@ static void raid1_end_write_request(struct bio *bio) set_bit(MD_RECOVERY_NEEDED, & conf->mddev->recovery); - set_bit(R1BIO_WriteError, &r1_bio->state); + if (test_bit(FailFast, &rdev->flags) && + (bio->bi_opf & MD_FAILFAST) && + /* We never try FailFast to WriteMostly devices */ + !test_bit(WriteMostly, &rdev->flags)) { + md_error(r1_bio->mddev, rdev); + if (!test_bit(Faulty, &rdev->flags)) + /* This is the only remaining device, + * We need to retry the write without + * FailFast + */ + set_bit(R1BIO_WriteError, &r1_bio->state); + else { + /* Finished with this branch */ + r1_bio->bios[mirror] = NULL; + to_put = bio; + } + } else + set_bit(R1BIO_WriteError, &r1_bio->state); } else { /* * Set R1BIO_Uptodate in our master bio, so that we @@ -490,6 +520,25 @@ static void raid1_end_write_request(struct bio *bio) bio_put(to_put); } +static sector_t align_to_barrier_unit_end(sector_t start_sector, + sector_t sectors) +{ + sector_t len; + + WARN_ON(sectors == 0); + /* + * len is the number of sectors from start_sector to end of the + * barrier unit which start_sector belongs to. + */ + len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) - + start_sector; + + if (len > sectors) + len = sectors; + + return len; +} + /* * This routine returns the disk from which the requested read should * be done. There is a per-array 'next expected sequential IO' sector @@ -534,6 +583,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect best_good_sectors = 0; has_nonrot_disk = 0; choose_next_idle = 0; + clear_bit(R1BIO_FailFast, &r1_bio->state); if ((conf->mddev->recovery_cp < this_sector + sectors) || (mddev_is_clustered(conf->mddev) && @@ -607,6 +657,10 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect } else best_good_sectors = sectors; + if (best_disk >= 0) + /* At least two disks to choose from so failfast is OK */ + set_bit(R1BIO_FailFast, &r1_bio->state); + nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev)); has_nonrot_disk |= nonrot; pending = atomic_read(&rdev->nr_pending); @@ -645,11 +699,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect } break; } - /* If device is idle, use it */ - if (pending == 0) { - best_disk = disk; - break; - } if (choose_next_idle) continue; @@ -672,7 +721,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect * mixed ratation/non-rotational disks depending on workload. */ if (best_disk == -1) { - if (has_nonrot_disk) + if (has_nonrot_disk || min_pending == 0) best_disk = best_pending_disk; else best_disk = best_dist_disk; @@ -717,9 +766,9 @@ static int raid1_congested(struct mddev *mddev, int bits) * non-congested targets, it can be removed */ if ((bits & (1 << WB_async_congested)) || 1) - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= bdi_congested(q->backing_dev_info, bits); else - ret &= bdi_congested(&q->backing_dev_info, bits); + ret &= bdi_congested(q->backing_dev_info, bits); } } rcu_read_unlock(); @@ -745,9 +794,14 @@ static void flush_pending_writes(struct r1conf *conf) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; + struct md_rdev *rdev = (void*)bio->bi_bdev; bio->bi_next = NULL; - if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + bio->bi_bdev = rdev->bdev; + if (test_bit(Faulty, &rdev->flags)) { + bio->bi_error = -EIO; + bio_endio(bio); + } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) /* Just ignore it */ bio_endio(bio); else @@ -781,166 +835,228 @@ static void flush_pending_writes(struct r1conf *conf) */ static void raise_barrier(struct r1conf *conf, sector_t sector_nr) { + int idx = sector_to_idx(sector_nr); + spin_lock_irq(&conf->resync_lock); /* Wait until no block IO is waiting */ - wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, + wait_event_lock_irq(conf->wait_barrier, + !atomic_read(&conf->nr_waiting[idx]), conf->resync_lock); /* block any new IO from starting */ - conf->barrier++; - conf->next_resync = sector_nr; + atomic_inc(&conf->barrier[idx]); + /* + * In raise_barrier() we firstly increase conf->barrier[idx] then + * check conf->nr_pending[idx]. In _wait_barrier() we firstly + * increase conf->nr_pending[idx] then check conf->barrier[idx]. + * A memory barrier here to make sure conf->nr_pending[idx] won't + * be fetched before conf->barrier[idx] is increased. Otherwise + * there will be a race between raise_barrier() and _wait_barrier(). + */ + smp_mb__after_atomic(); /* For these conditions we must wait: * A: while the array is in frozen state - * B: while barrier >= RESYNC_DEPTH, meaning resync reach - * the max count which allowed. - * C: next_resync + RESYNC_SECTORS > start_next_window, meaning - * next resync will reach to the window which normal bios are - * handling. - * D: while there are any active requests in the current window. + * B: while conf->nr_pending[idx] is not 0, meaning regular I/O + * existing in corresponding I/O barrier bucket. + * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches + * max resync count which allowed on current I/O barrier bucket. */ wait_event_lock_irq(conf->wait_barrier, !conf->array_frozen && - conf->barrier < RESYNC_DEPTH && - conf->current_window_requests == 0 && - (conf->start_next_window >= - conf->next_resync + RESYNC_SECTORS), + !atomic_read(&conf->nr_pending[idx]) && + atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH, conf->resync_lock); - conf->nr_pending++; + atomic_inc(&conf->nr_pending[idx]); spin_unlock_irq(&conf->resync_lock); } -static void lower_barrier(struct r1conf *conf) +static void lower_barrier(struct r1conf *conf, sector_t sector_nr) { - unsigned long flags; - BUG_ON(conf->barrier <= 0); - spin_lock_irqsave(&conf->resync_lock, flags); - conf->barrier--; - conf->nr_pending--; - spin_unlock_irqrestore(&conf->resync_lock, flags); + int idx = sector_to_idx(sector_nr); + + BUG_ON(atomic_read(&conf->barrier[idx]) <= 0); + + atomic_dec(&conf->barrier[idx]); + atomic_dec(&conf->nr_pending[idx]); wake_up(&conf->wait_barrier); } -static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) +static void _wait_barrier(struct r1conf *conf, int idx) { - bool wait = false; + /* + * We need to increase conf->nr_pending[idx] very early here, + * then raise_barrier() can be blocked when it waits for + * conf->nr_pending[idx] to be 0. Then we can avoid holding + * conf->resync_lock when there is no barrier raised in same + * barrier unit bucket. Also if the array is frozen, I/O + * should be blocked until array is unfrozen. + */ + atomic_inc(&conf->nr_pending[idx]); + /* + * In _wait_barrier() we firstly increase conf->nr_pending[idx], then + * check conf->barrier[idx]. In raise_barrier() we firstly increase + * conf->barrier[idx], then check conf->nr_pending[idx]. A memory + * barrier is necessary here to make sure conf->barrier[idx] won't be + * fetched before conf->nr_pending[idx] is increased. Otherwise there + * will be a race between _wait_barrier() and raise_barrier(). + */ + smp_mb__after_atomic(); - if (conf->array_frozen || !bio) - wait = true; - else if (conf->barrier && bio_data_dir(bio) == WRITE) { - if ((conf->mddev->curr_resync_completed - >= bio_end_sector(bio)) || - (conf->next_resync + NEXT_NORMALIO_DISTANCE - <= bio->bi_iter.bi_sector)) - wait = false; - else - wait = true; - } + /* + * Don't worry about checking two atomic_t variables at same time + * here. If during we check conf->barrier[idx], the array is + * frozen (conf->array_frozen is 1), and chonf->barrier[idx] is + * 0, it is safe to return and make the I/O continue. Because the + * array is frozen, all I/O returned here will eventually complete + * or be queued, no race will happen. See code comment in + * frozen_array(). + */ + if (!READ_ONCE(conf->array_frozen) && + !atomic_read(&conf->barrier[idx])) + return; - return wait; + /* + * After holding conf->resync_lock, conf->nr_pending[idx] + * should be decreased before waiting for barrier to drop. + * Otherwise, we may encounter a race condition because + * raise_barrer() might be waiting for conf->nr_pending[idx] + * to be 0 at same time. + */ + spin_lock_irq(&conf->resync_lock); + atomic_inc(&conf->nr_waiting[idx]); + atomic_dec(&conf->nr_pending[idx]); + /* + * In case freeze_array() is waiting for + * get_unqueued_pending() == extra + */ + wake_up(&conf->wait_barrier); + /* Wait for the barrier in same barrier unit bucket to drop. */ + wait_event_lock_irq(conf->wait_barrier, + !conf->array_frozen && + !atomic_read(&conf->barrier[idx]), + conf->resync_lock); + atomic_inc(&conf->nr_pending[idx]); + atomic_dec(&conf->nr_waiting[idx]); + spin_unlock_irq(&conf->resync_lock); } -static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) +static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr) { - sector_t sector = 0; + int idx = sector_to_idx(sector_nr); - spin_lock_irq(&conf->resync_lock); - if (need_to_wait_for_sync(conf, bio)) { - conf->nr_waiting++; - /* Wait for the barrier to drop. - * However if there are already pending - * requests (preventing the barrier from - * rising completely), and the - * per-process bio queue isn't empty, - * then don't wait, as we need to empty - * that queue to allow conf->start_next_window - * to increase. - */ - wait_event_lock_irq(conf->wait_barrier, - !conf->array_frozen && - (!conf->barrier || - ((conf->start_next_window < - conf->next_resync + RESYNC_SECTORS) && - current->bio_list && - !bio_list_empty(current->bio_list))), - conf->resync_lock); - conf->nr_waiting--; - } - - if (bio && bio_data_dir(bio) == WRITE) { - if (bio->bi_iter.bi_sector >= conf->next_resync) { - if (conf->start_next_window == MaxSector) - conf->start_next_window = - conf->next_resync + - NEXT_NORMALIO_DISTANCE; - - if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) - <= bio->bi_iter.bi_sector) - conf->next_window_requests++; - else - conf->current_window_requests++; - sector = conf->start_next_window; - } - } + /* + * Very similar to _wait_barrier(). The difference is, for read + * I/O we don't need wait for sync I/O, but if the whole array + * is frozen, the read I/O still has to wait until the array is + * unfrozen. Since there is no ordering requirement with + * conf->barrier[idx] here, memory barrier is unnecessary as well. + */ + atomic_inc(&conf->nr_pending[idx]); - conf->nr_pending++; + if (!READ_ONCE(conf->array_frozen)) + return; + + spin_lock_irq(&conf->resync_lock); + atomic_inc(&conf->nr_waiting[idx]); + atomic_dec(&conf->nr_pending[idx]); + /* + * In case freeze_array() is waiting for + * get_unqueued_pending() == extra + */ + wake_up(&conf->wait_barrier); + /* Wait for array to be unfrozen */ + wait_event_lock_irq(conf->wait_barrier, + !conf->array_frozen, + conf->resync_lock); + atomic_inc(&conf->nr_pending[idx]); + atomic_dec(&conf->nr_waiting[idx]); spin_unlock_irq(&conf->resync_lock); - return sector; } -static void allow_barrier(struct r1conf *conf, sector_t start_next_window, - sector_t bi_sector) +static void wait_barrier(struct r1conf *conf, sector_t sector_nr) { - unsigned long flags; + int idx = sector_to_idx(sector_nr); - spin_lock_irqsave(&conf->resync_lock, flags); - conf->nr_pending--; - if (start_next_window) { - if (start_next_window == conf->start_next_window) { - if (conf->start_next_window + NEXT_NORMALIO_DISTANCE - <= bi_sector) - conf->next_window_requests--; - else - conf->current_window_requests--; - } else - conf->current_window_requests--; - - if (!conf->current_window_requests) { - if (conf->next_window_requests) { - conf->current_window_requests = - conf->next_window_requests; - conf->next_window_requests = 0; - conf->start_next_window += - NEXT_NORMALIO_DISTANCE; - } else - conf->start_next_window = MaxSector; - } - } - spin_unlock_irqrestore(&conf->resync_lock, flags); + _wait_barrier(conf, idx); +} + +static void wait_all_barriers(struct r1conf *conf) +{ + int idx; + + for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) + _wait_barrier(conf, idx); +} + +static void _allow_barrier(struct r1conf *conf, int idx) +{ + atomic_dec(&conf->nr_pending[idx]); wake_up(&conf->wait_barrier); } +static void allow_barrier(struct r1conf *conf, sector_t sector_nr) +{ + int idx = sector_to_idx(sector_nr); + + _allow_barrier(conf, idx); +} + +static void allow_all_barriers(struct r1conf *conf) +{ + int idx; + + for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) + _allow_barrier(conf, idx); +} + +/* conf->resync_lock should be held */ +static int get_unqueued_pending(struct r1conf *conf) +{ + int idx, ret; + + for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++) + ret += atomic_read(&conf->nr_pending[idx]) - + atomic_read(&conf->nr_queued[idx]); + + return ret; +} + static void freeze_array(struct r1conf *conf, int extra) { - /* stop syncio and normal IO and wait for everything to + /* Stop sync I/O and normal I/O and wait for everything to * go quite. - * We wait until nr_pending match nr_queued+extra - * This is called in the context of one normal IO request - * that has failed. Thus any sync request that might be pending - * will be blocked by nr_pending, and we need to wait for - * pending IO requests to complete or be queued for re-try. - * Thus the number queued (nr_queued) plus this request (extra) - * must match the number of pending IOs (nr_pending) before - * we continue. + * This is called in two situations: + * 1) management command handlers (reshape, remove disk, quiesce). + * 2) one normal I/O request failed. + + * After array_frozen is set to 1, new sync IO will be blocked at + * raise_barrier(), and new normal I/O will blocked at _wait_barrier() + * or wait_read_barrier(). The flying I/Os will either complete or be + * queued. When everything goes quite, there are only queued I/Os left. + + * Every flying I/O contributes to a conf->nr_pending[idx], idx is the + * barrier bucket index which this I/O request hits. When all sync and + * normal I/O are queued, sum of all conf->nr_pending[] will match sum + * of all conf->nr_queued[]. But normal I/O failure is an exception, + * in handle_read_error(), we may call freeze_array() before trying to + * fix the read error. In this case, the error read I/O is not queued, + * so get_unqueued_pending() == 1. + * + * Therefore before this function returns, we need to wait until + * get_unqueued_pendings(conf) gets equal to extra. For + * normal I/O context, extra is 1, in rested situations extra is 0. */ spin_lock_irq(&conf->resync_lock); conf->array_frozen = 1; - wait_event_lock_irq_cmd(conf->wait_barrier, - conf->nr_pending == conf->nr_queued+extra, - conf->resync_lock, - flush_pending_writes(conf)); + raid1_log(conf->mddev, "wait freeze"); + wait_event_lock_irq_cmd( + conf->wait_barrier, + get_unqueued_pending(conf) == extra, + conf->resync_lock, + flush_pending_writes(conf)); spin_unlock_irq(&conf->resync_lock); } static void unfreeze_array(struct r1conf *conf) @@ -948,8 +1064,8 @@ static void unfreeze_array(struct r1conf *conf) /* reverse the effect of the freeze */ spin_lock_irq(&conf->resync_lock); conf->array_frozen = 0; - wake_up(&conf->wait_barrier); spin_unlock_irq(&conf->resync_lock); + wake_up(&conf->wait_barrier); } /* duplicate the data pages for behind I/O @@ -1019,9 +1135,14 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; + struct md_rdev *rdev = (void*)bio->bi_bdev; bio->bi_next = NULL; - if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + bio->bi_bdev = rdev->bdev; + if (test_bit(Faulty, &rdev->flags)) { + bio->bi_error = -EIO; + bio_endio(bio); + } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) /* Just ignore it */ bio_endio(bio); else @@ -1031,27 +1152,143 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) kfree(plug); } -static void raid1_make_request(struct mddev *mddev, struct bio * bio) +static inline struct r1bio * +alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled) +{ + struct r1conf *conf = mddev->private; + struct r1bio *r1_bio; + + r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); + + r1_bio->master_bio = bio; + r1_bio->sectors = bio_sectors(bio) - sectors_handled; + r1_bio->state = 0; + r1_bio->mddev = mddev; + r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; + + return r1_bio; +} + +static void raid1_read_request(struct mddev *mddev, struct bio *bio) { struct r1conf *conf = mddev->private; struct raid1_info *mirror; struct r1bio *r1_bio; struct bio *read_bio; - int i, disks; - struct bitmap *bitmap; - unsigned long flags; + struct bitmap *bitmap = mddev->bitmap; const int op = bio_op(bio); - const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); - const unsigned long do_flush_fua = (bio->bi_opf & - (REQ_PREFLUSH | REQ_FUA)); + int sectors_handled; + int max_sectors; + int rdisk; + + /* + * Still need barrier for READ in case that whole + * array is frozen. + */ + wait_read_barrier(conf, bio->bi_iter.bi_sector); + + r1_bio = alloc_r1bio(mddev, bio, 0); + + /* + * We might need to issue multiple reads to different + * devices if there are bad blocks around, so we keep + * track of the number of reads in bio->bi_phys_segments. + * If this is 0, there is only one r1_bio and no locking + * will be needed when requests complete. If it is + * non-zero, then it is the number of not-completed requests. + */ + bio->bi_phys_segments = 0; + bio_clear_flag(bio, BIO_SEG_VALID); + + /* + * make_request() can abort the operation when read-ahead is being + * used and no empty request is available. + */ +read_again: + rdisk = read_balance(conf, r1_bio, &max_sectors); + + if (rdisk < 0) { + /* couldn't find anywhere to read from */ + raid_end_bio_io(r1_bio); + return; + } + mirror = conf->mirrors + rdisk; + + if (test_bit(WriteMostly, &mirror->rdev->flags) && + bitmap) { + /* + * Reading from a write-mostly device must take care not to + * over-take any writes that are 'behind' + */ + raid1_log(mddev, "wait behind writes"); + wait_event(bitmap->behind_wait, + atomic_read(&bitmap->behind_writes) == 0); + } + r1_bio->read_disk = rdisk; + + read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); + bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, + max_sectors); + + r1_bio->bios[rdisk] = read_bio; + + read_bio->bi_iter.bi_sector = r1_bio->sector + + mirror->rdev->data_offset; + read_bio->bi_bdev = mirror->rdev->bdev; + read_bio->bi_end_io = raid1_end_read_request; + bio_set_op_attrs(read_bio, op, do_sync); + if (test_bit(FailFast, &mirror->rdev->flags) && + test_bit(R1BIO_FailFast, &r1_bio->state)) + read_bio->bi_opf |= MD_FAILFAST; + read_bio->bi_private = r1_bio; + + if (mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev), + read_bio, disk_devt(mddev->gendisk), + r1_bio->sector); + + if (max_sectors < r1_bio->sectors) { + /* + * could not read all from this device, so we will need another + * r1_bio. + */ + sectors_handled = (r1_bio->sector + max_sectors + - bio->bi_iter.bi_sector); + r1_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (bio->bi_phys_segments == 0) + bio->bi_phys_segments = 2; + else + bio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + + /* + * Cannot call generic_make_request directly as that will be + * queued in __make_request and subsequent mempool_alloc might + * block waiting for it. So hand bio over to raid1d. + */ + reschedule_retry(r1_bio); + + r1_bio = alloc_r1bio(mddev, bio, sectors_handled); + goto read_again; + } else + generic_make_request(read_bio); +} + +static void raid1_write_request(struct mddev *mddev, struct bio *bio) +{ + struct r1conf *conf = mddev->private; + struct r1bio *r1_bio; + int i, disks; + struct bitmap *bitmap = mddev->bitmap; + unsigned long flags; struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; struct raid1_plug_cb *plug = NULL; int first_clone; int sectors_handled; int max_sectors; - sector_t start_next_window; /* * Register the new request and wait if the reconstruction @@ -1061,15 +1298,15 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio) md_write_start(mddev, bio); /* wait on superblock update early */ - if (bio_data_dir(bio) == WRITE && - ((bio_end_sector(bio) > mddev->suspend_lo && + if ((bio_end_sector(bio) > mddev->suspend_lo && bio->bi_iter.bi_sector < mddev->suspend_hi) || (mddev_is_clustered(mddev) && md_cluster_ops->area_resyncing(mddev, WRITE, - bio->bi_iter.bi_sector, bio_end_sector(bio))))) { - /* As the suspend_* range is controlled by - * userspace, we want an interruptible - * wait. + bio->bi_iter.bi_sector, bio_end_sector(bio)))) { + + /* + * As the suspend_* range is controlled by userspace, we want + * an interruptible wait. */ DEFINE_WAIT(w); for (;;) { @@ -1080,33 +1317,20 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio) bio->bi_iter.bi_sector >= mddev->suspend_hi || (mddev_is_clustered(mddev) && !md_cluster_ops->area_resyncing(mddev, WRITE, - bio->bi_iter.bi_sector, bio_end_sector(bio)))) + bio->bi_iter.bi_sector, + bio_end_sector(bio)))) break; schedule(); } finish_wait(&conf->wait_barrier, &w); } + wait_barrier(conf, bio->bi_iter.bi_sector); - start_next_window = wait_barrier(conf, bio); + r1_bio = alloc_r1bio(mddev, bio, 0); - bitmap = mddev->bitmap; - - /* - * make_request() can abort the operation when read-ahead is being - * used and no empty request is available. - * - */ - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - - r1_bio->master_bio = bio; - r1_bio->sectors = bio_sectors(bio); - r1_bio->state = 0; - r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_iter.bi_sector; - - /* We might need to issue multiple reads to different + /* We might need to issue multiple writes to different * devices if there are bad blocks around, so we keep - * track of the number of reads in bio->bi_phys_segments. + * track of the number of writes in bio->bi_phys_segments. * If this is 0, there is only one r1_bio and no locking * will be needed when requests complete. If it is * non-zero, then it is the number of not-completed requests. @@ -1114,87 +1338,9 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio) bio->bi_phys_segments = 0; bio_clear_flag(bio, BIO_SEG_VALID); - if (rw == READ) { - /* - * read balancing logic: - */ - int rdisk; - -read_again: - rdisk = read_balance(conf, r1_bio, &max_sectors); - - if (rdisk < 0) { - /* couldn't find anywhere to read from */ - raid_end_bio_io(r1_bio); - return; - } - mirror = conf->mirrors + rdisk; - - if (test_bit(WriteMostly, &mirror->rdev->flags) && - bitmap) { - /* Reading from a write-mostly device must - * take care not to over-take any writes - * that are 'behind' - */ - wait_event(bitmap->behind_wait, - atomic_read(&bitmap->behind_writes) == 0); - } - r1_bio->read_disk = rdisk; - r1_bio->start_next_window = 0; - - read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, - max_sectors); - - r1_bio->bios[rdisk] = read_bio; - - read_bio->bi_iter.bi_sector = r1_bio->sector + - mirror->rdev->data_offset; - read_bio->bi_bdev = mirror->rdev->bdev; - read_bio->bi_end_io = raid1_end_read_request; - bio_set_op_attrs(read_bio, op, do_sync); - read_bio->bi_private = r1_bio; - - if (max_sectors < r1_bio->sectors) { - /* could not read all from this device, so we will - * need another r1_bio. - */ - - sectors_handled = (r1_bio->sector + max_sectors - - bio->bi_iter.bi_sector); - r1_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); - /* Cannot call generic_make_request directly - * as that will be queued in __make_request - * and subsequent mempool_alloc might block waiting - * for it. So hand bio over to raid1d. - */ - reschedule_retry(r1_bio); - - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - - r1_bio->master_bio = bio; - r1_bio->sectors = bio_sectors(bio) - sectors_handled; - r1_bio->state = 0; - r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_iter.bi_sector + - sectors_handled; - goto read_again; - } else - generic_make_request(read_bio); - return; - } - - /* - * WRITE: - */ if (conf->pending_count >= max_queued_requests) { md_wakeup_thread(mddev->thread); + raid1_log(mddev, "wait queued"); wait_event(conf->wait_barrier, conf->pending_count < max_queued_requests); } @@ -1211,7 +1357,6 @@ read_again: disks = conf->raid_disks * 2; retry_write: - r1_bio->start_next_window = start_next_window; blocked_rdev = NULL; rcu_read_lock(); max_sectors = r1_bio->sectors; @@ -1235,8 +1380,7 @@ read_again: int bad_sectors; int is_bad; - is_bad = is_badblock(rdev, r1_bio->sector, - max_sectors, + is_bad = is_badblock(rdev, r1_bio->sector, max_sectors, &first_bad, &bad_sectors); if (is_bad < 0) { /* mustn't write here until the bad block is @@ -1279,24 +1423,15 @@ read_again: if (unlikely(blocked_rdev)) { /* Wait for this device to become unblocked */ int j; - sector_t old = start_next_window; for (j = 0; j < i; j++) if (r1_bio->bios[j]) rdev_dec_pending(conf->mirrors[j].rdev, mddev); r1_bio->state = 0; - allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector); + allow_barrier(conf, bio->bi_iter.bi_sector); + raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); md_wait_for_blocked_rdev(blocked_rdev, mddev); - start_next_window = wait_barrier(conf, bio); - /* - * We must make sure the multi r1bios of bio have - * the same value of bi_phys_segments - */ - if (bio->bi_phys_segments && old && - old != start_next_window) - /* Wait for the former r1bio(s) to complete */ - wait_event(conf->wait_barrier, - bio->bi_phys_segments == 1); + wait_barrier(conf, bio->bi_iter.bi_sector); goto retry_write; } @@ -1319,12 +1454,12 @@ read_again: first_clone = 1; for (i = 0; i < disks; i++) { - struct bio *mbio; + struct bio *mbio = NULL; + sector_t offset; if (!r1_bio->bios[i]) continue; - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors); + offset = r1_bio->sector - bio->bi_iter.bi_sector; if (first_clone) { /* do behind I/O ? @@ -1334,8 +1469,13 @@ read_again: if (bitmap && (atomic_read(&bitmap->behind_writes) < mddev->bitmap_info.max_write_behind) && - !waitqueue_active(&bitmap->behind_wait)) + !waitqueue_active(&bitmap->behind_wait)) { + mbio = bio_clone_bioset_partial(bio, GFP_NOIO, + mddev->bio_set, + offset << 9, + max_sectors << 9); alloc_behind_pages(mbio, r1_bio); + } bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors, @@ -1343,6 +1483,19 @@ read_again: &r1_bio->state)); first_clone = 0; } + + if (!mbio) { + if (r1_bio->behind_bvecs) + mbio = bio_clone_bioset_partial(bio, GFP_NOIO, + mddev->bio_set, + offset << 9, + max_sectors << 9); + else { + mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); + bio_trim(mbio, offset, max_sectors); + } + } + if (r1_bio->behind_bvecs) { struct bio_vec *bvec; int j; @@ -1362,11 +1515,22 @@ read_again: conf->mirrors[i].rdev->data_offset); mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - bio_set_op_attrs(mbio, op, do_flush_fua | do_sync); + mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA)); + if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) && + !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) && + conf->raid_disks - mddev->degraded > 1) + mbio->bi_opf |= MD_FAILFAST; mbio->bi_private = r1_bio; atomic_inc(&r1_bio->remaining); + if (mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), + mbio, disk_devt(mddev->gendisk), + r1_bio->sector); + /* flush_pending_writes() needs access to the rdev so...*/ + mbio->bi_bdev = (void*)conf->mirrors[i].rdev; + cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); if (cb) plug = container_of(cb, struct raid1_plug_cb, cb); @@ -1392,12 +1556,7 @@ read_again: /* We need another r1_bio. It has already been counted * in bio->bi_phys_segments */ - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - r1_bio->master_bio = bio; - r1_bio->sectors = bio_sectors(bio) - sectors_handled; - r1_bio->state = 0; - r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; + r1_bio = alloc_r1bio(mddev, bio, sectors_handled); goto retry_write; } @@ -1407,6 +1566,34 @@ read_again: wake_up(&conf->wait_barrier); } +static void raid1_make_request(struct mddev *mddev, struct bio *bio) +{ + struct bio *split; + sector_t sectors; + + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { + md_flush_request(mddev, bio); + return; + } + + /* if bio exceeds barrier unit boundary, split it */ + do { + sectors = align_to_barrier_unit_end( + bio->bi_iter.bi_sector, bio_sectors(bio)); + if (sectors < bio_sectors(bio)) { + split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set); + bio_chain(split, bio); + } else { + split = bio; + } + + if (bio_data_dir(split) == READ) + raid1_read_request(mddev, split); + else + raid1_write_request(mddev, split); + } while (split != bio); +} + static void raid1_status(struct seq_file *seq, struct mddev *mddev) { struct r1conf *conf = mddev->private; @@ -1436,6 +1623,7 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) * next level up know. * else mark the drive as failed */ + spin_lock_irqsave(&conf->device_lock, flags); if (test_bit(In_sync, &rdev->flags) && (conf->raid_disks - mddev->degraded) == 1) { /* @@ -1445,10 +1633,10 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) * it is very likely to fail. */ conf->recovery_disabled = mddev->recovery_disabled; + spin_unlock_irqrestore(&conf->device_lock, flags); return; } set_bit(Blocked, &rdev->flags); - spin_lock_irqsave(&conf->device_lock, flags); if (test_and_clear_bit(In_sync, &rdev->flags)) { mddev->degraded++; set_bit(Faulty, &rdev->flags); @@ -1459,55 +1647,46 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) * if recovery is running, make sure it aborts. */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); - set_mask_bits(&mddev->flags, 0, - BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); - printk(KERN_ALERT - "md/raid1:%s: Disk failure on %s, disabling device.\n" - "md/raid1:%s: Operation continuing on %d devices.\n", - mdname(mddev), bdevname(rdev->bdev, b), - mdname(mddev), conf->raid_disks - mddev->degraded); + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); + pr_crit("md/raid1:%s: Disk failure on %s, disabling device.\n" + "md/raid1:%s: Operation continuing on %d devices.\n", + mdname(mddev), bdevname(rdev->bdev, b), + mdname(mddev), conf->raid_disks - mddev->degraded); } static void print_conf(struct r1conf *conf) { int i; - printk(KERN_DEBUG "RAID1 conf printout:\n"); + pr_debug("RAID1 conf printout:\n"); if (!conf) { - printk(KERN_DEBUG "(!conf)\n"); + pr_debug("(!conf)\n"); return; } - printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, - conf->raid_disks); + pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, + conf->raid_disks); rcu_read_lock(); for (i = 0; i < conf->raid_disks; i++) { char b[BDEVNAME_SIZE]; struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev) - printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", - i, !test_bit(In_sync, &rdev->flags), - !test_bit(Faulty, &rdev->flags), - bdevname(rdev->bdev,b)); + pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n", + i, !test_bit(In_sync, &rdev->flags), + !test_bit(Faulty, &rdev->flags), + bdevname(rdev->bdev,b)); } rcu_read_unlock(); } static void close_sync(struct r1conf *conf) { - wait_barrier(conf, NULL); - allow_barrier(conf, 0, 0); + wait_all_barriers(conf); + allow_all_barriers(conf); mempool_destroy(conf->r1buf_pool); conf->r1buf_pool = NULL; - - spin_lock_irq(&conf->resync_lock); - conf->next_resync = MaxSector - 2 * NEXT_NORMALIO_DISTANCE; - conf->start_next_window = MaxSector; - conf->current_window_requests += - conf->next_window_requests; - conf->next_window_requests = 0; - spin_unlock_irq(&conf->resync_lock); } static int raid1_spare_active(struct mddev *mddev) @@ -1788,12 +1967,24 @@ static int fix_sync_read_error(struct r1bio *r1_bio) sector_t sect = r1_bio->sector; int sectors = r1_bio->sectors; int idx = 0; + struct md_rdev *rdev; + + rdev = conf->mirrors[r1_bio->read_disk].rdev; + if (test_bit(FailFast, &rdev->flags)) { + /* Don't try recovering from here - just fail it + * ... unless it is the last working device of course */ + md_error(mddev, rdev); + if (test_bit(Faulty, &rdev->flags)) + /* Don't try to read from here, but make sure + * put_buf does it's thing + */ + bio->bi_end_io = end_sync_write; + } while(sectors) { int s = sectors; int d = r1_bio->read_disk; int success = 0; - struct md_rdev *rdev; int start; if (s > (PAGE_SIZE>>9)) @@ -1825,11 +2016,10 @@ static int fix_sync_read_error(struct r1bio *r1_bio) * work just disable and interrupt the recovery. * Don't fail devices as that won't really help. */ - printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" - " for block %llu\n", - mdname(mddev), - bdevname(bio->bi_bdev, b), - (unsigned long long)r1_bio->sector); + pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", + mdname(mddev), + bdevname(bio->bi_bdev, b), + (unsigned long long)r1_bio->sector); for (d = 0; d < conf->raid_disks * 2; d++) { rdev = conf->mirrors[d].rdev; if (!rdev || test_bit(Faulty, &rdev->flags)) @@ -2013,6 +2203,9 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) continue; bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); + if (test_bit(FailFast, &conf->mirrors[i].rdev->flags)) + wbio->bi_opf |= MD_FAILFAST; + wbio->bi_end_io = end_sync_write; atomic_inc(&r1_bio->remaining); md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio)); @@ -2122,13 +2315,11 @@ static void fix_read_error(struct r1conf *conf, int read_disk, if (r1_sync_page_io(rdev, sect, s, conf->tmppage, READ)) { atomic_add(s, &rdev->corrected_errors); - printk(KERN_INFO - "md/raid1:%s: read error corrected " - "(%d sectors at %llu on %s)\n", - mdname(mddev), s, - (unsigned long long)(sect + - rdev->data_offset), - bdevname(rdev->bdev, b)); + pr_info("md/raid1:%s: read error corrected (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)(sect + + rdev->data_offset), + bdevname(rdev->bdev, b)); } rdev_dec_pending(rdev, mddev); } else @@ -2192,7 +2383,8 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) wbio->bi_vcnt = vcnt; } else { - wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); + wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, + mddev->bio_set); } bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); @@ -2242,8 +2434,9 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) { - int m; + int m, idx; bool fail = false; + for (m = 0; m < conf->raid_disks * 2 ; m++) if (r1_bio->bios[m] == IO_MADE_GOOD) { struct md_rdev *rdev = conf->mirrors[m].rdev; @@ -2269,8 +2462,14 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) if (fail) { spin_lock_irq(&conf->device_lock); list_add(&r1_bio->retry_list, &conf->bio_end_io_list); - conf->nr_queued++; + idx = sector_to_idx(r1_bio->sector); + atomic_inc(&conf->nr_queued[idx]); spin_unlock_irq(&conf->device_lock); + /* + * In case freeze_array() is waiting for condition + * get_unqueued_pending() == extra to be true. + */ + wake_up(&conf->wait_barrier); md_wakeup_thread(conf->mddev->thread); } else { if (test_bit(R1BIO_WriteError, &r1_bio->state)) @@ -2287,6 +2486,8 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) struct bio *bio; char b[BDEVNAME_SIZE]; struct md_rdev *rdev; + dev_t bio_dev; + sector_t bio_sector; clear_bit(R1BIO_ReadError, &r1_bio->state); /* we got a read error. Maybe the drive is bad. Maybe just @@ -2300,10 +2501,14 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) bio = r1_bio->bios[r1_bio->read_disk]; bdevname(bio->bi_bdev, b); + bio_dev = bio->bi_bdev->bd_dev; + bio_sector = conf->mirrors[r1_bio->read_disk].rdev->data_offset + r1_bio->sector; bio_put(bio); r1_bio->bios[r1_bio->read_disk] = NULL; - if (mddev->ro == 0) { + rdev = conf->mirrors[r1_bio->read_disk].rdev; + if (mddev->ro == 0 + && !test_bit(FailFast, &rdev->flags)) { freeze_array(conf, 1); fix_read_error(conf, r1_bio->read_disk, r1_bio->sector, r1_bio->sectors); @@ -2312,34 +2517,35 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED; } - rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); + rdev_dec_pending(rdev, conf->mddev); read_more: disk = read_balance(conf, r1_bio, &max_sectors); if (disk == -1) { - printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" - " read error for block %llu\n", - mdname(mddev), b, (unsigned long long)r1_bio->sector); + pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", + mdname(mddev), b, (unsigned long long)r1_bio->sector); raid_end_bio_io(r1_bio); } else { const unsigned long do_sync = r1_bio->master_bio->bi_opf & REQ_SYNC; r1_bio->read_disk = disk; - bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); + bio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, + mddev->bio_set); bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors); r1_bio->bios[r1_bio->read_disk] = bio; rdev = conf->mirrors[disk].rdev; - printk_ratelimited(KERN_ERR - "md/raid1:%s: redirecting sector %llu" - " to other mirror: %s\n", - mdname(mddev), - (unsigned long long)r1_bio->sector, - bdevname(rdev->bdev, b)); + pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %s\n", + mdname(mddev), + (unsigned long long)r1_bio->sector, + bdevname(rdev->bdev, b)); bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset; bio->bi_bdev = rdev->bdev; bio->bi_end_io = raid1_end_read_request; bio_set_op_attrs(bio, REQ_OP_READ, do_sync); + if (test_bit(FailFast, &rdev->flags) && + test_bit(R1BIO_FailFast, &r1_bio->state)) + bio->bi_opf |= MD_FAILFAST; bio->bi_private = r1_bio; if (max_sectors < r1_bio->sectors) { /* Drat - have to split this up more */ @@ -2353,22 +2559,20 @@ read_more: else mbio->bi_phys_segments++; spin_unlock_irq(&conf->device_lock); + trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), + bio, bio_dev, bio_sector); generic_make_request(bio); bio = NULL; - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - - r1_bio->master_bio = mbio; - r1_bio->sectors = bio_sectors(mbio) - sectors_handled; - r1_bio->state = 0; + r1_bio = alloc_r1bio(mddev, mbio, sectors_handled); set_bit(R1BIO_ReadError, &r1_bio->state); - r1_bio->mddev = mddev; - r1_bio->sector = mbio->bi_iter.bi_sector + - sectors_handled; goto read_more; - } else + } else { + trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), + bio, bio_dev, bio_sector); generic_make_request(bio); + } } } @@ -2380,24 +2584,23 @@ static void raid1d(struct md_thread *thread) struct r1conf *conf = mddev->private; struct list_head *head = &conf->retry_list; struct blk_plug plug; + int idx; md_check_recovery(mddev); if (!list_empty_careful(&conf->bio_end_io_list) && - !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { LIST_HEAD(tmp); spin_lock_irqsave(&conf->device_lock, flags); - if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { - while (!list_empty(&conf->bio_end_io_list)) { - list_move(conf->bio_end_io_list.prev, &tmp); - conf->nr_queued--; - } - } + if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) + list_splice_init(&conf->bio_end_io_list, &tmp); spin_unlock_irqrestore(&conf->device_lock, flags); while (!list_empty(&tmp)) { r1_bio = list_first_entry(&tmp, struct r1bio, retry_list); list_del(&r1_bio->retry_list); + idx = sector_to_idx(r1_bio->sector); + atomic_dec(&conf->nr_queued[idx]); if (mddev->degraded) set_bit(R1BIO_Degraded, &r1_bio->state); if (test_bit(R1BIO_WriteError, &r1_bio->state)) @@ -2418,7 +2621,8 @@ static void raid1d(struct md_thread *thread) } r1_bio = list_entry(head->prev, struct r1bio, retry_list); list_del(head->prev); - conf->nr_queued--; + idx = sector_to_idx(r1_bio->sector); + atomic_dec(&conf->nr_queued[idx]); spin_unlock_irqrestore(&conf->device_lock, flags); mddev = r1_bio->mddev; @@ -2441,7 +2645,7 @@ static void raid1d(struct md_thread *thread) generic_make_request(r1_bio->bios[r1_bio->read_disk]); cond_resched(); - if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) + if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING)) md_check_recovery(mddev); } blk_finish_plug(&plug); @@ -2457,7 +2661,6 @@ static int init_resync(struct r1conf *conf) conf->poolinfo); if (!conf->r1buf_pool) return -ENOMEM; - conf->next_resync = 0; return 0; } @@ -2486,6 +2689,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, int still_degraded = 0; int good_sectors = RESYNC_SECTORS; int min_bad = 0; /* number of sectors that are bad in all devices */ + int idx = sector_to_idx(sector_nr); if (!conf->r1buf_pool) if (init_resync(conf)) @@ -2535,7 +2739,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * If there is non-resync activity waiting for a turn, then let it * though before starting on this new sync request. */ - if (conf->nr_waiting) + if (atomic_read(&conf->nr_waiting[idx])) schedule_timeout_uninterruptible(1); /* we are incrementing sector_nr below. To be safe, we check against @@ -2562,6 +2766,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, r1_bio->sector = sector_nr; r1_bio->state = 0; set_bit(R1BIO_IsSync, &r1_bio->state); + /* make sure good_sectors won't go across barrier unit boundary */ + good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors); for (i = 0; i < conf->raid_disks * 2; i++) { struct md_rdev *rdev; @@ -2623,6 +2829,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_iter.bi_sector = sector_nr + rdev->data_offset; bio->bi_bdev = rdev->bdev; bio->bi_private = r1_bio; + if (test_bit(FailFast, &rdev->flags)) + bio->bi_opf |= MD_FAILFAST; } } rcu_read_unlock(); @@ -2642,7 +2850,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, min_bad, 0 ) && ok; } - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); *skipped = 1; put_buf(r1_bio); @@ -2753,6 +2961,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (bio->bi_end_io == end_sync_read) { read_targets--; md_sync_acct(bio->bi_bdev, nr_sectors); + if (read_targets == 1) + bio->bi_opf &= ~MD_FAILFAST; generic_make_request(bio); } } @@ -2760,6 +2970,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, atomic_set(&r1_bio->remaining, 1); bio = r1_bio->bios[r1_bio->read_disk]; md_sync_acct(bio->bi_bdev, nr_sectors); + if (read_targets == 1) + bio->bi_opf &= ~MD_FAILFAST; generic_make_request(bio); } @@ -2786,6 +2998,26 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (!conf) goto abort; + conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR, + sizeof(atomic_t), GFP_KERNEL); + if (!conf->nr_pending) + goto abort; + + conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR, + sizeof(atomic_t), GFP_KERNEL); + if (!conf->nr_waiting) + goto abort; + + conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR, + sizeof(atomic_t), GFP_KERNEL); + if (!conf->nr_queued) + goto abort; + + conf->barrier = kcalloc(BARRIER_BUCKETS_NR, + sizeof(atomic_t), GFP_KERNEL); + if (!conf->barrier) + goto abort; + conf->mirrors = kzalloc(sizeof(struct raid1_info) * mddev->raid_disks * 2, GFP_KERNEL); @@ -2841,9 +3073,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) conf->pending_count = 0; conf->recovery_disabled = mddev->recovery_disabled - 1; - conf->start_next_window = MaxSector; - conf->current_window_requests = conf->next_window_requests = 0; - err = -EIO; for (i = 0; i < conf->raid_disks * 2; i++) { @@ -2875,12 +3104,8 @@ static struct r1conf *setup_conf(struct mddev *mddev) err = -ENOMEM; conf->thread = md_register_thread(raid1d, mddev, "raid1"); - if (!conf->thread) { - printk(KERN_ERR - "md/raid1:%s: couldn't allocate thread\n", - mdname(mddev)); + if (!conf->thread) goto abort; - } return conf; @@ -2890,6 +3115,10 @@ static struct r1conf *setup_conf(struct mddev *mddev) kfree(conf->mirrors); safe_put_page(conf->tmppage); kfree(conf->poolinfo); + kfree(conf->nr_pending); + kfree(conf->nr_waiting); + kfree(conf->nr_queued); + kfree(conf->barrier); kfree(conf); } return ERR_PTR(err); @@ -2905,13 +3134,13 @@ static int raid1_run(struct mddev *mddev) bool discard_supported = false; if (mddev->level != 1) { - printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", - mdname(mddev), mddev->level); + pr_warn("md/raid1:%s: raid level not set to mirroring (%d)\n", + mdname(mddev), mddev->level); return -EIO; } if (mddev->reshape_position != MaxSector) { - printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n", - mdname(mddev)); + pr_warn("md/raid1:%s: reshape_position set but not supported\n", + mdname(mddev)); return -EIO; } /* @@ -2950,11 +3179,9 @@ static int raid1_run(struct mddev *mddev) mddev->recovery_cp = MaxSector; if (mddev->recovery_cp != MaxSector) - printk(KERN_NOTICE "md/raid1:%s: not clean" - " -- starting background reconstruction\n", - mdname(mddev)); - printk(KERN_INFO - "md/raid1:%s: active with %d out of %d mirrors\n", + pr_info("md/raid1:%s: not clean -- starting background reconstruction\n", + mdname(mddev)); + pr_info("md/raid1:%s: active with %d out of %d mirrors\n", mdname(mddev), mddev->raid_disks - mddev->degraded, mddev->raid_disks); @@ -2964,6 +3191,7 @@ static int raid1_run(struct mddev *mddev) mddev->thread = conf->thread; conf->thread = NULL; mddev->private = conf; + set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); @@ -2992,6 +3220,10 @@ static void raid1_free(struct mddev *mddev, void *priv) kfree(conf->mirrors); safe_put_page(conf->tmppage); kfree(conf->poolinfo); + kfree(conf->nr_pending); + kfree(conf->nr_waiting); + kfree(conf->nr_queued); + kfree(conf->barrier); kfree(conf); } @@ -3107,9 +3339,8 @@ static int raid1_reshape(struct mddev *mddev) rdev->raid_disk = d2; sysfs_unlink_rdev(mddev, rdev); if (sysfs_link_rdev(mddev, rdev)) - printk(KERN_WARNING - "md/raid1:%s: cannot register rd%d\n", - mdname(mddev), rdev->raid_disk); + pr_warn("md/raid1:%s: cannot register rd%d\n", + mdname(mddev), rdev->raid_disk); } if (rdev) newmirrors[d2++].rdev = rdev; @@ -3163,9 +3394,12 @@ static void *raid1_takeover(struct mddev *mddev) mddev->new_layout = 0; mddev->new_chunk_sectors = 0; conf = setup_conf(mddev); - if (!IS_ERR(conf)) + if (!IS_ERR(conf)) { /* Array must appear to be quiesced */ conf->array_frozen = 1; + mddev_clear_unsupported_flags(mddev, + UNSUPPORTED_MDDEV_FLAGS); + } return conf; } return ERR_PTR(-EINVAL); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 61c39b390cd8..dd22a37d0d83 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -1,6 +1,30 @@ #ifndef _RAID1_H #define _RAID1_H +/* + * each barrier unit size is 64MB fow now + * note: it must be larger than RESYNC_DEPTH + */ +#define BARRIER_UNIT_SECTOR_BITS 17 +#define BARRIER_UNIT_SECTOR_SIZE (1<<17) +/* + * In struct r1conf, the following members are related to I/O barrier + * buckets, + * atomic_t *nr_pending; + * atomic_t *nr_waiting; + * atomic_t *nr_queued; + * atomic_t *barrier; + * Each of them points to array of atomic_t variables, each array is + * designed to have BARRIER_BUCKETS_NR elements and occupy a single + * memory page. The data width of atomic_t variables is 4 bytes, equal + * to 1<<(ilog2(sizeof(atomic_t))), BARRIER_BUCKETS_NR_BITS is defined + * as (PAGE_SHIFT - ilog2(sizeof(int))) to make sure an array of + * atomic_t variables with BARRIER_BUCKETS_NR elements just exactly + * occupies a single memory page. + */ +#define BARRIER_BUCKETS_NR_BITS (PAGE_SHIFT - ilog2(sizeof(atomic_t))) +#define BARRIER_BUCKETS_NR (1<<BARRIER_BUCKETS_NR_BITS) + struct raid1_info { struct md_rdev *rdev; sector_t head_position; @@ -35,25 +59,6 @@ struct r1conf { */ int raid_disks; - /* During resync, read_balancing is only allowed on the part - * of the array that has been resynced. 'next_resync' tells us - * where that is. - */ - sector_t next_resync; - - /* When raid1 starts resync, we divide array into four partitions - * |---------|--------------|---------------------|-------------| - * next_resync start_next_window end_window - * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE - * end_window = start_next_window + NEXT_NORMALIO_DISTANCE - * current_window_requests means the count of normalIO between - * start_next_window and end_window. - * next_window_requests means the count of normalIO after end_window. - * */ - sector_t start_next_window; - int current_window_requests; - int next_window_requests; - spinlock_t device_lock; /* list of 'struct r1bio' that need to be processed by raid1d, @@ -79,10 +84,10 @@ struct r1conf { */ wait_queue_head_t wait_barrier; spinlock_t resync_lock; - int nr_pending; - int nr_waiting; - int nr_queued; - int barrier; + atomic_t *nr_pending; + atomic_t *nr_waiting; + atomic_t *nr_queued; + atomic_t *barrier; int array_frozen; /* Set to 1 if a full sync is needed, (fresh device added). @@ -135,7 +140,6 @@ struct r1bio { * in this BehindIO request */ sector_t sector; - sector_t start_next_window; int sectors; unsigned long state; struct mddev *mddev; @@ -161,14 +165,15 @@ struct r1bio { }; /* bits for r1bio.state */ -#define R1BIO_Uptodate 0 -#define R1BIO_IsSync 1 -#define R1BIO_Degraded 2 -#define R1BIO_BehindIO 3 +enum r1bio_state { + R1BIO_Uptodate, + R1BIO_IsSync, + R1BIO_Degraded, + R1BIO_BehindIO, /* Set ReadError on bios that experience a readerror so that * raid1d knows what to do with them. */ -#define R1BIO_ReadError 4 + R1BIO_ReadError, /* For write-behind requests, we call bi_end_io when * the last non-write-behind device completes, providing * any write was successful. Otherwise we call when @@ -176,10 +181,18 @@ struct r1bio { * with failure when last write completes (and all failed). * Record that bi_end_io was called with this flag... */ -#define R1BIO_Returned 6 + R1BIO_Returned, /* If a write for this request means we can clear some * known-bad-block records, we set this flag */ -#define R1BIO_MadeGood 7 -#define R1BIO_WriteError 8 + R1BIO_MadeGood, + R1BIO_WriteError, + R1BIO_FailFast, +}; + +static inline int sector_to_idx(sector_t sector) +{ + return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS, + BARRIER_BUCKETS_NR_BITS); +} #endif diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 39fddda2fef2..063c43d83b72 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -25,6 +25,7 @@ #include <linux/seq_file.h> #include <linux/ratelimit.h> #include <linux/kthread.h> +#include <trace/events/block.h> #include "md.h" #include "raid10.h" #include "raid0.h" @@ -99,12 +100,16 @@ static int max_queued_requests = 1024; static void allow_barrier(struct r10conf *conf); static void lower_barrier(struct r10conf *conf); static int _enough(struct r10conf *conf, int previous, int ignore); +static int enough(struct r10conf *conf, int ignore); static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped); static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); static void end_reshape_write(struct bio *bio); static void end_reshape(struct r10conf *conf); +#define raid10_log(md, fmt, args...) \ + do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0) + static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) { struct r10conf *conf = data; @@ -404,8 +409,7 @@ static void raid10_end_read_request(struct bio *bio) * oops, read error - keep the refcount on the rdev */ char b[BDEVNAME_SIZE]; - printk_ratelimited(KERN_ERR - "md/raid10:%s: %s: rescheduling sector %llu\n", + pr_err_ratelimited("md/raid10:%s: %s: rescheduling sector %llu\n", mdname(conf->mddev), bdevname(rdev->bdev, b), (unsigned long long)r10_bio->sector); @@ -447,6 +451,7 @@ static void raid10_end_write_request(struct bio *bio) struct r10conf *conf = r10_bio->mddev->private; int slot, repl; struct md_rdev *rdev = NULL; + struct bio *to_put = NULL; bool discard_error; discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD; @@ -474,8 +479,24 @@ static void raid10_end_write_request(struct bio *bio) if (!test_and_set_bit(WantReplacement, &rdev->flags)) set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); - set_bit(R10BIO_WriteError, &r10_bio->state); + dec_rdev = 0; + if (test_bit(FailFast, &rdev->flags) && + (bio->bi_opf & MD_FAILFAST)) { + md_error(rdev->mddev, rdev); + if (!test_bit(Faulty, &rdev->flags)) + /* This is the only remaining device, + * We need to retry the write without + * FailFast + */ + set_bit(R10BIO_WriteError, &r10_bio->state); + else { + r10_bio->devs[slot].bio = NULL; + to_put = bio; + dec_rdev = 1; + } + } else + set_bit(R10BIO_WriteError, &r10_bio->state); } } else { /* @@ -525,6 +546,8 @@ static void raid10_end_write_request(struct bio *bio) one_write_done(r10_bio); if (dec_rdev) rdev_dec_pending(rdev, conf->mddev); + if (to_put) + bio_put(to_put); } /* @@ -716,6 +739,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, best_dist = MaxSector; best_good_sectors = 0; do_balance = 1; + clear_bit(R10BIO_FailFast, &r10_bio->state); /* * Check if we can balance. We can balance on the whole * device if no resync is going on (recovery is ok), or below @@ -780,15 +804,18 @@ static struct md_rdev *read_balance(struct r10conf *conf, if (!do_balance) break; + if (best_slot >= 0) + /* At least 2 disks to choose from so failfast is OK */ + set_bit(R10BIO_FailFast, &r10_bio->state); /* This optimisation is debatable, and completely destroys * sequential read speed for 'far copies' arrays. So only * keep it for 'near' arrays, and review those later. */ if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending)) - break; + new_distance = 0; /* for far > 1 always use the lowest address */ - if (geo->far_copies > 1) + else if (geo->far_copies > 1) new_distance = r10_bio->devs[slot].addr; else new_distance = abs(r10_bio->devs[slot].addr - @@ -833,7 +860,7 @@ static int raid10_congested(struct mddev *mddev, int bits) if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= bdi_congested(q->backing_dev_info, bits); } } rcu_read_unlock(); @@ -859,9 +886,14 @@ static void flush_pending_writes(struct r10conf *conf) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; + struct md_rdev *rdev = (void*)bio->bi_bdev; bio->bi_next = NULL; - if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + bio->bi_bdev = rdev->bdev; + if (test_bit(Faulty, &rdev->flags)) { + bio->bi_error = -EIO; + bio_endio(bio); + } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) /* Just ignore it */ bio_endio(bio); else @@ -937,6 +969,7 @@ static void wait_barrier(struct r10conf *conf) * that queue to get the nr_pending * count down. */ + raid10_log(conf->mddev, "wait barrier"); wait_event_lock_irq(conf->wait_barrier, !conf->barrier || (atomic_read(&conf->nr_pending) && @@ -1037,9 +1070,14 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; + struct md_rdev *rdev = (void*)bio->bi_bdev; bio->bi_next = NULL; - if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + bio->bi_bdev = rdev->bdev; + if (test_bit(Faulty, &rdev->flags)) { + bio->bi_error = -EIO; + bio_endio(bio); + } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) /* Just ignore it */ bio_endio(bio); else @@ -1049,23 +1087,122 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) kfree(plug); } -static void __make_request(struct mddev *mddev, struct bio *bio) +static void raid10_read_request(struct mddev *mddev, struct bio *bio, + struct r10bio *r10_bio) { struct r10conf *conf = mddev->private; - struct r10bio *r10_bio; struct bio *read_bio; + const int op = bio_op(bio); + const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); + int sectors_handled; + int max_sectors; + sector_t sectors; + struct md_rdev *rdev; + int slot; + + /* + * Register the new request and wait if the reconstruction + * thread has put up a bar for new requests. + * Continue immediately if no resync is active currently. + */ + wait_barrier(conf); + + sectors = bio_sectors(bio); + while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + bio->bi_iter.bi_sector < conf->reshape_progress && + bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { + /* + * IO spans the reshape position. Need to wait for reshape to + * pass + */ + raid10_log(conf->mddev, "wait reshape"); + allow_barrier(conf); + wait_event(conf->wait_barrier, + conf->reshape_progress <= bio->bi_iter.bi_sector || + conf->reshape_progress >= bio->bi_iter.bi_sector + + sectors); + wait_barrier(conf); + } + +read_again: + rdev = read_balance(conf, r10_bio, &max_sectors); + if (!rdev) { + raid_end_bio_io(r10_bio); + return; + } + slot = r10_bio->read_slot; + + read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); + bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector, + max_sectors); + + r10_bio->devs[slot].bio = read_bio; + r10_bio->devs[slot].rdev = rdev; + + read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + + choose_data_offset(r10_bio, rdev); + read_bio->bi_bdev = rdev->bdev; + read_bio->bi_end_io = raid10_end_read_request; + bio_set_op_attrs(read_bio, op, do_sync); + if (test_bit(FailFast, &rdev->flags) && + test_bit(R10BIO_FailFast, &r10_bio->state)) + read_bio->bi_opf |= MD_FAILFAST; + read_bio->bi_private = r10_bio; + + if (mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev), + read_bio, disk_devt(mddev->gendisk), + r10_bio->sector); + if (max_sectors < r10_bio->sectors) { + /* + * Could not read all from this device, so we will need another + * r10_bio. + */ + sectors_handled = (r10_bio->sector + max_sectors + - bio->bi_iter.bi_sector); + r10_bio->sectors = max_sectors; + spin_lock_irq(&conf->device_lock); + if (bio->bi_phys_segments == 0) + bio->bi_phys_segments = 2; + else + bio->bi_phys_segments++; + spin_unlock_irq(&conf->device_lock); + /* + * Cannot call generic_make_request directly as that will be + * queued in __generic_make_request and subsequent + * mempool_alloc might block waiting for it. so hand bio over + * to raid10d. + */ + reschedule_retry(r10_bio); + + r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); + + r10_bio->master_bio = bio; + r10_bio->sectors = bio_sectors(bio) - sectors_handled; + r10_bio->state = 0; + r10_bio->mddev = mddev; + r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled; + goto read_again; + } else + generic_make_request(read_bio); + return; +} + +static void raid10_write_request(struct mddev *mddev, struct bio *bio, + struct r10bio *r10_bio) +{ + struct r10conf *conf = mddev->private; int i; const int op = bio_op(bio); - const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); const unsigned long do_fua = (bio->bi_opf & REQ_FUA); unsigned long flags; struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; struct raid10_plug_cb *plug = NULL; + sector_t sectors; int sectors_handled; int max_sectors; - int sectors; md_write_start(mddev, bio); @@ -1080,9 +1217,11 @@ static void __make_request(struct mddev *mddev, struct bio *bio) while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && bio->bi_iter.bi_sector < conf->reshape_progress && bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { - /* IO spans the reshape position. Need to wait for - * reshape to pass + /* + * IO spans the reshape position. Need to wait for reshape to + * pass */ + raid10_log(conf->mddev, "wait reshape"); allow_barrier(conf); wait_event(conf->wait_barrier, conf->reshape_progress <= bio->bi_iter.bi_sector || @@ -1090,8 +1229,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio) sectors); wait_barrier(conf); } + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - bio_data_dir(bio) == WRITE && (mddev->reshape_backwards ? (bio->bi_iter.bi_sector < conf->reshape_safe && bio->bi_iter.bi_sector + sectors > conf->reshape_progress) @@ -1099,102 +1238,19 @@ static void __make_request(struct mddev *mddev, struct bio *bio) bio->bi_iter.bi_sector < conf->reshape_progress))) { /* Need to update reshape_position in metadata */ mddev->reshape_position = conf->reshape_progress; - set_mask_bits(&mddev->flags, 0, - BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); md_wakeup_thread(mddev->thread); + raid10_log(conf->mddev, "wait reshape metadata"); wait_event(mddev->sb_wait, - !test_bit(MD_CHANGE_PENDING, &mddev->flags)); + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); conf->reshape_safe = mddev->reshape_position; } - r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); - - r10_bio->master_bio = bio; - r10_bio->sectors = sectors; - - r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_iter.bi_sector; - r10_bio->state = 0; - - /* We might need to issue multiple reads to different - * devices if there are bad blocks around, so we keep - * track of the number of reads in bio->bi_phys_segments. - * If this is 0, there is only one r10_bio and no locking - * will be needed when the request completes. If it is - * non-zero, then it is the number of not-completed requests. - */ - bio->bi_phys_segments = 0; - bio_clear_flag(bio, BIO_SEG_VALID); - - if (rw == READ) { - /* - * read balancing logic: - */ - struct md_rdev *rdev; - int slot; - -read_again: - rdev = read_balance(conf, r10_bio, &max_sectors); - if (!rdev) { - raid_end_bio_io(r10_bio); - return; - } - slot = r10_bio->read_slot; - - read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector, - max_sectors); - - r10_bio->devs[slot].bio = read_bio; - r10_bio->devs[slot].rdev = rdev; - - read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + - choose_data_offset(r10_bio, rdev); - read_bio->bi_bdev = rdev->bdev; - read_bio->bi_end_io = raid10_end_read_request; - bio_set_op_attrs(read_bio, op, do_sync); - read_bio->bi_private = r10_bio; - - if (max_sectors < r10_bio->sectors) { - /* Could not read all from this device, so we will - * need another r10_bio. - */ - sectors_handled = (r10_bio->sector + max_sectors - - bio->bi_iter.bi_sector); - r10_bio->sectors = max_sectors; - spin_lock_irq(&conf->device_lock); - if (bio->bi_phys_segments == 0) - bio->bi_phys_segments = 2; - else - bio->bi_phys_segments++; - spin_unlock_irq(&conf->device_lock); - /* Cannot call generic_make_request directly - * as that will be queued in __generic_make_request - * and subsequent mempool_alloc might block - * waiting for it. so hand bio over to raid10d. - */ - reschedule_retry(r10_bio); - - r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); - - r10_bio->master_bio = bio; - r10_bio->sectors = bio_sectors(bio) - sectors_handled; - r10_bio->state = 0; - r10_bio->mddev = mddev; - r10_bio->sector = bio->bi_iter.bi_sector + - sectors_handled; - goto read_again; - } else - generic_make_request(read_bio); - return; - } - - /* - * WRITE: - */ if (conf->pending_count >= max_queued_requests) { md_wakeup_thread(mddev->thread); + raid10_log(mddev, "wait queued"); wait_event(conf->wait_barrier, conf->pending_count < max_queued_requests); } @@ -1252,8 +1308,7 @@ retry_write: int bad_sectors; int is_bad; - is_bad = is_badblock(rdev, dev_sector, - max_sectors, + is_bad = is_badblock(rdev, dev_sector, max_sectors, &first_bad, &bad_sectors); if (is_bad < 0) { /* Mustn't write here until the bad block @@ -1322,6 +1377,7 @@ retry_write: } } allow_barrier(conf); + raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); md_wait_for_blocked_rdev(blocked_rdev, mddev); wait_barrier(conf); goto retry_write; @@ -1350,19 +1406,28 @@ retry_write: int d = r10_bio->devs[i].devnum; if (r10_bio->devs[i].bio) { struct md_rdev *rdev = conf->mirrors[d].rdev; - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); r10_bio->devs[i].bio = mbio; mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+ - choose_data_offset(r10_bio, - rdev)); + choose_data_offset(r10_bio, rdev)); mbio->bi_bdev = rdev->bdev; mbio->bi_end_io = raid10_end_write_request; bio_set_op_attrs(mbio, op, do_sync | do_fua); + if (test_bit(FailFast, &conf->mirrors[d].rdev->flags) && + enough(conf, d)) + mbio->bi_opf |= MD_FAILFAST; mbio->bi_private = r10_bio; + if (conf->mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), + mbio, disk_devt(conf->mddev->gendisk), + r10_bio->sector); + /* flush_pending_writes() needs access to the rdev so...*/ + mbio->bi_bdev = (void*)rdev; + atomic_inc(&r10_bio->remaining); cb = blk_check_plugged(raid10_unplug, mddev, @@ -1392,19 +1457,25 @@ retry_write: smp_mb(); rdev = conf->mirrors[d].rdev; } - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); r10_bio->devs[i].repl_bio = mbio; mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr + - choose_data_offset( - r10_bio, rdev)); + choose_data_offset(r10_bio, rdev)); mbio->bi_bdev = rdev->bdev; mbio->bi_end_io = raid10_end_write_request; bio_set_op_attrs(mbio, op, do_sync | do_fua); mbio->bi_private = r10_bio; + if (conf->mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev), + mbio, disk_devt(conf->mddev->gendisk), + r10_bio->sector); + /* flush_pending_writes() needs access to the rdev so...*/ + mbio->bi_bdev = (void*)rdev; + atomic_inc(&r10_bio->remaining); spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); @@ -1437,6 +1508,36 @@ retry_write: one_write_done(r10_bio); } +static void __make_request(struct mddev *mddev, struct bio *bio) +{ + struct r10conf *conf = mddev->private; + struct r10bio *r10_bio; + + r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); + + r10_bio->master_bio = bio; + r10_bio->sectors = bio_sectors(bio); + + r10_bio->mddev = mddev; + r10_bio->sector = bio->bi_iter.bi_sector; + r10_bio->state = 0; + + /* + * We might need to issue multiple reads to different devices if there + * are bad blocks around, so we keep track of the number of reads in + * bio->bi_phys_segments. If this is 0, there is only one r10_bio and + * no locking will be needed when the request completes. If it is + * non-zero, then it is the number of not-completed requests. + */ + bio->bi_phys_segments = 0; + bio_clear_flag(bio, BIO_SEG_VALID); + + if (bio_data_dir(bio) == READ) + raid10_read_request(mddev, bio, r10_bio); + else + raid10_write_request(mddev, bio, r10_bio); +} + static void raid10_make_request(struct mddev *mddev, struct bio *bio) { struct r10conf *conf = mddev->private; @@ -1586,14 +1687,13 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); - set_mask_bits(&mddev->flags, 0, - BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); spin_unlock_irqrestore(&conf->device_lock, flags); - printk(KERN_ALERT - "md/raid10:%s: Disk failure on %s, disabling device.\n" - "md/raid10:%s: Operation continuing on %d devices.\n", - mdname(mddev), bdevname(rdev->bdev, b), - mdname(mddev), conf->geo.raid_disks - mddev->degraded); + pr_crit("md/raid10:%s: Disk failure on %s, disabling device.\n" + "md/raid10:%s: Operation continuing on %d devices.\n", + mdname(mddev), bdevname(rdev->bdev, b), + mdname(mddev), conf->geo.raid_disks - mddev->degraded); } static void print_conf(struct r10conf *conf) @@ -1601,13 +1701,13 @@ static void print_conf(struct r10conf *conf) int i; struct md_rdev *rdev; - printk(KERN_DEBUG "RAID10 conf printout:\n"); + pr_debug("RAID10 conf printout:\n"); if (!conf) { - printk(KERN_DEBUG "(!conf)\n"); + pr_debug("(!conf)\n"); return; } - printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, - conf->geo.raid_disks); + pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, + conf->geo.raid_disks); /* This is only called with ->reconfix_mutex held, so * rcu protection of rdev is not needed */ @@ -1615,10 +1715,10 @@ static void print_conf(struct r10conf *conf) char b[BDEVNAME_SIZE]; rdev = conf->mirrors[i].rdev; if (rdev) - printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", - i, !test_bit(In_sync, &rdev->flags), - !test_bit(Faulty, &rdev->flags), - bdevname(rdev->bdev,b)); + pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n", + i, !test_bit(In_sync, &rdev->flags), + !test_bit(Faulty, &rdev->flags), + bdevname(rdev->bdev,b)); } } @@ -1953,6 +2053,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) /* now find blocks with errors */ for (i=0 ; i < conf->copies ; i++) { int j, d; + struct md_rdev *rdev; tbio = r10_bio->devs[i].bio; @@ -1960,6 +2061,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) continue; if (i == first) continue; + d = r10_bio->devs[i].devnum; + rdev = conf->mirrors[d].rdev; if (!r10_bio->devs[i].bio->bi_error) { /* We know that the bi_io_vec layout is the same for * both 'first' and 'i', so we just compare them. @@ -1982,6 +2085,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) /* Don't fix anything. */ continue; + } else if (test_bit(FailFast, &rdev->flags)) { + /* Just give up on this device */ + md_error(rdev->mddev, rdev); + continue; } /* Ok, we need to write this bio, either to correct an * inconsistency or to correct an unreadable block. @@ -1999,11 +2106,12 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) bio_copy_data(tbio, fbio); - d = r10_bio->devs[i].devnum; atomic_inc(&conf->mirrors[d].rdev->nr_pending); atomic_inc(&r10_bio->remaining); md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); + if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) + tbio->bi_opf |= MD_FAILFAST; tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; tbio->bi_bdev = conf->mirrors[d].rdev->bdev; generic_make_request(tbio); @@ -2109,10 +2217,8 @@ static void fix_recovery_read_error(struct r10bio *r10_bio) ok = rdev_set_badblocks(rdev2, addr, s, 0); if (!ok) { /* just abort the recovery */ - printk(KERN_NOTICE - "md/raid10:%s: recovery aborted" - " due to read error\n", - mdname(mddev)); + pr_notice("md/raid10:%s: recovery aborted due to read error\n", + mdname(mddev)); conf->mirrors[dw].recovery_disabled = mddev->recovery_disabled; @@ -2259,14 +2365,11 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 char b[BDEVNAME_SIZE]; bdevname(rdev->bdev, b); - printk(KERN_NOTICE - "md/raid10:%s: %s: Raid device exceeded " - "read_error threshold [cur %d:max %d]\n", - mdname(mddev), b, - atomic_read(&rdev->read_errors), max_read_errors); - printk(KERN_NOTICE - "md/raid10:%s: %s: Failing raid device\n", - mdname(mddev), b); + pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %d:max %d]\n", + mdname(mddev), b, + atomic_read(&rdev->read_errors), max_read_errors); + pr_notice("md/raid10:%s: %s: Failing raid device\n", + mdname(mddev), b); md_error(mddev, rdev); r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; return; @@ -2356,20 +2459,16 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 s, conf->tmppage, WRITE) == 0) { /* Well, this device is dead */ - printk(KERN_NOTICE - "md/raid10:%s: read correction " - "write failed" - " (%d sectors at %llu on %s)\n", - mdname(mddev), s, - (unsigned long long)( - sect + - choose_data_offset(r10_bio, - rdev)), - bdevname(rdev->bdev, b)); - printk(KERN_NOTICE "md/raid10:%s: %s: failing " - "drive\n", - mdname(mddev), - bdevname(rdev->bdev, b)); + pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)( + sect + + choose_data_offset(r10_bio, + rdev)), + bdevname(rdev->bdev, b)); + pr_notice("md/raid10:%s: %s: failing drive\n", + mdname(mddev), + bdevname(rdev->bdev, b)); } rdev_dec_pending(rdev, mddev); rcu_read_lock(); @@ -2397,24 +2496,18 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 READ)) { case 0: /* Well, this device is dead */ - printk(KERN_NOTICE - "md/raid10:%s: unable to read back " - "corrected sectors" - " (%d sectors at %llu on %s)\n", + pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %s)\n", mdname(mddev), s, (unsigned long long)( sect + choose_data_offset(r10_bio, rdev)), bdevname(rdev->bdev, b)); - printk(KERN_NOTICE "md/raid10:%s: %s: failing " - "drive\n", + pr_notice("md/raid10:%s: %s: failing drive\n", mdname(mddev), bdevname(rdev->bdev, b)); break; case 1: - printk(KERN_INFO - "md/raid10:%s: read error corrected" - " (%d sectors at %llu on %s)\n", + pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %s)\n", mdname(mddev), s, (unsigned long long)( sect + @@ -2472,7 +2565,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) if (sectors > sect_to_write) sectors = sect_to_write; /* Write at 'sector' for 'sectors' */ - wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + wbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors); wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector); wbio->bi_iter.bi_sector = wsector + @@ -2503,6 +2596,8 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) char b[BDEVNAME_SIZE]; unsigned long do_sync; int max_sectors; + dev_t bio_dev; + sector_t bio_last_sector; /* we got a read error. Maybe the drive is bad. Maybe just * the block and we can fix it. @@ -2514,40 +2609,39 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) */ bio = r10_bio->devs[slot].bio; bdevname(bio->bi_bdev, b); + bio_dev = bio->bi_bdev->bd_dev; + bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors; bio_put(bio); r10_bio->devs[slot].bio = NULL; - if (mddev->ro == 0) { + if (mddev->ro) + r10_bio->devs[slot].bio = IO_BLOCKED; + else if (!test_bit(FailFast, &rdev->flags)) { freeze_array(conf, 1); fix_read_error(conf, mddev, r10_bio); unfreeze_array(conf); } else - r10_bio->devs[slot].bio = IO_BLOCKED; + md_error(mddev, rdev); rdev_dec_pending(rdev, mddev); read_more: rdev = read_balance(conf, r10_bio, &max_sectors); if (rdev == NULL) { - printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" - " read error for block %llu\n", - mdname(mddev), b, - (unsigned long long)r10_bio->sector); + pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n", + mdname(mddev), b, + (unsigned long long)r10_bio->sector); raid_end_bio_io(r10_bio); return; } do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC); slot = r10_bio->read_slot; - printk_ratelimited( - KERN_ERR - "md/raid10:%s: %s: redirecting " - "sector %llu to another mirror\n", - mdname(mddev), - bdevname(rdev->bdev, b), - (unsigned long long)r10_bio->sector); - bio = bio_clone_mddev(r10_bio->master_bio, - GFP_NOIO, mddev); + pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n", + mdname(mddev), + bdevname(rdev->bdev, b), + (unsigned long long)r10_bio->sector); + bio = bio_clone_fast(r10_bio->master_bio, GFP_NOIO, mddev->bio_set); bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); r10_bio->devs[slot].bio = bio; r10_bio->devs[slot].rdev = rdev; @@ -2555,8 +2649,15 @@ read_more: + choose_data_offset(r10_bio, rdev); bio->bi_bdev = rdev->bdev; bio_set_op_attrs(bio, REQ_OP_READ, do_sync); + if (test_bit(FailFast, &rdev->flags) && + test_bit(R10BIO_FailFast, &r10_bio->state)) + bio->bi_opf |= MD_FAILFAST; bio->bi_private = r10_bio; bio->bi_end_io = raid10_end_read_request; + trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), + bio, bio_dev, + bio_last_sector - r10_bio->sectors); + if (max_sectors < r10_bio->sectors) { /* Drat - have to split this up more */ struct bio *mbio = r10_bio->master_bio; @@ -2694,10 +2795,10 @@ static void raid10d(struct md_thread *thread) md_check_recovery(mddev); if (!list_empty_careful(&conf->bio_end_io_list) && - !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { LIST_HEAD(tmp); spin_lock_irqsave(&conf->device_lock, flags); - if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { while (!list_empty(&conf->bio_end_io_list)) { list_move(conf->bio_end_io_list.prev, &tmp); conf->nr_queued--; @@ -2755,7 +2856,7 @@ static void raid10d(struct md_thread *thread) } cond_resched(); - if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) + if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING)) md_check_recovery(mddev); } blk_finish_plug(&plug); @@ -3072,6 +3173,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_private = r10_bio; bio->bi_end_io = end_sync_read; bio_set_op_attrs(bio, REQ_OP_READ, 0); + if (test_bit(FailFast, &rdev->flags)) + bio->bi_opf |= MD_FAILFAST; from_addr = r10_bio->devs[j].addr; bio->bi_iter.bi_sector = from_addr + rdev->data_offset; @@ -3160,8 +3263,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (!any_working) { if (!test_and_set_bit(MD_RECOVERY_INTR, &mddev->recovery)) - printk(KERN_INFO "md/raid10:%s: insufficient " - "working devices for recovery.\n", + pr_warn("md/raid10:%s: insufficient working devices for recovery.\n", mdname(mddev)); mirror->recovery_disabled = mddev->recovery_disabled; @@ -3178,6 +3280,23 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, rdev_dec_pending(mrdev, mddev); if (mreplace) rdev_dec_pending(mreplace, mddev); + if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) { + /* Only want this if there is elsewhere to + * read from. 'j' is currently the first + * readable copy. + */ + int targets = 1; + for (; j < conf->copies; j++) { + int d = r10_bio->devs[j].devnum; + if (conf->mirrors[d].rdev && + test_bit(In_sync, + &conf->mirrors[d].rdev->flags)) + targets++; + } + if (targets == 1) + r10_bio->devs[0].bio->bi_opf + &= ~MD_FAILFAST; + } } if (biolist == NULL) { while (r10_bio) { @@ -3256,6 +3375,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_private = r10_bio; bio->bi_end_io = end_sync_read; bio_set_op_attrs(bio, REQ_OP_READ, 0); + if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) + bio->bi_opf |= MD_FAILFAST; bio->bi_iter.bi_sector = sector + rdev->data_offset; bio->bi_bdev = rdev->bdev; count++; @@ -3279,6 +3400,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio->bi_private = r10_bio; bio->bi_end_io = end_sync_write; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) + bio->bi_opf |= MD_FAILFAST; bio->bi_iter.bi_sector = sector + rdev->data_offset; bio->bi_bdev = rdev->bdev; count++; @@ -3489,15 +3612,14 @@ static struct r10conf *setup_conf(struct mddev *mddev) copies = setup_geo(&geo, mddev, geo_new); if (copies == -2) { - printk(KERN_ERR "md/raid10:%s: chunk size must be " - "at least PAGE_SIZE(%ld) and be a power of 2.\n", - mdname(mddev), PAGE_SIZE); + pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n", + mdname(mddev), PAGE_SIZE); goto out; } if (copies < 2 || copies > mddev->raid_disks) { - printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", - mdname(mddev), mddev->new_layout); + pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n", + mdname(mddev), mddev->new_layout); goto out; } @@ -3557,9 +3679,6 @@ static struct r10conf *setup_conf(struct mddev *mddev) return conf; out: - if (err == -ENOMEM) - printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", - mdname(mddev)); if (conf) { mempool_destroy(conf->r10bio_pool); kfree(conf->mirrors); @@ -3656,7 +3775,7 @@ static int raid10_run(struct mddev *mddev) } /* need to check that every block has at least one working mirror */ if (!enough(conf, -1)) { - printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", + pr_err("md/raid10:%s: not enough operational mirrors.\n", mdname(mddev)); goto out_free_conf; } @@ -3698,11 +3817,9 @@ static int raid10_run(struct mddev *mddev) } if (mddev->recovery_cp != MaxSector) - printk(KERN_NOTICE "md/raid10:%s: not clean" - " -- starting background reconstruction\n", - mdname(mddev)); - printk(KERN_INFO - "md/raid10:%s: active with %d out of %d devices\n", + pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n", + mdname(mddev)); + pr_info("md/raid10:%s: active with %d out of %d devices\n", mdname(mddev), conf->geo.raid_disks - mddev->degraded, conf->geo.raid_disks); /* @@ -3712,6 +3829,7 @@ static int raid10_run(struct mddev *mddev) size = raid10_size(mddev, 0, 0); md_set_array_sectors(mddev, size); mddev->resync_max_sectors = size; + set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); if (mddev->queue) { int stripe = conf->geo.raid_disks * @@ -3722,8 +3840,8 @@ static int raid10_run(struct mddev *mddev) * maybe... */ stripe /= conf->geo.near_copies; - if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) - mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) + mddev->queue->backing_dev_info->ra_pages = 2 * stripe; } if (md_integrity_register(mddev)) @@ -3739,7 +3857,7 @@ static int raid10_run(struct mddev *mddev) if (max(before_length, after_length) > min_offset_diff) { /* This cannot work */ - printk("md/raid10: offset difference not enough to continue reshape\n"); + pr_warn("md/raid10: offset difference not enough to continue reshape\n"); goto out_free_conf; } conf->offset_diff = min_offset_diff; @@ -3846,8 +3964,8 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) struct r10conf *conf; if (mddev->degraded > 0) { - printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n", - mdname(mddev)); + pr_warn("md/raid10:%s: Error: degraded raid0!\n", + mdname(mddev)); return ERR_PTR(-EINVAL); } sector_div(size, devs); @@ -3887,9 +4005,8 @@ static void *raid10_takeover(struct mddev *mddev) /* for raid0 takeover only one zone is supported */ raid0_conf = mddev->private; if (raid0_conf->nr_strip_zones > 1) { - printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0" - " with more than one zone.\n", - mdname(mddev)); + pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n", + mdname(mddev)); return ERR_PTR(-EINVAL); } return raid10_takeover_raid0(mddev, @@ -4078,8 +4195,8 @@ static int raid10_start_reshape(struct mddev *mddev) sector_t size = raid10_size(mddev, 0, 0); if (size < mddev->array_sectors) { spin_unlock_irq(&conf->device_lock); - printk(KERN_ERR "md/raid10:%s: array size must be reduce before number of disks\n", - mdname(mddev)); + pr_warn("md/raid10:%s: array size must be reduce before number of disks\n", + mdname(mddev)); return -EINVAL; } mddev->resync_max_sectors = size; @@ -4126,7 +4243,7 @@ static int raid10_start_reshape(struct mddev *mddev) spin_unlock_irq(&conf->device_lock); mddev->raid_disks = conf->geo.raid_disks; mddev->reshape_position = conf->reshape_progress; - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); @@ -4321,9 +4438,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, else mddev->curr_resync_completed = conf->reshape_progress; conf->reshape_checkpoint = jiffies; - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); - wait_event(mddev->sb_wait, mddev->flags == 0 || + wait_event(mddev->sb_wait, mddev->sb_flags == 0 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { allow_barrier(conf); @@ -4525,8 +4642,8 @@ static void end_reshape(struct r10conf *conf) int stripe = conf->geo.raid_disks * ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE); stripe /= conf->geo.near_copies; - if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) - conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) + conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; } conf->fullsync = 0; } diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 18ec1f7a98bf..3162615e57bd 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -156,5 +156,7 @@ enum r10bio_state { * flag is set */ R10BIO_Previous, +/* failfast devices did receive failfast requests. */ + R10BIO_FailFast, }; #endif diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index a227a9f3ee65..3f307be01b10 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1,5 +1,6 @@ /* * Copyright (C) 2015 Shaohua Li <shli@fb.com> + * Copyright (C) 2016 Song Liu <songliubraving@fb.com> * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -18,8 +19,11 @@ #include <linux/raid/md_p.h> #include <linux/crc32c.h> #include <linux/random.h> +#include <linux/kthread.h> +#include <linux/types.h> #include "md.h" #include "raid5.h" +#include "bitmap.h" /* * metadata/data stored in disk with 4k size unit (a block) regardless @@ -28,18 +32,70 @@ #define BLOCK_SECTORS (8) /* - * reclaim runs every 1/4 disk size or 10G reclaimable space. This can prevent - * recovery scans a very long log + * log->max_free_space is min(1/4 disk size, 10G reclaimable space). + * + * In write through mode, the reclaim runs every log->max_free_space. + * This can prevent the recovery scans for too long */ #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */ #define RECLAIM_MAX_FREE_SPACE_SHIFT (2) +/* wake up reclaim thread periodically */ +#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ) +/* start flush with these full stripes */ +#define R5C_FULL_STRIPE_FLUSH_BATCH 256 +/* reclaim stripes in groups */ +#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2) + /* * We only need 2 bios per I/O unit to make progress, but ensure we * have a few more available to not get too tight. */ #define R5L_POOL_SIZE 4 +/* + * r5c journal modes of the array: write-back or write-through. + * write-through mode has identical behavior as existing log only + * implementation. + */ +enum r5c_journal_mode { + R5C_JOURNAL_MODE_WRITE_THROUGH = 0, + R5C_JOURNAL_MODE_WRITE_BACK = 1, +}; + +static char *r5c_journal_mode_str[] = {"write-through", + "write-back"}; +/* + * raid5 cache state machine + * + * With the RAID cache, each stripe works in two phases: + * - caching phase + * - writing-out phase + * + * These two phases are controlled by bit STRIPE_R5C_CACHING: + * if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase + * if STRIPE_R5C_CACHING == 1, the stripe is in caching phase + * + * When there is no journal, or the journal is in write-through mode, + * the stripe is always in writing-out phase. + * + * For write-back journal, the stripe is sent to caching phase on write + * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off + * the write-out phase by clearing STRIPE_R5C_CACHING. + * + * Stripes in caching phase do not write the raid disks. Instead, all + * writes are committed from the log device. Therefore, a stripe in + * caching phase handles writes as: + * - write to log device + * - return IO + * + * Stripes in writing-out phase handle writes as: + * - calculate parity + * - write pending data and parity to journal + * - write data and parity to raid disks + * - return IO for pending writes + */ + struct r5l_log { struct md_rdev *rdev; @@ -58,7 +114,6 @@ struct r5l_log { u64 seq; /* log head sequence */ sector_t next_checkpoint; - u64 next_cp_seq; struct mutex io_mutex; struct r5l_io_unit *current_io; /* current io_unit accepting new data */ @@ -96,9 +151,74 @@ struct r5l_log { spinlock_t no_space_stripes_lock; bool need_cache_flush; + + /* for r5c_cache */ + enum r5c_journal_mode r5c_journal_mode; + + /* all stripes in r5cache, in the order of seq at sh->log_start */ + struct list_head stripe_in_journal_list; + + spinlock_t stripe_in_journal_lock; + atomic_t stripe_in_journal_count; + + /* to submit async io_units, to fulfill ordering of flush */ + struct work_struct deferred_io_work; + /* to disable write back during in degraded mode */ + struct work_struct disable_writeback_work; + + /* to for chunk_aligned_read in writeback mode, details below */ + spinlock_t tree_lock; + struct radix_tree_root big_stripe_tree; }; /* + * Enable chunk_aligned_read() with write back cache. + * + * Each chunk may contain more than one stripe (for example, a 256kB + * chunk contains 64 4kB-page, so this chunk contain 64 stripes). For + * chunk_aligned_read, these stripes are grouped into one "big_stripe". + * For each big_stripe, we count how many stripes of this big_stripe + * are in the write back cache. These data are tracked in a radix tree + * (big_stripe_tree). We use radix_tree item pointer as the counter. + * r5c_tree_index() is used to calculate keys for the radix tree. + * + * chunk_aligned_read() calls r5c_big_stripe_cached() to look up + * big_stripe of each chunk in the tree. If this big_stripe is in the + * tree, chunk_aligned_read() aborts. This look up is protected by + * rcu_read_lock(). + * + * It is necessary to remember whether a stripe is counted in + * big_stripe_tree. Instead of adding new flag, we reuses existing flags: + * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these + * two flags are set, the stripe is counted in big_stripe_tree. This + * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to + * r5c_try_caching_write(); and moving clear_bit of + * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to + * r5c_finish_stripe_write_out(). + */ + +/* + * radix tree requests lowest 2 bits of data pointer to be 2b'00. + * So it is necessary to left shift the counter by 2 bits before using it + * as data pointer of the tree. + */ +#define R5C_RADIX_COUNT_SHIFT 2 + +/* + * calculate key for big_stripe_tree + * + * sect: align_bi->bi_iter.bi_sector or sh->sector + */ +static inline sector_t r5c_tree_index(struct r5conf *conf, + sector_t sect) +{ + sector_t offset; + + offset = sector_div(sect, conf->chunk_sectors); + return sect; +} + +/* * an IO range starts from a meta data block and end at the next meta data * block. The io unit's the meta data block tracks data/parity followed it. io * unit is written to log disk with normal write, as we always flush log disk @@ -122,6 +242,18 @@ struct r5l_io_unit { int state; bool need_split_bio; + struct bio *split_bio; + + unsigned int has_flush:1; /* include flush request */ + unsigned int has_fua:1; /* include fua request */ + unsigned int has_null_flush:1; /* include empty flush request */ + /* + * io isn't sent yet, flush/fua request can only be submitted till it's + * the first IO in running_ios list + */ + unsigned int io_deferred:1; + + struct bio_list flush_barriers; /* size == 0 flush bios */ }; /* r5l_io_unit state */ @@ -133,6 +265,12 @@ enum r5l_io_unit_state { IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ }; +bool r5c_is_writeback(struct r5l_log *log) +{ + return (log != NULL && + log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK); +} + static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) { start += inc; @@ -168,12 +306,239 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io, io->state = state; } +static void +r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev, + struct bio_list *return_bi) +{ + struct bio *wbi, *wbi2; + + wbi = dev->written; + dev->written = NULL; + while (wbi && wbi->bi_iter.bi_sector < + dev->sector + STRIPE_SECTORS) { + wbi2 = r5_next_bio(wbi, dev->sector); + if (!raid5_dec_bi_active_stripes(wbi)) { + md_write_end(conf->mddev); + bio_list_add(return_bi, wbi); + } + wbi = wbi2; + } +} + +void r5c_handle_cached_data_endio(struct r5conf *conf, + struct stripe_head *sh, int disks, struct bio_list *return_bi) +{ + int i; + + for (i = sh->disks; i--; ) { + if (sh->dev[i].written) { + set_bit(R5_UPTODATE, &sh->dev[i].flags); + r5c_return_dev_pending_writes(conf, &sh->dev[i], + return_bi); + bitmap_endwrite(conf->mddev->bitmap, sh->sector, + STRIPE_SECTORS, + !test_bit(STRIPE_DEGRADED, &sh->state), + 0); + } + } +} + +/* Check whether we should flush some stripes to free up stripe cache */ +void r5c_check_stripe_cache_usage(struct r5conf *conf) +{ + int total_cached; + + if (!r5c_is_writeback(conf->log)) + return; + + total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + + atomic_read(&conf->r5c_cached_full_stripes); + + /* + * The following condition is true for either of the following: + * - stripe cache pressure high: + * total_cached > 3/4 min_nr_stripes || + * empty_inactive_list_nr > 0 + * - stripe cache pressure moderate: + * total_cached > 1/2 min_nr_stripes + */ + if (total_cached > conf->min_nr_stripes * 1 / 2 || + atomic_read(&conf->empty_inactive_list_nr) > 0) + r5l_wake_reclaim(conf->log, 0); +} + +/* + * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full + * stripes in the cache + */ +void r5c_check_cached_full_stripe(struct r5conf *conf) +{ + if (!r5c_is_writeback(conf->log)) + return; + + /* + * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes + * or a full stripe (chunk size / 4k stripes). + */ + if (atomic_read(&conf->r5c_cached_full_stripes) >= + min(R5C_FULL_STRIPE_FLUSH_BATCH, + conf->chunk_sectors >> STRIPE_SHIFT)) + r5l_wake_reclaim(conf->log, 0); +} + +/* + * Total log space (in sectors) needed to flush all data in cache + * + * To avoid deadlock due to log space, it is necessary to reserve log + * space to flush critical stripes (stripes that occupying log space near + * last_checkpoint). This function helps check how much log space is + * required to flush all cached stripes. + * + * To reduce log space requirements, two mechanisms are used to give cache + * flush higher priorities: + * 1. In handle_stripe_dirtying() and schedule_reconstruction(), + * stripes ALREADY in journal can be flushed w/o pending writes; + * 2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal + * can be delayed (r5l_add_no_space_stripe). + * + * In cache flush, the stripe goes through 1 and then 2. For a stripe that + * already passed 1, flushing it requires at most (conf->max_degraded + 1) + * pages of journal space. For stripes that has not passed 1, flushing it + * requires (conf->raid_disks + 1) pages of journal space. There are at + * most (conf->group_cnt + 1) stripe that passed 1. So total journal space + * required to flush all cached stripes (in pages) is: + * + * (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) + + * (group_cnt + 1) * (raid_disks + 1) + * or + * (stripe_in_journal_count) * (max_degraded + 1) + + * (group_cnt + 1) * (raid_disks - max_degraded) + */ +static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) +{ + struct r5l_log *log = conf->log; + + if (!r5c_is_writeback(log)) + return 0; + + return BLOCK_SECTORS * + ((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) + + (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1)); +} + +/* + * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL + * + * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of + * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log + * device is less than 2x of reclaim_required_space. + */ +static inline void r5c_update_log_state(struct r5l_log *log) +{ + struct r5conf *conf = log->rdev->mddev->private; + sector_t free_space; + sector_t reclaim_space; + bool wake_reclaim = false; + + if (!r5c_is_writeback(log)) + return; + + free_space = r5l_ring_distance(log, log->log_start, + log->last_checkpoint); + reclaim_space = r5c_log_required_to_flush_cache(conf); + if (free_space < 2 * reclaim_space) + set_bit(R5C_LOG_CRITICAL, &conf->cache_state); + else { + if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) + wake_reclaim = true; + clear_bit(R5C_LOG_CRITICAL, &conf->cache_state); + } + if (free_space < 3 * reclaim_space) + set_bit(R5C_LOG_TIGHT, &conf->cache_state); + else + clear_bit(R5C_LOG_TIGHT, &conf->cache_state); + + if (wake_reclaim) + r5l_wake_reclaim(log, 0); +} + +/* + * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING. + * This function should only be called in write-back mode. + */ +void r5c_make_stripe_write_out(struct stripe_head *sh) +{ + struct r5conf *conf = sh->raid_conf; + struct r5l_log *log = conf->log; + + BUG_ON(!r5c_is_writeback(log)); + + WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); + clear_bit(STRIPE_R5C_CACHING, &sh->state); + + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); +} + +static void r5c_handle_data_cached(struct stripe_head *sh) +{ + int i; + + for (i = sh->disks; i--; ) + if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { + set_bit(R5_InJournal, &sh->dev[i].flags); + clear_bit(R5_LOCKED, &sh->dev[i].flags); + } + clear_bit(STRIPE_LOG_TRAPPED, &sh->state); +} + +/* + * this journal write must contain full parity, + * it may also contain some data pages + */ +static void r5c_handle_parity_cached(struct stripe_head *sh) +{ + int i; + + for (i = sh->disks; i--; ) + if (test_bit(R5_InJournal, &sh->dev[i].flags)) + set_bit(R5_Wantwrite, &sh->dev[i].flags); +} + +/* + * Setting proper flags after writing (or flushing) data and/or parity to the + * log device. This is called from r5l_log_endio() or r5l_log_flush_endio(). + */ +static void r5c_finish_cache_stripe(struct stripe_head *sh) +{ + struct r5l_log *log = sh->raid_conf->log; + + if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { + BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); + /* + * Set R5_InJournal for parity dev[pd_idx]. This means + * all data AND parity in the journal. For RAID 6, it is + * NOT necessary to set the flag for dev[qd_idx], as the + * two parities are written out together. + */ + set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); + } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) { + r5c_handle_data_cached(sh); + } else { + r5c_handle_parity_cached(sh); + set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); + } +} + static void r5l_io_run_stripes(struct r5l_io_unit *io) { struct stripe_head *sh, *next; list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { list_del_init(&sh->log_list); + + r5c_finish_cache_stripe(sh); + set_bit(STRIPE_HANDLE, &sh->state); raid5_release_stripe(sh); } @@ -209,9 +574,11 @@ static void r5l_move_to_end_ios(struct r5l_log *log) } } +static void __r5l_stripe_write_finished(struct r5l_io_unit *io); static void r5l_log_endio(struct bio *bio) { struct r5l_io_unit *io = bio->bi_private; + struct r5l_io_unit *io_deferred; struct r5l_log *log = io->log; unsigned long flags; @@ -227,18 +594,104 @@ static void r5l_log_endio(struct bio *bio) r5l_move_to_end_ios(log); else r5l_log_run_stripes(log); + if (!list_empty(&log->running_ios)) { + /* + * FLUSH/FUA io_unit is deferred because of ordering, now we + * can dispatch it + */ + io_deferred = list_first_entry(&log->running_ios, + struct r5l_io_unit, log_sibling); + if (io_deferred->io_deferred) + schedule_work(&log->deferred_io_work); + } + spin_unlock_irqrestore(&log->io_list_lock, flags); if (log->need_cache_flush) md_wakeup_thread(log->rdev->mddev->thread); + + if (io->has_null_flush) { + struct bio *bi; + + WARN_ON(bio_list_empty(&io->flush_barriers)); + while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) { + bio_endio(bi); + atomic_dec(&io->pending_stripe); + } + if (atomic_read(&io->pending_stripe) == 0) + __r5l_stripe_write_finished(io); + } +} + +static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) +{ + unsigned long flags; + + spin_lock_irqsave(&log->io_list_lock, flags); + __r5l_set_io_unit_state(io, IO_UNIT_IO_START); + spin_unlock_irqrestore(&log->io_list_lock, flags); + + if (io->has_flush) + io->current_bio->bi_opf |= REQ_PREFLUSH; + if (io->has_fua) + io->current_bio->bi_opf |= REQ_FUA; + submit_bio(io->current_bio); + + if (!io->split_bio) + return; + + if (io->has_flush) + io->split_bio->bi_opf |= REQ_PREFLUSH; + if (io->has_fua) + io->split_bio->bi_opf |= REQ_FUA; + submit_bio(io->split_bio); +} + +/* deferred io_unit will be dispatched here */ +static void r5l_submit_io_async(struct work_struct *work) +{ + struct r5l_log *log = container_of(work, struct r5l_log, + deferred_io_work); + struct r5l_io_unit *io = NULL; + unsigned long flags; + + spin_lock_irqsave(&log->io_list_lock, flags); + if (!list_empty(&log->running_ios)) { + io = list_first_entry(&log->running_ios, struct r5l_io_unit, + log_sibling); + if (!io->io_deferred) + io = NULL; + else + io->io_deferred = 0; + } + spin_unlock_irqrestore(&log->io_list_lock, flags); + if (io) + r5l_do_submit_io(log, io); +} + +static void r5c_disable_writeback_async(struct work_struct *work) +{ + struct r5l_log *log = container_of(work, struct r5l_log, + disable_writeback_work); + struct mddev *mddev = log->rdev->mddev; + + if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) + return; + pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n", + mdname(mddev)); + mddev_suspend(mddev); + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; + mddev_resume(mddev); } static void r5l_submit_current_io(struct r5l_log *log) { struct r5l_io_unit *io = log->current_io; + struct bio *bio; struct r5l_meta_block *block; unsigned long flags; u32 crc; + bool do_submit = true; if (!io) return; @@ -247,13 +700,20 @@ static void r5l_submit_current_io(struct r5l_log *log) block->meta_size = cpu_to_le32(io->meta_offset); crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); block->checksum = cpu_to_le32(crc); + bio = io->current_bio; log->current_io = NULL; spin_lock_irqsave(&log->io_list_lock, flags); - __r5l_set_io_unit_state(io, IO_UNIT_IO_START); + if (io->has_flush || io->has_fua) { + if (io != list_first_entry(&log->running_ios, + struct r5l_io_unit, log_sibling)) { + io->io_deferred = 1; + do_submit = false; + } + } spin_unlock_irqrestore(&log->io_list_lock, flags); - - submit_bio(io->current_bio); + if (do_submit) + r5l_do_submit_io(log, io); } static struct bio *r5l_bio_alloc(struct r5l_log *log) @@ -271,6 +731,7 @@ static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io) { log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS); + r5c_update_log_state(log); /* * If we filled up the log device start from the beginning again, * which will require a new bio. @@ -297,6 +758,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) io->log = log; INIT_LIST_HEAD(&io->log_sibling); INIT_LIST_HEAD(&io->stripe_list); + bio_list_init(&io->flush_barriers); io->state = IO_UNIT_RUNNING; io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO); @@ -367,12 +829,11 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page) struct r5l_io_unit *io = log->current_io; if (io->need_split_bio) { - struct bio *prev = io->current_bio; - + BUG_ON(io->split_bio); + io->split_bio = io->current_bio; io->current_bio = r5l_bio_alloc(log); - bio_chain(io->current_bio, prev); - - submit_bio(prev); + bio_chain(io->current_bio, io->split_bio); + io->need_split_bio = false; } if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) @@ -401,50 +862,85 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, io = log->current_io; + if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state)) + io->has_flush = 1; + for (i = 0; i < sh->disks; i++) { - if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) + if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || + test_bit(R5_InJournal, &sh->dev[i].flags)) continue; if (i == sh->pd_idx || i == sh->qd_idx) continue; + if (test_bit(R5_WantFUA, &sh->dev[i].flags) && + log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) { + io->has_fua = 1; + /* + * we need to flush journal to make sure recovery can + * reach the data with fua flag + */ + io->has_flush = 1; + } r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, raid5_compute_blocknr(sh, i, 0), sh->dev[i].log_checksum, 0, false); r5l_append_payload_page(log, sh->dev[i].page); } - if (sh->qd_idx >= 0) { + if (parity_pages == 2) { r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, sh->sector, sh->dev[sh->pd_idx].log_checksum, sh->dev[sh->qd_idx].log_checksum, true); r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); - } else { + } else if (parity_pages == 1) { r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, sh->sector, sh->dev[sh->pd_idx].log_checksum, 0, false); r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); - } + } else /* Just writing data, not parity, in caching phase */ + BUG_ON(parity_pages != 0); list_add_tail(&sh->log_list, &io->stripe_list); atomic_inc(&io->pending_stripe); sh->log_io = io; + if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) + return 0; + + if (sh->log_start == MaxSector) { + BUG_ON(!list_empty(&sh->r5c)); + sh->log_start = io->log_start; + spin_lock_irq(&log->stripe_in_journal_lock); + list_add_tail(&sh->r5c, + &log->stripe_in_journal_list); + spin_unlock_irq(&log->stripe_in_journal_lock); + atomic_inc(&log->stripe_in_journal_count); + } return 0; } -static void r5l_wake_reclaim(struct r5l_log *log, sector_t space); +/* add stripe to no_space_stripes, and then wake up reclaim */ +static inline void r5l_add_no_space_stripe(struct r5l_log *log, + struct stripe_head *sh) +{ + spin_lock(&log->no_space_stripes_lock); + list_add_tail(&sh->log_list, &log->no_space_stripes); + spin_unlock(&log->no_space_stripes_lock); +} + /* * running in raid5d, where reclaim could wait for raid5d too (when it flushes * data from log to raid disks), so we shouldn't wait for reclaim here */ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) { + struct r5conf *conf = sh->raid_conf; int write_disks = 0; int data_pages, parity_pages; - int meta_size; int reserve; int i; int ret = 0; + bool wake_reclaim = false; if (!log) return -EAGAIN; @@ -456,11 +952,15 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) return -EAGAIN; } + WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); + for (i = 0; i < sh->disks; i++) { void *addr; - if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) + if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || + test_bit(R5_InJournal, &sh->dev[i].flags)) continue; + write_disks++; /* checksum is already calculated in last run */ if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) @@ -473,15 +973,6 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) parity_pages = 1 + !!(sh->qd_idx >= 0); data_pages = write_disks - parity_pages; - meta_size = - ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) - * data_pages) + - sizeof(struct r5l_payload_data_parity) + - sizeof(__le32) * parity_pages; - /* Doesn't work with very big raid array */ - if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE) - return -EINVAL; - set_bit(STRIPE_LOG_TRAPPED, &sh->state); /* * The stripe must enter state machine again to finish the write, so @@ -493,22 +984,49 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) mutex_lock(&log->io_mutex); /* meta + data */ reserve = (1 + write_disks) << (PAGE_SHIFT - 9); - if (!r5l_has_free_space(log, reserve)) { - spin_lock(&log->no_space_stripes_lock); - list_add_tail(&sh->log_list, &log->no_space_stripes); - spin_unlock(&log->no_space_stripes_lock); - r5l_wake_reclaim(log, reserve); - } else { - ret = r5l_log_stripe(log, sh, data_pages, parity_pages); - if (ret) { - spin_lock_irq(&log->io_list_lock); - list_add_tail(&sh->log_list, &log->no_mem_stripes); - spin_unlock_irq(&log->io_list_lock); + if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { + if (!r5l_has_free_space(log, reserve)) { + r5l_add_no_space_stripe(log, sh); + wake_reclaim = true; + } else { + ret = r5l_log_stripe(log, sh, data_pages, parity_pages); + if (ret) { + spin_lock_irq(&log->io_list_lock); + list_add_tail(&sh->log_list, + &log->no_mem_stripes); + spin_unlock_irq(&log->io_list_lock); + } + } + } else { /* R5C_JOURNAL_MODE_WRITE_BACK */ + /* + * log space critical, do not process stripes that are + * not in cache yet (sh->log_start == MaxSector). + */ + if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && + sh->log_start == MaxSector) { + r5l_add_no_space_stripe(log, sh); + wake_reclaim = true; + reserve = 0; + } else if (!r5l_has_free_space(log, reserve)) { + if (sh->log_start == log->last_checkpoint) + BUG(); + else + r5l_add_no_space_stripe(log, sh); + } else { + ret = r5l_log_stripe(log, sh, data_pages, parity_pages); + if (ret) { + spin_lock_irq(&log->io_list_lock); + list_add_tail(&sh->log_list, + &log->no_mem_stripes); + spin_unlock_irq(&log->io_list_lock); + } } } mutex_unlock(&log->io_mutex); + if (wake_reclaim) + r5l_wake_reclaim(log, reserve); return 0; } @@ -525,17 +1043,34 @@ int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) { if (!log) return -ENODEV; - /* - * we flush log disk cache first, then write stripe data to raid disks. - * So if bio is finished, the log disk cache is flushed already. The - * recovery guarantees we can recovery the bio from log disk, so we - * don't need to flush again - */ - if (bio->bi_iter.bi_size == 0) { - bio_endio(bio); - return 0; + + if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { + /* + * in write through (journal only) + * we flush log disk cache first, then write stripe data to + * raid disks. So if bio is finished, the log disk cache is + * flushed already. The recovery guarantees we can recovery + * the bio from log disk, so we don't need to flush again + */ + if (bio->bi_iter.bi_size == 0) { + bio_endio(bio); + return 0; + } + bio->bi_opf &= ~REQ_PREFLUSH; + } else { + /* write back (with cache) */ + if (bio->bi_iter.bi_size == 0) { + mutex_lock(&log->io_mutex); + r5l_get_meta(log, 0); + bio_list_add(&log->current_io->flush_barriers, bio); + log->current_io->has_flush = 1; + log->current_io->has_null_flush = 1; + atomic_inc(&log->current_io->pending_stripe); + r5l_submit_current_io(log); + mutex_unlock(&log->io_mutex); + return 0; + } } - bio->bi_opf &= ~REQ_PREFLUSH; return -EAGAIN; } @@ -555,10 +1090,40 @@ static void r5l_run_no_space_stripes(struct r5l_log *log) spin_unlock(&log->no_space_stripes_lock); } +/* + * calculate new last_checkpoint + * for write through mode, returns log->next_checkpoint + * for write back, returns log_start of first sh in stripe_in_journal_list + */ +static sector_t r5c_calculate_new_cp(struct r5conf *conf) +{ + struct stripe_head *sh; + struct r5l_log *log = conf->log; + sector_t new_cp; + unsigned long flags; + + if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) + return log->next_checkpoint; + + spin_lock_irqsave(&log->stripe_in_journal_lock, flags); + if (list_empty(&conf->log->stripe_in_journal_list)) { + /* all stripes flushed */ + spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); + return log->next_checkpoint; + } + sh = list_first_entry(&conf->log->stripe_in_journal_list, + struct stripe_head, r5c); + new_cp = sh->log_start; + spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); + return new_cp; +} + static sector_t r5l_reclaimable_space(struct r5l_log *log) { + struct r5conf *conf = log->rdev->mddev->private; + return r5l_ring_distance(log, log->last_checkpoint, - log->next_checkpoint); + r5c_calculate_new_cp(conf)); } static void r5l_run_no_mem_stripe(struct r5l_log *log) @@ -589,7 +1154,6 @@ static bool r5l_complete_finished_ios(struct r5l_log *log) break; log->next_checkpoint = io->log_start; - log->next_cp_seq = io->seq; list_del(&io->log_sibling); mempool_free(io, log->io_pool); @@ -604,6 +1168,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log) static void __r5l_stripe_write_finished(struct r5l_io_unit *io) { struct r5l_log *log = io->log; + struct r5conf *conf = log->rdev->mddev->private; unsigned long flags; spin_lock_irqsave(&log->io_list_lock, flags); @@ -614,7 +1179,8 @@ static void __r5l_stripe_write_finished(struct r5l_io_unit *io) return; } - if (r5l_reclaimable_space(log) > log->max_free_space) + if (r5l_reclaimable_space(log) > log->max_free_space || + test_bit(R5C_LOG_TIGHT, &conf->cache_state)) r5l_wake_reclaim(log, 0); spin_unlock_irqrestore(&log->io_list_lock, flags); @@ -685,7 +1251,7 @@ void r5l_flush_stripe_to_raid(struct r5l_log *log) bio_reset(&log->flush_bio); log->flush_bio.bi_bdev = log->rdev->bdev; log->flush_bio.bi_end_io = r5l_log_flush_endio; - bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH); + log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; submit_bio(&log->flush_bio); } @@ -713,8 +1279,8 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log, * there is a deadlock. We workaround this issue with a trylock. * FIXME: we could miss discard if we can't take reconfig mutex */ - set_mask_bits(&mddev->flags, 0, - BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); if (!mddev_trylock(mddev)) return; md_update_sb(mddev, 1); @@ -735,15 +1301,156 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log, } } +/* + * r5c_flush_stripe moves stripe from cached list to handle_list. When called, + * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes. + * + * must hold conf->device_lock + */ +static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh) +{ + BUG_ON(list_empty(&sh->lru)); + BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); + BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); + + /* + * The stripe is not ON_RELEASE_LIST, so it is safe to call + * raid5_release_stripe() while holding conf->device_lock + */ + BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); + assert_spin_locked(&conf->device_lock); + + list_del_init(&sh->lru); + atomic_inc(&sh->count); + + set_bit(STRIPE_HANDLE, &sh->state); + atomic_inc(&conf->active_stripes); + r5c_make_stripe_write_out(sh); + + if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) + atomic_inc(&conf->r5c_flushing_partial_stripes); + else + atomic_inc(&conf->r5c_flushing_full_stripes); + raid5_release_stripe(sh); +} + +/* + * if num == 0, flush all full stripes + * if num > 0, flush all full stripes. If less than num full stripes are + * flushed, flush some partial stripes until totally num stripes are + * flushed or there is no more cached stripes. + */ +void r5c_flush_cache(struct r5conf *conf, int num) +{ + int count; + struct stripe_head *sh, *next; + + assert_spin_locked(&conf->device_lock); + if (!conf->log) + return; + + count = 0; + list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) { + r5c_flush_stripe(conf, sh); + count++; + } + + if (count >= num) + return; + list_for_each_entry_safe(sh, next, + &conf->r5c_partial_stripe_list, lru) { + r5c_flush_stripe(conf, sh); + if (++count >= num) + break; + } +} + +static void r5c_do_reclaim(struct r5conf *conf) +{ + struct r5l_log *log = conf->log; + struct stripe_head *sh; + int count = 0; + unsigned long flags; + int total_cached; + int stripes_to_flush; + int flushing_partial, flushing_full; + + if (!r5c_is_writeback(log)) + return; + + flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes); + flushing_full = atomic_read(&conf->r5c_flushing_full_stripes); + total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + + atomic_read(&conf->r5c_cached_full_stripes) - + flushing_full - flushing_partial; + + if (total_cached > conf->min_nr_stripes * 3 / 4 || + atomic_read(&conf->empty_inactive_list_nr) > 0) + /* + * if stripe cache pressure high, flush all full stripes and + * some partial stripes + */ + stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP; + else if (total_cached > conf->min_nr_stripes * 1 / 2 || + atomic_read(&conf->r5c_cached_full_stripes) - flushing_full > + R5C_FULL_STRIPE_FLUSH_BATCH) + /* + * if stripe cache pressure moderate, or if there is many full + * stripes,flush all full stripes + */ + stripes_to_flush = 0; + else + /* no need to flush */ + stripes_to_flush = -1; + + if (stripes_to_flush >= 0) { + spin_lock_irqsave(&conf->device_lock, flags); + r5c_flush_cache(conf, stripes_to_flush); + spin_unlock_irqrestore(&conf->device_lock, flags); + } + + /* if log space is tight, flush stripes on stripe_in_journal_list */ + if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) { + spin_lock_irqsave(&log->stripe_in_journal_lock, flags); + spin_lock(&conf->device_lock); + list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) { + /* + * stripes on stripe_in_journal_list could be in any + * state of the stripe_cache state machine. In this + * case, we only want to flush stripe on + * r5c_cached_full/partial_stripes. The following + * condition makes sure the stripe is on one of the + * two lists. + */ + if (!list_empty(&sh->lru) && + !test_bit(STRIPE_HANDLE, &sh->state) && + atomic_read(&sh->count) == 0) { + r5c_flush_stripe(conf, sh); + if (count++ >= R5C_RECLAIM_STRIPE_GROUP) + break; + } + } + spin_unlock(&conf->device_lock); + spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); + } + + if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) + r5l_run_no_space_stripes(log); + + md_wakeup_thread(conf->mddev->thread); +} static void r5l_do_reclaim(struct r5l_log *log) { + struct r5conf *conf = log->rdev->mddev->private; sector_t reclaim_target = xchg(&log->reclaim_target, 0); sector_t reclaimable; sector_t next_checkpoint; - u64 next_cp_seq; + bool write_super; spin_lock_irq(&log->io_list_lock); + write_super = r5l_reclaimable_space(log) > log->max_free_space || + reclaim_target != 0 || !list_empty(&log->no_space_stripes); /* * move proper io_unit to reclaim list. We should not change the order. * reclaimable/unreclaimable io_unit can be mixed in the list, we @@ -764,12 +1471,10 @@ static void r5l_do_reclaim(struct r5l_log *log) log->io_list_lock); } - next_checkpoint = log->next_checkpoint; - next_cp_seq = log->next_cp_seq; + next_checkpoint = r5c_calculate_new_cp(conf); spin_unlock_irq(&log->io_list_lock); - BUG_ON(reclaimable < 0); - if (reclaimable == 0) + if (reclaimable == 0 || !write_super) return; /* @@ -781,7 +1486,7 @@ static void r5l_do_reclaim(struct r5l_log *log) mutex_lock(&log->io_mutex); log->last_checkpoint = next_checkpoint; - log->last_cp_seq = next_cp_seq; + r5c_update_log_state(log); mutex_unlock(&log->io_mutex); r5l_run_no_space_stripes(log); @@ -795,14 +1500,17 @@ static void r5l_reclaim_thread(struct md_thread *thread) if (!log) return; + r5c_do_reclaim(conf); r5l_do_reclaim(log); } -static void r5l_wake_reclaim(struct r5l_log *log, sector_t space) +void r5l_wake_reclaim(struct r5l_log *log, sector_t space) { unsigned long target; unsigned long new = (unsigned long)space; /* overflow in theory */ + if (!log) + return; do { target = log->reclaim_target; if (new < target) @@ -816,22 +1524,14 @@ void r5l_quiesce(struct r5l_log *log, int state) struct mddev *mddev; if (!log || state == 2) return; - if (state == 0) { - /* - * This is a special case for hotadd. In suspend, the array has - * no journal. In resume, journal is initialized as well as the - * reclaim thread. - */ - if (log->reclaim_thread) - return; - log->reclaim_thread = md_register_thread(r5l_reclaim_thread, - log->rdev->mddev, "reclaim"); - } else if (state == 1) { + if (state == 0) + kthread_unpark(log->reclaim_thread->tsk); + else if (state == 1) { /* make sure r5l_write_super_and_discard_space exits */ mddev = log->rdev->mddev; wake_up(&mddev->sb_wait); - r5l_wake_reclaim(log, -1L); - md_unregister_thread(&log->reclaim_thread); + kthread_park(log->reclaim_thread->tsk); + r5l_wake_reclaim(log, MaxSector); r5l_do_reclaim(log); } } @@ -857,10 +1557,13 @@ struct r5l_recovery_ctx { sector_t meta_total_blocks; /* total size of current meta and data */ sector_t pos; /* recovery position */ u64 seq; /* recovery position seq */ + int data_parity_stripes; /* number of data_parity stripes */ + int data_only_stripes; /* number of data_only stripes */ + struct list_head cached_list; }; -static int r5l_read_meta_block(struct r5l_log *log, - struct r5l_recovery_ctx *ctx) +static int r5l_recovery_read_meta_block(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) { struct page *page = ctx->meta_page; struct r5l_meta_block *mb; @@ -892,216 +1595,700 @@ static int r5l_read_meta_block(struct r5l_log *log, return 0; } -static int r5l_recovery_flush_one_stripe(struct r5l_log *log, - struct r5l_recovery_ctx *ctx, - sector_t stripe_sect, - int *offset, sector_t *log_offset) +static void +r5l_recovery_create_empty_meta_block(struct r5l_log *log, + struct page *page, + sector_t pos, u64 seq) { - struct r5conf *conf = log->rdev->mddev->private; - struct stripe_head *sh; - struct r5l_payload_data_parity *payload; - int disk_index; + struct r5l_meta_block *mb; - sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0); - while (1) { - payload = page_address(ctx->meta_page) + *offset; + mb = page_address(page); + clear_page(mb); + mb->magic = cpu_to_le32(R5LOG_MAGIC); + mb->version = R5LOG_VERSION; + mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); + mb->seq = cpu_to_le64(seq); + mb->position = cpu_to_le64(pos); +} - if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { - raid5_compute_sector(conf, - le64_to_cpu(payload->location), 0, - &disk_index, sh); +static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, + u64 seq) +{ + struct page *page; + struct r5l_meta_block *mb; - sync_page_io(log->rdev, *log_offset, PAGE_SIZE, - sh->dev[disk_index].page, REQ_OP_READ, 0, - false); - sh->dev[disk_index].log_checksum = - le32_to_cpu(payload->checksum[0]); - set_bit(R5_Wantwrite, &sh->dev[disk_index].flags); - ctx->meta_total_blocks += BLOCK_SECTORS; - } else { - disk_index = sh->pd_idx; - sync_page_io(log->rdev, *log_offset, PAGE_SIZE, - sh->dev[disk_index].page, REQ_OP_READ, 0, - false); - sh->dev[disk_index].log_checksum = - le32_to_cpu(payload->checksum[0]); - set_bit(R5_Wantwrite, &sh->dev[disk_index].flags); - - if (sh->qd_idx >= 0) { - disk_index = sh->qd_idx; - sync_page_io(log->rdev, - r5l_ring_add(log, *log_offset, BLOCK_SECTORS), - PAGE_SIZE, sh->dev[disk_index].page, - REQ_OP_READ, 0, false); - sh->dev[disk_index].log_checksum = - le32_to_cpu(payload->checksum[1]); - set_bit(R5_Wantwrite, - &sh->dev[disk_index].flags); - } - ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; - } + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + r5l_recovery_create_empty_meta_block(log, page, pos, seq); + mb = page_address(page); + mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, + mb, PAGE_SIZE)); + if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, + REQ_FUA, false)) { + __free_page(page); + return -EIO; + } + __free_page(page); + return 0; +} - *log_offset = r5l_ring_add(log, *log_offset, - le32_to_cpu(payload->size)); - *offset += sizeof(struct r5l_payload_data_parity) + - sizeof(__le32) * - (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); - if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) - break; +/* + * r5l_recovery_load_data and r5l_recovery_load_parity uses flag R5_Wantwrite + * to mark valid (potentially not flushed) data in the journal. + * + * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb, + * so there should not be any mismatch here. + */ +static void r5l_recovery_load_data(struct r5l_log *log, + struct stripe_head *sh, + struct r5l_recovery_ctx *ctx, + struct r5l_payload_data_parity *payload, + sector_t log_offset) +{ + struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + int dd_idx; + + raid5_compute_sector(conf, + le64_to_cpu(payload->location), 0, + &dd_idx, sh); + sync_page_io(log->rdev, log_offset, PAGE_SIZE, + sh->dev[dd_idx].page, REQ_OP_READ, 0, false); + sh->dev[dd_idx].log_checksum = + le32_to_cpu(payload->checksum[0]); + ctx->meta_total_blocks += BLOCK_SECTORS; + + set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags); + set_bit(STRIPE_R5C_CACHING, &sh->state); +} + +static void r5l_recovery_load_parity(struct r5l_log *log, + struct stripe_head *sh, + struct r5l_recovery_ctx *ctx, + struct r5l_payload_data_parity *payload, + sector_t log_offset) +{ + struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + + ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; + sync_page_io(log->rdev, log_offset, PAGE_SIZE, + sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false); + sh->dev[sh->pd_idx].log_checksum = + le32_to_cpu(payload->checksum[0]); + set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags); + + if (sh->qd_idx >= 0) { + sync_page_io(log->rdev, + r5l_ring_add(log, log_offset, BLOCK_SECTORS), + PAGE_SIZE, sh->dev[sh->qd_idx].page, + REQ_OP_READ, 0, false); + sh->dev[sh->qd_idx].log_checksum = + le32_to_cpu(payload->checksum[1]); + set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags); } + clear_bit(STRIPE_R5C_CACHING, &sh->state); +} - for (disk_index = 0; disk_index < sh->disks; disk_index++) { - void *addr; - u32 checksum; +static void r5l_recovery_reset_stripe(struct stripe_head *sh) +{ + int i; + sh->state = 0; + sh->log_start = MaxSector; + for (i = sh->disks; i--; ) + sh->dev[i].flags = 0; +} + +static void +r5l_recovery_replay_one_stripe(struct r5conf *conf, + struct stripe_head *sh, + struct r5l_recovery_ctx *ctx) +{ + struct md_rdev *rdev, *rrdev; + int disk_index; + int data_count = 0; + + for (disk_index = 0; disk_index < sh->disks; disk_index++) { if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) continue; - addr = kmap_atomic(sh->dev[disk_index].page); - checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); - kunmap_atomic(addr); - if (checksum != sh->dev[disk_index].log_checksum) - goto error; + if (disk_index == sh->qd_idx || disk_index == sh->pd_idx) + continue; + data_count++; } - for (disk_index = 0; disk_index < sh->disks; disk_index++) { - struct md_rdev *rdev, *rrdev; + /* + * stripes that only have parity must have been flushed + * before the crash that we are now recovering from, so + * there is nothing more to recovery. + */ + if (data_count == 0) + goto out; - if (!test_and_clear_bit(R5_Wantwrite, - &sh->dev[disk_index].flags)) + for (disk_index = 0; disk_index < sh->disks; disk_index++) { + if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) continue; /* in case device is broken */ + rcu_read_lock(); rdev = rcu_dereference(conf->disks[disk_index].rdev); - if (rdev) - sync_page_io(rdev, stripe_sect, PAGE_SIZE, + if (rdev) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + sync_page_io(rdev, sh->sector, PAGE_SIZE, sh->dev[disk_index].page, REQ_OP_WRITE, 0, false); + rdev_dec_pending(rdev, rdev->mddev); + rcu_read_lock(); + } rrdev = rcu_dereference(conf->disks[disk_index].replacement); - if (rrdev) - sync_page_io(rrdev, stripe_sect, PAGE_SIZE, + if (rrdev) { + atomic_inc(&rrdev->nr_pending); + rcu_read_unlock(); + sync_page_io(rrdev, sh->sector, PAGE_SIZE, sh->dev[disk_index].page, REQ_OP_WRITE, 0, false); + rdev_dec_pending(rrdev, rrdev->mddev); + rcu_read_lock(); + } + rcu_read_unlock(); } - raid5_release_stripe(sh); + ctx->data_parity_stripes++; +out: + r5l_recovery_reset_stripe(sh); +} + +static struct stripe_head * +r5c_recovery_alloc_stripe(struct r5conf *conf, + sector_t stripe_sect) +{ + struct stripe_head *sh; + + sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0); + if (!sh) + return NULL; /* no more stripe available */ + + r5l_recovery_reset_stripe(sh); + + return sh; +} + +static struct stripe_head * +r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect) +{ + struct stripe_head *sh; + + list_for_each_entry(sh, list, lru) + if (sh->sector == sect) + return sh; + return NULL; +} + +static void +r5c_recovery_drop_stripes(struct list_head *cached_stripe_list, + struct r5l_recovery_ctx *ctx) +{ + struct stripe_head *sh, *next; + + list_for_each_entry_safe(sh, next, cached_stripe_list, lru) { + r5l_recovery_reset_stripe(sh); + list_del_init(&sh->lru); + raid5_release_stripe(sh); + } +} + +static void +r5c_recovery_replay_stripes(struct list_head *cached_stripe_list, + struct r5l_recovery_ctx *ctx) +{ + struct stripe_head *sh, *next; + + list_for_each_entry_safe(sh, next, cached_stripe_list, lru) + if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { + r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx); + list_del_init(&sh->lru); + raid5_release_stripe(sh); + } +} + +/* if matches return 0; otherwise return -EINVAL */ +static int +r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page, + sector_t log_offset, __le32 log_checksum) +{ + void *addr; + u32 checksum; + + sync_page_io(log->rdev, log_offset, PAGE_SIZE, + page, REQ_OP_READ, 0, false); + addr = kmap_atomic(page); + checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); + kunmap_atomic(addr); + return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL; +} + +/* + * before loading data to stripe cache, we need verify checksum for all data, + * if there is mismatch for any data page, we drop all data in the mata block + */ +static int +r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + struct r5l_meta_block *mb = page_address(ctx->meta_page); + sector_t mb_offset = sizeof(struct r5l_meta_block); + sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); + struct page *page; + struct r5l_payload_data_parity *payload; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + while (mb_offset < le32_to_cpu(mb->meta_size)) { + payload = (void *)mb + mb_offset; + + if (payload->header.type == R5LOG_PAYLOAD_DATA) { + if (r5l_recovery_verify_data_checksum( + log, page, log_offset, + payload->checksum[0]) < 0) + goto mismatch; + } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) { + if (r5l_recovery_verify_data_checksum( + log, page, log_offset, + payload->checksum[0]) < 0) + goto mismatch; + if (conf->max_degraded == 2 && /* q for RAID 6 */ + r5l_recovery_verify_data_checksum( + log, page, + r5l_ring_add(log, log_offset, + BLOCK_SECTORS), + payload->checksum[1]) < 0) + goto mismatch; + } else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */ + goto mismatch; + + log_offset = r5l_ring_add(log, log_offset, + le32_to_cpu(payload->size)); + + mb_offset += sizeof(struct r5l_payload_data_parity) + + sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + } + + put_page(page); return 0; -error: - for (disk_index = 0; disk_index < sh->disks; disk_index++) - sh->dev[disk_index].flags = 0; - raid5_release_stripe(sh); +mismatch: + put_page(page); return -EINVAL; } -static int r5l_recovery_flush_one_meta(struct r5l_log *log, - struct r5l_recovery_ctx *ctx) +/* + * Analyze all data/parity pages in one meta block + * Returns: + * 0 for success + * -EINVAL for unknown playload type + * -EAGAIN for checksum mismatch of data page + * -ENOMEM for run out of memory (alloc_page failed or run out of stripes) + */ +static int +r5c_recovery_analyze_meta_block(struct r5l_log *log, + struct r5l_recovery_ctx *ctx, + struct list_head *cached_stripe_list) { - struct r5conf *conf = log->rdev->mddev->private; - struct r5l_payload_data_parity *payload; + struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; struct r5l_meta_block *mb; - int offset; + struct r5l_payload_data_parity *payload; + int mb_offset; sector_t log_offset; - sector_t stripe_sector; + sector_t stripe_sect; + struct stripe_head *sh; + int ret; + + /* + * for mismatch in data blocks, we will drop all data in this mb, but + * we will still read next mb for other data with FLUSH flag, as + * io_unit could finish out of order. + */ + ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx); + if (ret == -EINVAL) + return -EAGAIN; + else if (ret) + return ret; /* -ENOMEM duo to alloc_page() failed */ mb = page_address(ctx->meta_page); - offset = sizeof(struct r5l_meta_block); + mb_offset = sizeof(struct r5l_meta_block); log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); - while (offset < le32_to_cpu(mb->meta_size)) { + while (mb_offset < le32_to_cpu(mb->meta_size)) { int dd; - payload = (void *)mb + offset; - stripe_sector = raid5_compute_sector(conf, - le64_to_cpu(payload->location), 0, &dd, NULL); - if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector, - &offset, &log_offset)) + payload = (void *)mb + mb_offset; + stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ? + raid5_compute_sector( + conf, le64_to_cpu(payload->location), 0, &dd, + NULL) + : le64_to_cpu(payload->location); + + sh = r5c_recovery_lookup_stripe(cached_stripe_list, + stripe_sect); + + if (!sh) { + sh = r5c_recovery_alloc_stripe(conf, stripe_sect); + /* + * cannot get stripe from raid5_get_active_stripe + * try replay some stripes + */ + if (!sh) { + r5c_recovery_replay_stripes( + cached_stripe_list, ctx); + sh = r5c_recovery_alloc_stripe( + conf, stripe_sect); + } + if (!sh) { + pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n", + mdname(mddev), + conf->min_nr_stripes * 2); + raid5_set_cache_size(mddev, + conf->min_nr_stripes * 2); + sh = r5c_recovery_alloc_stripe(conf, + stripe_sect); + } + if (!sh) { + pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n", + mdname(mddev)); + return -ENOMEM; + } + list_add_tail(&sh->lru, cached_stripe_list); + } + + if (payload->header.type == R5LOG_PAYLOAD_DATA) { + if (!test_bit(STRIPE_R5C_CACHING, &sh->state) && + test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) { + r5l_recovery_replay_one_stripe(conf, sh, ctx); + list_move_tail(&sh->lru, cached_stripe_list); + } + r5l_recovery_load_data(log, sh, ctx, payload, + log_offset); + } else if (payload->header.type == R5LOG_PAYLOAD_PARITY) + r5l_recovery_load_parity(log, sh, ctx, payload, + log_offset); + else return -EINVAL; + + log_offset = r5l_ring_add(log, log_offset, + le32_to_cpu(payload->size)); + + mb_offset += sizeof(struct r5l_payload_data_parity) + + sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); } + return 0; } -/* copy data/parity from log to raid disks */ -static void r5l_recovery_flush_log(struct r5l_log *log, - struct r5l_recovery_ctx *ctx) +/* + * Load the stripe into cache. The stripe will be written out later by + * the stripe cache state machine. + */ +static void r5c_recovery_load_one_stripe(struct r5l_log *log, + struct stripe_head *sh) { + struct r5dev *dev; + int i; + + for (i = sh->disks; i--; ) { + dev = sh->dev + i; + if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) { + set_bit(R5_InJournal, &dev->flags); + set_bit(R5_UPTODATE, &dev->flags); + } + } +} + +/* + * Scan through the log for all to-be-flushed data + * + * For stripes with data and parity, namely Data-Parity stripe + * (STRIPE_R5C_CACHING == 0), we simply replay all the writes. + * + * For stripes with only data, namely Data-Only stripe + * (STRIPE_R5C_CACHING == 1), we load them to stripe cache state machine. + * + * For a stripe, if we see data after parity, we should discard all previous + * data and parity for this stripe, as these data are already flushed to + * the array. + * + * At the end of the scan, we return the new journal_tail, which points to + * first data-only stripe on the journal device, or next invalid meta block. + */ +static int r5c_recovery_flush_log(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + struct stripe_head *sh; + int ret = 0; + + /* scan through the log */ while (1) { - if (r5l_read_meta_block(log, ctx)) - return; - if (r5l_recovery_flush_one_meta(log, ctx)) - return; + if (r5l_recovery_read_meta_block(log, ctx)) + break; + + ret = r5c_recovery_analyze_meta_block(log, ctx, + &ctx->cached_list); + /* + * -EAGAIN means mismatch in data block, in this case, we still + * try scan the next metablock + */ + if (ret && ret != -EAGAIN) + break; /* ret == -EINVAL or -ENOMEM */ ctx->seq++; ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks); } + + if (ret == -ENOMEM) { + r5c_recovery_drop_stripes(&ctx->cached_list, ctx); + return ret; + } + + /* replay data-parity stripes */ + r5c_recovery_replay_stripes(&ctx->cached_list, ctx); + + /* load data-only stripes to stripe cache */ + list_for_each_entry(sh, &ctx->cached_list, lru) { + WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); + r5c_recovery_load_one_stripe(log, sh); + ctx->data_only_stripes++; + } + + return 0; } -static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, - u64 seq) +/* + * we did a recovery. Now ctx.pos points to an invalid meta block. New + * log will start here. but we can't let superblock point to last valid + * meta block. The log might looks like: + * | meta 1| meta 2| meta 3| + * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If + * superblock points to meta 1, we write a new valid meta 2n. if crash + * happens again, new recovery will start from meta 1. Since meta 2n is + * valid now, recovery will think meta 3 is valid, which is wrong. + * The solution is we create a new meta in meta2 with its seq == meta + * 1's seq + 10000 and let superblock points to meta2. The same recovery + * will not think meta 3 is a valid meta, because its seq doesn't match + */ + +/* + * Before recovery, the log looks like the following + * + * --------------------------------------------- + * | valid log | invalid log | + * --------------------------------------------- + * ^ + * |- log->last_checkpoint + * |- log->last_cp_seq + * + * Now we scan through the log until we see invalid entry + * + * --------------------------------------------- + * | valid log | invalid log | + * --------------------------------------------- + * ^ ^ + * |- log->last_checkpoint |- ctx->pos + * |- log->last_cp_seq |- ctx->seq + * + * From this point, we need to increase seq number by 10 to avoid + * confusing next recovery. + * + * --------------------------------------------- + * | valid log | invalid log | + * --------------------------------------------- + * ^ ^ + * |- log->last_checkpoint |- ctx->pos+1 + * |- log->last_cp_seq |- ctx->seq+10001 + * + * However, it is not safe to start the state machine yet, because data only + * parities are not yet secured in RAID. To save these data only parities, we + * rewrite them from seq+11. + * + * ----------------------------------------------------------------- + * | valid log | data only stripes | invalid log | + * ----------------------------------------------------------------- + * ^ ^ + * |- log->last_checkpoint |- ctx->pos+n + * |- log->last_cp_seq |- ctx->seq+10000+n + * + * If failure happens again during this process, the recovery can safe start + * again from log->last_checkpoint. + * + * Once data only stripes are rewritten to journal, we move log_tail + * + * ----------------------------------------------------------------- + * | old log | data only stripes | invalid log | + * ----------------------------------------------------------------- + * ^ ^ + * |- log->last_checkpoint |- ctx->pos+n + * |- log->last_cp_seq |- ctx->seq+10000+n + * + * Then we can safely start the state machine. If failure happens from this + * point on, the recovery will start from new log->last_checkpoint. + */ +static int +r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) { + struct stripe_head *sh; + struct mddev *mddev = log->rdev->mddev; struct page *page; - struct r5l_meta_block *mb; - u32 crc; + sector_t next_checkpoint = MaxSector; - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) + page = alloc_page(GFP_KERNEL); + if (!page) { + pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n", + mdname(mddev)); return -ENOMEM; - mb = page_address(page); - mb->magic = cpu_to_le32(R5LOG_MAGIC); - mb->version = R5LOG_VERSION; - mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); - mb->seq = cpu_to_le64(seq); - mb->position = cpu_to_le64(pos); - crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); - mb->checksum = cpu_to_le32(crc); + } - if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, - WRITE_FUA, false)) { - __free_page(page); - return -EIO; + WARN_ON(list_empty(&ctx->cached_list)); + + list_for_each_entry(sh, &ctx->cached_list, lru) { + struct r5l_meta_block *mb; + int i; + int offset; + sector_t write_pos; + + WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); + r5l_recovery_create_empty_meta_block(log, page, + ctx->pos, ctx->seq); + mb = page_address(page); + offset = le32_to_cpu(mb->meta_size); + write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); + + for (i = sh->disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + struct r5l_payload_data_parity *payload; + void *addr; + + if (test_bit(R5_InJournal, &dev->flags)) { + payload = (void *)mb + offset; + payload->header.type = cpu_to_le16( + R5LOG_PAYLOAD_DATA); + payload->size = BLOCK_SECTORS; + payload->location = cpu_to_le64( + raid5_compute_blocknr(sh, i, 0)); + addr = kmap_atomic(dev->page); + payload->checksum[0] = cpu_to_le32( + crc32c_le(log->uuid_checksum, addr, + PAGE_SIZE)); + kunmap_atomic(addr); + sync_page_io(log->rdev, write_pos, PAGE_SIZE, + dev->page, REQ_OP_WRITE, 0, false); + write_pos = r5l_ring_add(log, write_pos, + BLOCK_SECTORS); + offset += sizeof(__le32) + + sizeof(struct r5l_payload_data_parity); + + } + } + mb->meta_size = cpu_to_le32(offset); + mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, + mb, PAGE_SIZE)); + sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, + REQ_OP_WRITE, REQ_FUA, false); + sh->log_start = ctx->pos; + list_add_tail(&sh->r5c, &log->stripe_in_journal_list); + atomic_inc(&log->stripe_in_journal_count); + ctx->pos = write_pos; + ctx->seq += 1; + next_checkpoint = sh->log_start; } + log->next_checkpoint = next_checkpoint; __free_page(page); return 0; } +static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + struct stripe_head *sh, *next; + + if (ctx->data_only_stripes == 0) + return; + + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK; + + list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { + r5c_make_stripe_write_out(sh); + set_bit(STRIPE_HANDLE, &sh->state); + list_del_init(&sh->lru); + raid5_release_stripe(sh); + } + + md_wakeup_thread(conf->mddev->thread); + /* reuse conf->wait_for_quiescent in recovery */ + wait_event(conf->wait_for_quiescent, + atomic_read(&conf->active_stripes) == 0); + + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; +} + static int r5l_recovery_log(struct r5l_log *log) { + struct mddev *mddev = log->rdev->mddev; struct r5l_recovery_ctx ctx; + int ret; + sector_t pos; ctx.pos = log->last_checkpoint; ctx.seq = log->last_cp_seq; ctx.meta_page = alloc_page(GFP_KERNEL); + ctx.data_only_stripes = 0; + ctx.data_parity_stripes = 0; + INIT_LIST_HEAD(&ctx.cached_list); + if (!ctx.meta_page) return -ENOMEM; - r5l_recovery_flush_log(log, &ctx); + ret = r5c_recovery_flush_log(log, &ctx); __free_page(ctx.meta_page); - /* - * we did a recovery. Now ctx.pos points to an invalid meta block. New - * log will start here. but we can't let superblock point to last valid - * meta block. The log might looks like: - * | meta 1| meta 2| meta 3| - * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If - * superblock points to meta 1, we write a new valid meta 2n. if crash - * happens again, new recovery will start from meta 1. Since meta 2n is - * valid now, recovery will think meta 3 is valid, which is wrong. - * The solution is we create a new meta in meta2 with its seq == meta - * 1's seq + 10 and let superblock points to meta2. The same recovery will - * not think meta 3 is a valid meta, because its seq doesn't match - */ - if (ctx.seq > log->last_cp_seq) { - int ret; - - ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10); - if (ret) - return ret; - log->seq = ctx.seq + 11; - log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); - r5l_write_super(log, ctx.pos); - log->last_checkpoint = ctx.pos; + if (ret) + return ret; + + pos = ctx.pos; + ctx.seq += 10000; + + + if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0)) + pr_debug("md/raid:%s: starting from clean shutdown\n", + mdname(mddev)); + else + pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", + mdname(mddev), ctx.data_only_stripes, + ctx.data_parity_stripes); + + if (ctx.data_only_stripes == 0) { log->next_checkpoint = ctx.pos; - } else { - log->log_start = ctx.pos; - log->seq = ctx.seq; + r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++); + ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); + } else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) { + pr_err("md/raid:%s: failed to rewrite stripes to journal\n", + mdname(mddev)); + return -EIO; } + + log->log_start = ctx.pos; + log->seq = ctx.seq; + log->last_checkpoint = pos; + r5l_write_super(log, pos); + + r5c_recovery_flush_data_only_stripes(log, &ctx); return 0; } @@ -1110,7 +2297,400 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp) struct mddev *mddev = log->rdev->mddev; log->rdev->journal_tail = cp; - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); +} + +static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) +{ + struct r5conf *conf = mddev->private; + int ret; + + if (!conf->log) + return 0; + + switch (conf->log->r5c_journal_mode) { + case R5C_JOURNAL_MODE_WRITE_THROUGH: + ret = snprintf( + page, PAGE_SIZE, "[%s] %s\n", + r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], + r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); + break; + case R5C_JOURNAL_MODE_WRITE_BACK: + ret = snprintf( + page, PAGE_SIZE, "%s [%s]\n", + r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], + r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); + break; + default: + ret = 0; + } + return ret; +} + +static ssize_t r5c_journal_mode_store(struct mddev *mddev, + const char *page, size_t length) +{ + struct r5conf *conf = mddev->private; + struct r5l_log *log = conf->log; + int val = -1, i; + int len = length; + + if (!log) + return -ENODEV; + + if (len && page[len - 1] == '\n') + len -= 1; + for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++) + if (strlen(r5c_journal_mode_str[i]) == len && + strncmp(page, r5c_journal_mode_str[i], len) == 0) { + val = i; + break; + } + if (val < R5C_JOURNAL_MODE_WRITE_THROUGH || + val > R5C_JOURNAL_MODE_WRITE_BACK) + return -EINVAL; + + if (raid5_calc_degraded(conf) > 0 && + val == R5C_JOURNAL_MODE_WRITE_BACK) + return -EINVAL; + + mddev_suspend(mddev); + conf->log->r5c_journal_mode = val; + mddev_resume(mddev); + + pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", + mdname(mddev), val, r5c_journal_mode_str[val]); + return length; +} + +struct md_sysfs_entry +r5c_journal_mode = __ATTR(journal_mode, 0644, + r5c_journal_mode_show, r5c_journal_mode_store); + +/* + * Try handle write operation in caching phase. This function should only + * be called in write-back mode. + * + * If all outstanding writes can be handled in caching phase, returns 0 + * If writes requires write-out phase, call r5c_make_stripe_write_out() + * and returns -EAGAIN + */ +int r5c_try_caching_write(struct r5conf *conf, + struct stripe_head *sh, + struct stripe_head_state *s, + int disks) +{ + struct r5l_log *log = conf->log; + int i; + struct r5dev *dev; + int to_cache = 0; + void **pslot; + sector_t tree_index; + int ret; + uintptr_t refcount; + + BUG_ON(!r5c_is_writeback(log)); + + if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { + /* + * There are two different scenarios here: + * 1. The stripe has some data cached, and it is sent to + * write-out phase for reclaim + * 2. The stripe is clean, and this is the first write + * + * For 1, return -EAGAIN, so we continue with + * handle_stripe_dirtying(). + * + * For 2, set STRIPE_R5C_CACHING and continue with caching + * write. + */ + + /* case 1: anything injournal or anything in written */ + if (s->injournal > 0 || s->written > 0) + return -EAGAIN; + /* case 2 */ + set_bit(STRIPE_R5C_CACHING, &sh->state); + } + + /* + * When run in degraded mode, array is set to write-through mode. + * This check helps drain pending write safely in the transition to + * write-through mode. + */ + if (s->failed) { + r5c_make_stripe_write_out(sh); + return -EAGAIN; + } + + for (i = disks; i--; ) { + dev = &sh->dev[i]; + /* if non-overwrite, use writing-out phase */ + if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) && + !test_bit(R5_InJournal, &dev->flags)) { + r5c_make_stripe_write_out(sh); + return -EAGAIN; + } + } + + /* if the stripe is not counted in big_stripe_tree, add it now */ + if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) && + !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { + tree_index = r5c_tree_index(conf, sh->sector); + spin_lock(&log->tree_lock); + pslot = radix_tree_lookup_slot(&log->big_stripe_tree, + tree_index); + if (pslot) { + refcount = (uintptr_t)radix_tree_deref_slot_protected( + pslot, &log->tree_lock) >> + R5C_RADIX_COUNT_SHIFT; + radix_tree_replace_slot( + &log->big_stripe_tree, pslot, + (void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT)); + } else { + /* + * this radix_tree_insert can fail safely, so no + * need to call radix_tree_preload() + */ + ret = radix_tree_insert( + &log->big_stripe_tree, tree_index, + (void *)(1 << R5C_RADIX_COUNT_SHIFT)); + if (ret) { + spin_unlock(&log->tree_lock); + r5c_make_stripe_write_out(sh); + return -EAGAIN; + } + } + spin_unlock(&log->tree_lock); + + /* + * set STRIPE_R5C_PARTIAL_STRIPE, this shows the stripe is + * counted in the radix tree + */ + set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state); + atomic_inc(&conf->r5c_cached_partial_stripes); + } + + for (i = disks; i--; ) { + dev = &sh->dev[i]; + if (dev->towrite) { + set_bit(R5_Wantwrite, &dev->flags); + set_bit(R5_Wantdrain, &dev->flags); + set_bit(R5_LOCKED, &dev->flags); + to_cache++; + } + } + + if (to_cache) { + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + /* + * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data() + * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in + * r5c_handle_data_cached() + */ + set_bit(STRIPE_LOG_TRAPPED, &sh->state); + } + + return 0; +} + +/* + * free extra pages (orig_page) we allocated for prexor + */ +void r5c_release_extra_page(struct stripe_head *sh) +{ + struct r5conf *conf = sh->raid_conf; + int i; + bool using_disk_info_extra_page; + + using_disk_info_extra_page = + sh->dev[0].orig_page == conf->disks[0].extra_page; + + for (i = sh->disks; i--; ) + if (sh->dev[i].page != sh->dev[i].orig_page) { + struct page *p = sh->dev[i].orig_page; + + sh->dev[i].orig_page = sh->dev[i].page; + clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); + + if (!using_disk_info_extra_page) + put_page(p); + } + + if (using_disk_info_extra_page) { + clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state); + md_wakeup_thread(conf->mddev->thread); + } +} + +void r5c_use_extra_page(struct stripe_head *sh) +{ + struct r5conf *conf = sh->raid_conf; + int i; + struct r5dev *dev; + + for (i = sh->disks; i--; ) { + dev = &sh->dev[i]; + if (dev->orig_page != dev->page) + put_page(dev->orig_page); + dev->orig_page = conf->disks[i].extra_page; + } +} + +/* + * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the + * stripe is committed to RAID disks. + */ +void r5c_finish_stripe_write_out(struct r5conf *conf, + struct stripe_head *sh, + struct stripe_head_state *s) +{ + struct r5l_log *log = conf->log; + int i; + int do_wakeup = 0; + sector_t tree_index; + void **pslot; + uintptr_t refcount; + + if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) + return; + + WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); + clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); + + if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) + return; + + for (i = sh->disks; i--; ) { + clear_bit(R5_InJournal, &sh->dev[i].flags); + if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) + do_wakeup = 1; + } + + /* + * analyse_stripe() runs before r5c_finish_stripe_write_out(), + * We updated R5_InJournal, so we also update s->injournal. + */ + s->injournal = 0; + + if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) + if (atomic_dec_and_test(&conf->pending_full_writes)) + md_wakeup_thread(conf->mddev->thread); + + if (do_wakeup) + wake_up(&conf->wait_for_overlap); + + spin_lock_irq(&log->stripe_in_journal_lock); + list_del_init(&sh->r5c); + spin_unlock_irq(&log->stripe_in_journal_lock); + sh->log_start = MaxSector; + + atomic_dec(&log->stripe_in_journal_count); + r5c_update_log_state(log); + + /* stop counting this stripe in big_stripe_tree */ + if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) || + test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { + tree_index = r5c_tree_index(conf, sh->sector); + spin_lock(&log->tree_lock); + pslot = radix_tree_lookup_slot(&log->big_stripe_tree, + tree_index); + BUG_ON(pslot == NULL); + refcount = (uintptr_t)radix_tree_deref_slot_protected( + pslot, &log->tree_lock) >> + R5C_RADIX_COUNT_SHIFT; + if (refcount == 1) + radix_tree_delete(&log->big_stripe_tree, tree_index); + else + radix_tree_replace_slot( + &log->big_stripe_tree, pslot, + (void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT)); + spin_unlock(&log->tree_lock); + } + + if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { + BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); + atomic_dec(&conf->r5c_flushing_partial_stripes); + atomic_dec(&conf->r5c_cached_partial_stripes); + } + + if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { + BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); + atomic_dec(&conf->r5c_flushing_full_stripes); + atomic_dec(&conf->r5c_cached_full_stripes); + } +} + +int +r5c_cache_data(struct r5l_log *log, struct stripe_head *sh, + struct stripe_head_state *s) +{ + struct r5conf *conf = sh->raid_conf; + int pages = 0; + int reserve; + int i; + int ret = 0; + + BUG_ON(!log); + + for (i = 0; i < sh->disks; i++) { + void *addr; + + if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) + continue; + addr = kmap_atomic(sh->dev[i].page); + sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, + addr, PAGE_SIZE); + kunmap_atomic(addr); + pages++; + } + WARN_ON(pages == 0); + + /* + * The stripe must enter state machine again to call endio, so + * don't delay. + */ + clear_bit(STRIPE_DELAYED, &sh->state); + atomic_inc(&sh->count); + + mutex_lock(&log->io_mutex); + /* meta + data */ + reserve = (1 + pages) << (PAGE_SHIFT - 9); + + if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && + sh->log_start == MaxSector) + r5l_add_no_space_stripe(log, sh); + else if (!r5l_has_free_space(log, reserve)) { + if (sh->log_start == log->last_checkpoint) + BUG(); + else + r5l_add_no_space_stripe(log, sh); + } else { + ret = r5l_log_stripe(log, sh, pages, 0); + if (ret) { + spin_lock_irq(&log->io_list_lock); + list_add_tail(&sh->log_list, &log->no_mem_stripes); + spin_unlock_irq(&log->io_list_lock); + } + } + + mutex_unlock(&log->io_mutex); + return 0; +} + +/* check whether this big stripe is in write back cache. */ +bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect) +{ + struct r5l_log *log = conf->log; + sector_t tree_index; + void *slot; + + if (!log) + return false; + + WARN_ON_ONCE(!rcu_read_lock_held()); + tree_index = r5c_tree_index(conf, sect); + slot = radix_tree_lookup(&log->big_stripe_tree, tree_index); + return slot != NULL; } static int r5l_load_log(struct r5l_log *log) @@ -1121,7 +2701,7 @@ static int r5l_load_log(struct r5l_log *log) sector_t cp = log->rdev->journal_tail; u32 stored_crc, expected_crc; bool create_super = false; - int ret; + int ret = 0; /* Make sure it's valid */ if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp) @@ -1171,16 +2751,36 @@ create: if (log->max_free_space > RECLAIM_MAX_FREE_SPACE) log->max_free_space = RECLAIM_MAX_FREE_SPACE; log->last_checkpoint = cp; - log->next_checkpoint = cp; __free_page(page); - return r5l_recovery_log(log); + if (create_super) { + log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS); + log->seq = log->last_cp_seq + 1; + log->next_checkpoint = cp; + } else + ret = r5l_recovery_log(log); + + r5c_update_log_state(log); + return ret; ioerr: __free_page(page); return ret; } +void r5c_update_on_rdev_error(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + struct r5l_log *log = conf->log; + + if (!log) + return; + + if (raid5_calc_degraded(conf) > 0 && + conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) + schedule_work(&log->disable_writeback_work); +} + int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) { struct request_queue *q = bdev_get_queue(rdev->bdev); @@ -1188,6 +2788,22 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) if (PAGE_SIZE != 4096) return -EINVAL; + + /* + * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and + * raid_disks r5l_payload_data_parity. + * + * Write journal and cache does not work for very big array + * (raid_disks > 203) + */ + if (sizeof(struct r5l_meta_block) + + ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) * + conf->raid_disks) > PAGE_SIZE) { + pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n", + mdname(conf->mddev), conf->raid_disks); + return -EINVAL; + } + log = kzalloc(sizeof(*log), GFP_KERNEL); if (!log) return -ENOMEM; @@ -1205,7 +2821,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) INIT_LIST_HEAD(&log->io_end_ios); INIT_LIST_HEAD(&log->flushing_ios); INIT_LIST_HEAD(&log->finished_ios); - bio_init(&log->flush_bio); + bio_init(&log->flush_bio, NULL, 0); log->io_kc = KMEM_CACHE(r5l_io_unit, 0); if (!log->io_kc) @@ -1223,10 +2839,15 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) if (!log->meta_pool) goto out_mempool; + spin_lock_init(&log->tree_lock); + INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN); + log->reclaim_thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, "reclaim"); if (!log->reclaim_thread) goto reclaim_thread; + log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; + init_waitqueue_head(&log->iounit_wait); INIT_LIST_HEAD(&log->no_mem_stripes); @@ -1234,14 +2855,24 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) INIT_LIST_HEAD(&log->no_space_stripes); spin_lock_init(&log->no_space_stripes_lock); + INIT_WORK(&log->deferred_io_work, r5l_submit_io_async); + INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async); + + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; + INIT_LIST_HEAD(&log->stripe_in_journal_list); + spin_lock_init(&log->stripe_in_journal_lock); + atomic_set(&log->stripe_in_journal_count, 0); + + rcu_assign_pointer(conf->log, log); + if (r5l_load_log(log)) goto error; - rcu_assign_pointer(conf->log, log); set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); return 0; error: + rcu_assign_pointer(conf->log, NULL); md_unregister_thread(&log->reclaim_thread); reclaim_thread: mempool_destroy(log->meta_pool); @@ -1258,6 +2889,7 @@ io_kc: void r5l_exit_log(struct r5l_log *log) { + flush_work(&log->disable_writeback_work); md_unregister_thread(&log->reclaim_thread); mempool_destroy(log->meta_pool); bioset_free(log->bs); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 92ac251e91e6..4fb09b3fcb41 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -55,6 +55,8 @@ #include <linux/ratelimit.h> #include <linux/nodemask.h> #include <linux/flex_array.h> +#include <linux/sched/signal.h> + #include <trace/events/block.h> #include "md.h" @@ -62,6 +64,8 @@ #include "raid0.h" #include "bitmap.h" +#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED) + #define cpu_to_group(cpu) cpu_to_node(cpu) #define ANY_GROUP NUMA_NO_NODE @@ -70,19 +74,6 @@ module_param(devices_handle_discard_safely, bool, 0644); MODULE_PARM_DESC(devices_handle_discard_safely, "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); static struct workqueue_struct *raid5_wq; -/* - * Stripe cache - */ - -#define NR_STRIPES 256 -#define STRIPE_SIZE PAGE_SIZE -#define STRIPE_SHIFT (PAGE_SHIFT - 9) -#define STRIPE_SECTORS (STRIPE_SIZE>>9) -#define IO_THRESHOLD 1 -#define BYPASS_THRESHOLD 1 -#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) -#define HASH_MASK (NR_HASH - 1) -#define MAX_STRIPE_BATCH 8 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) { @@ -126,64 +117,6 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) local_irq_enable(); } -/* bio's attached to a stripe+device for I/O are linked together in bi_sector - * order without overlap. There may be several bio's per stripe+device, and - * a bio could span several devices. - * When walking this list for a particular stripe+device, we must never proceed - * beyond a bio that extends past this device, as the next bio might no longer - * be valid. - * This function is used to determine the 'next' bio in the list, given the sector - * of the current stripe+device - */ -static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) -{ - int sectors = bio_sectors(bio); - if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS) - return bio->bi_next; - else - return NULL; -} - -/* - * We maintain a biased count of active stripes in the bottom 16 bits of - * bi_phys_segments, and a count of processed stripes in the upper 16 bits - */ -static inline int raid5_bi_processed_stripes(struct bio *bio) -{ - atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; - return (atomic_read(segments) >> 16) & 0xffff; -} - -static inline int raid5_dec_bi_active_stripes(struct bio *bio) -{ - atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; - return atomic_sub_return(1, segments) & 0xffff; -} - -static inline void raid5_inc_bi_active_stripes(struct bio *bio) -{ - atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; - atomic_inc(segments); -} - -static inline void raid5_set_bi_processed_stripes(struct bio *bio, - unsigned int cnt) -{ - atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; - int old, new; - - do { - old = atomic_read(segments); - new = (old & 0xffff) | (cnt << 16); - } while (atomic_cmpxchg(segments, old, new) != old); -} - -static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt) -{ - atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; - atomic_set(segments, cnt); -} - /* Find first data disk in a raid6 stripe */ static inline int raid6_d0(struct stripe_head *sh) { @@ -289,8 +222,27 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh) static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, struct list_head *temp_inactive_list) { + int i; + int injournal = 0; /* number of date pages with R5_InJournal */ + BUG_ON(!list_empty(&sh->lru)); BUG_ON(atomic_read(&conf->active_stripes)==0); + + if (r5c_is_writeback(conf->log)) + for (i = sh->disks; i--; ) + if (test_bit(R5_InJournal, &sh->dev[i].flags)) + injournal++; + /* + * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with + * data in journal, so they are not released to cached lists + */ + if (conf->quiesce && r5c_is_writeback(conf->log) && + !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) { + if (test_bit(STRIPE_R5C_CACHING, &sh->state)) + r5c_make_stripe_write_out(sh); + set_bit(STRIPE_HANDLE, &sh->state); + } + if (test_bit(STRIPE_HANDLE, &sh->state)) { if (test_bit(STRIPE_DELAYED, &sh->state) && !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) @@ -316,8 +268,30 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, < IO_THRESHOLD) md_wakeup_thread(conf->mddev->thread); atomic_dec(&conf->active_stripes); - if (!test_bit(STRIPE_EXPANDING, &sh->state)) - list_add_tail(&sh->lru, temp_inactive_list); + if (!test_bit(STRIPE_EXPANDING, &sh->state)) { + if (!r5c_is_writeback(conf->log)) + list_add_tail(&sh->lru, temp_inactive_list); + else { + WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)); + if (injournal == 0) + list_add_tail(&sh->lru, temp_inactive_list); + else if (injournal == conf->raid_disks - conf->max_degraded) { + /* full stripe */ + if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) + atomic_inc(&conf->r5c_cached_full_stripes); + if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) + atomic_dec(&conf->r5c_cached_partial_stripes); + list_add_tail(&sh->lru, &conf->r5c_full_stripe_list); + r5c_check_cached_full_stripe(conf); + } else + /* + * STRIPE_R5C_PARTIAL_STRIPE is set in + * r5c_try_caching_write(). No need to + * set it again. + */ + list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list); + } + } } } @@ -381,17 +355,15 @@ static void release_inactive_stripe_list(struct r5conf *conf, static int release_stripe_list(struct r5conf *conf, struct list_head *temp_inactive_list) { - struct stripe_head *sh; + struct stripe_head *sh, *t; int count = 0; struct llist_node *head; head = llist_del_all(&conf->released_stripes); head = llist_reverse_order(head); - while (head) { + llist_for_each_entry_safe(sh, t, head, release_list) { int hash; - sh = llist_entry(head, struct stripe_head, release_list); - head = llist_next(head); /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ smp_mb(); clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); @@ -541,7 +513,7 @@ retry: if (dev->toread || dev->read || dev->towrite || dev->written || test_bit(R5_LOCKED, &dev->flags)) { - printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", + pr_err("sector=%llx i=%d %p %p %p %p %d\n", (unsigned long long)sh->sector, i, dev->toread, dev->read, dev->towrite, dev->written, test_bit(R5_LOCKED, &dev->flags)); @@ -584,7 +556,7 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, * of the two sections, and some non-in_sync devices may * be insync in the section most affected by failed devices. */ -static int calc_degraded(struct r5conf *conf) +int raid5_calc_degraded(struct r5conf *conf) { int degraded, degraded2; int i; @@ -647,7 +619,7 @@ static int has_failed(struct r5conf *conf) if (conf->mddev->reshape_position == MaxSector) return conf->mddev->degraded > conf->max_degraded; - degraded = calc_degraded(conf); + degraded = raid5_calc_degraded(conf); if (degraded > conf->max_degraded) return 1; return 0; @@ -680,9 +652,12 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector, } if (noblock && sh == NULL) break; + + r5c_check_stripe_cache_usage(conf); if (!sh) { set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); + r5l_wake_reclaim(conf->log, 0); wait_event_lock_irq( conf->wait_for_stripe, !list_empty(conf->inactive_list + hash) && @@ -888,6 +863,43 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) return 1; } +static void flush_deferred_bios(struct r5conf *conf) +{ + struct bio_list tmp; + struct bio *bio; + + if (!conf->batch_bio_dispatch || !conf->group_cnt) + return; + + bio_list_init(&tmp); + spin_lock(&conf->pending_bios_lock); + bio_list_merge(&tmp, &conf->pending_bios); + bio_list_init(&conf->pending_bios); + spin_unlock(&conf->pending_bios_lock); + + while ((bio = bio_list_pop(&tmp))) + generic_make_request(bio); +} + +static void defer_bio_issue(struct r5conf *conf, struct bio *bio) +{ + /* + * change group_cnt will drain all bios, so this is safe + * + * A read generally means a read-modify-write, which usually means a + * randwrite, so we don't delay it + */ + if (!conf->batch_bio_dispatch || !conf->group_cnt || + bio_op(bio) == REQ_OP_READ) { + generic_make_request(bio); + return; + } + spin_lock(&conf->pending_bios_lock); + bio_list_add(&conf->pending_bios, bio); + spin_unlock(&conf->pending_bios_lock); + md_wakeup_thread(conf->mddev->thread); +} + static void raid5_end_read_request(struct bio *bi); static void @@ -901,8 +913,19 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) might_sleep(); - if (r5l_write_stripe(conf->log, sh) == 0) - return; + if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { + /* writing out phase */ + if (s->waiting_extra_page) + return; + if (r5l_write_stripe(conf->log, sh) == 0) + return; + } else { /* caching phase */ + if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) { + r5c_cache_data(conf->log, sh, s); + return; + } + } + for (i = disks; i--; ) { int op, op_flags = 0; int replace_only = 0; @@ -913,7 +936,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { op = REQ_OP_WRITE; if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) - op_flags = WRITE_FUA; + op_flags = REQ_FUA; if (test_bit(R5_Discard, &sh->dev[i].flags)) op = REQ_OP_DISCARD; } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) @@ -977,7 +1000,7 @@ again: if (bad < 0) { set_bit(BlockedBadBlocks, &rdev->flags); if (!conf->mddev->external && - conf->mddev->flags) { + conf->mddev->sb_flags) { /* It is very unlikely, but we might * still need to write out the * bad block log - better give it @@ -1029,7 +1052,17 @@ again: if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); - sh->dev[i].vec.bv_page = sh->dev[i].page; + + if (!op_is_write(op) && + test_bit(R5_InJournal, &sh->dev[i].flags)) + /* + * issuing read for a page in journal, this + * must be preparing for prexor in rmw; read + * the data into orig_page + */ + sh->dev[i].vec.bv_page = sh->dev[i].orig_page; + else + sh->dev[i].vec.bv_page = sh->dev[i].page; bi->bi_vcnt = 1; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_offset = 0; @@ -1047,7 +1080,7 @@ again: trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), bi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); - generic_make_request(bi); + defer_bio_issue(conf, bi); } if (rrdev) { if (s->syncing || s->expanding || s->expanded @@ -1092,7 +1125,7 @@ again: trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), rbi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); - generic_make_request(rbi); + defer_bio_issue(conf, rbi); } if (!rdev && !rrdev) { if (op_is_write(op)) @@ -1115,7 +1148,7 @@ again: static struct dma_async_tx_descriptor * async_copy_data(int frombio, struct bio *bio, struct page **page, sector_t sector, struct dma_async_tx_descriptor *tx, - struct stripe_head *sh) + struct stripe_head *sh, int no_skipcopy) { struct bio_vec bvl; struct bvec_iter iter; @@ -1155,7 +1188,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, if (frombio) { if (sh->raid_conf->skip_copy && b_offset == 0 && page_offset == 0 && - clen == STRIPE_SIZE) + clen == STRIPE_SIZE && + !no_skipcopy) *page = bio_page; else tx = async_memcpy(*page, bio_page, page_offset, @@ -1237,7 +1271,7 @@ static void ops_run_biofill(struct stripe_head *sh) while (rbi && rbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { tx = async_copy_data(0, rbi, &dev->page, - dev->sector, tx, sh); + dev->sector, tx, sh, 0); rbi = r5_next_bio(rbi, dev->sector); } } @@ -1364,10 +1398,15 @@ static int set_syndrome_sources(struct page **srcs, if (i == sh->qd_idx || i == sh->pd_idx || (srctype == SYNDROME_SRC_ALL) || (srctype == SYNDROME_SRC_WANT_DRAIN && - test_bit(R5_Wantdrain, &dev->flags)) || + (test_bit(R5_Wantdrain, &dev->flags) || + test_bit(R5_InJournal, &dev->flags))) || (srctype == SYNDROME_SRC_WRITTEN && - dev->written)) - srcs[slot] = sh->dev[i].page; + dev->written)) { + if (test_bit(R5_InJournal, &dev->flags)) + srcs[slot] = sh->dev[i].orig_page; + else + srcs[slot] = sh->dev[i].page; + } i = raid6_next_disk(i, disks); } while (i != d0_idx); @@ -1546,6 +1585,13 @@ static void ops_complete_prexor(void *stripe_head_ref) pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + + if (r5c_is_writeback(sh->raid_conf->log)) + /* + * raid5-cache write back uses orig_page during prexor. + * After prexor, it is time to free orig_page + */ + r5c_release_extra_page(sh); } static struct dma_async_tx_descriptor * @@ -1567,7 +1613,9 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* Only process blocks that are known to be uptodate */ - if (test_bit(R5_Wantdrain, &dev->flags)) + if (test_bit(R5_InJournal, &dev->flags)) + xor_srcs[count++] = dev->orig_page; + else if (test_bit(R5_Wantdrain, &dev->flags)) xor_srcs[count++] = dev->page; } @@ -1601,6 +1649,7 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, static struct dma_async_tx_descriptor * ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { + struct r5conf *conf = sh->raid_conf; int disks = sh->disks; int i; struct stripe_head *head_sh = sh; @@ -1618,6 +1667,11 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) again: dev = &sh->dev[i]; + /* + * clear R5_InJournal, so when rewriting a page in + * journal, it is not skipped by r5l_log_stripe() + */ + clear_bit(R5_InJournal, &dev->flags); spin_lock_irq(&sh->stripe_lock); chosen = dev->towrite; dev->towrite = NULL; @@ -1637,8 +1691,10 @@ again: set_bit(R5_Discard, &dev->flags); else { tx = async_copy_data(1, wbi, &dev->page, - dev->sector, tx, sh); - if (dev->page != dev->orig_page) { + dev->sector, tx, sh, + r5c_is_writeback(conf->log)); + if (dev->page != dev->orig_page && + !r5c_is_writeback(conf->log)) { set_bit(R5_SkipCopy, &dev->flags); clear_bit(R5_UPTODATE, &dev->flags); clear_bit(R5_OVERWRITE, &dev->flags); @@ -1746,7 +1802,8 @@ again: xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (head_sh->dev[i].written) + if (head_sh->dev[i].written || + test_bit(R5_InJournal, &head_sh->dev[i].flags)) xor_srcs[count++] = dev->page; } } else { @@ -2000,17 +2057,15 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, spin_lock_init(&sh->batch_lock); INIT_LIST_HEAD(&sh->batch_list); INIT_LIST_HEAD(&sh->lru); + INIT_LIST_HEAD(&sh->r5c); + INIT_LIST_HEAD(&sh->log_list); atomic_set(&sh->count, 1); + sh->log_start = MaxSector; for (i = 0; i < disks; i++) { struct r5dev *dev = &sh->dev[i]; - bio_init(&dev->req); - dev->req.bi_io_vec = &dev->vec; - dev->req.bi_max_vecs = 1; - - bio_init(&dev->rreq); - dev->rreq.bi_io_vec = &dev->rvec; - dev->rreq.bi_max_vecs = 1; + bio_init(&dev->req, &dev->vec, 1); + bio_init(&dev->rreq, &dev->rvec, 1); } } return sh; @@ -2245,10 +2300,24 @@ static int resize_stripes(struct r5conf *conf, int newsize) */ ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); if (ndisks) { - for (i=0; i<conf->raid_disks; i++) + for (i = 0; i < conf->pool_size; i++) ndisks[i] = conf->disks[i]; - kfree(conf->disks); - conf->disks = ndisks; + + for (i = conf->pool_size; i < newsize; i++) { + ndisks[i].extra_page = alloc_page(GFP_NOIO); + if (!ndisks[i].extra_page) + err = -ENOMEM; + } + + if (err) { + for (i = conf->pool_size; i < newsize; i++) + if (ndisks[i].extra_page) + put_page(ndisks[i].extra_page); + kfree(ndisks); + } else { + kfree(conf->disks); + conf->disks = ndisks; + } } else err = -ENOMEM; @@ -2347,10 +2416,8 @@ static void raid5_end_read_request(struct bio * bi) * replacement device. We just fail those on * any error */ - printk_ratelimited( - KERN_INFO - "md/raid:%s: read error corrected" - " (%lu sectors at %llu on %s)\n", + pr_info_ratelimited( + "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n", mdname(conf->mddev), STRIPE_SECTORS, (unsigned long long)s, bdevname(rdev->bdev, b)); @@ -2360,6 +2427,13 @@ static void raid5_end_read_request(struct bio * bi) } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); + if (test_bit(R5_InJournal, &sh->dev[i].flags)) + /* + * end read for a page in journal, this + * must be preparing for prexor in rmw + */ + set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); + if (atomic_read(&rdev->read_errors)) atomic_set(&rdev->read_errors, 0); } else { @@ -2370,36 +2444,29 @@ static void raid5_end_read_request(struct bio * bi) clear_bit(R5_UPTODATE, &sh->dev[i].flags); atomic_inc(&rdev->read_errors); if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) - printk_ratelimited( - KERN_WARNING - "md/raid:%s: read error on replacement device " - "(sector %llu on %s).\n", + pr_warn_ratelimited( + "md/raid:%s: read error on replacement device (sector %llu on %s).\n", mdname(conf->mddev), (unsigned long long)s, bdn); else if (conf->mddev->degraded >= conf->max_degraded) { set_bad = 1; - printk_ratelimited( - KERN_WARNING - "md/raid:%s: read error not correctable " - "(sector %llu on %s).\n", + pr_warn_ratelimited( + "md/raid:%s: read error not correctable (sector %llu on %s).\n", mdname(conf->mddev), (unsigned long long)s, bdn); } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) { /* Oh, no!!! */ set_bad = 1; - printk_ratelimited( - KERN_WARNING - "md/raid:%s: read error NOT corrected!! " - "(sector %llu on %s).\n", + pr_warn_ratelimited( + "md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n", mdname(conf->mddev), (unsigned long long)s, bdn); } else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) - printk(KERN_WARNING - "md/raid:%s: Too many read errors, failing device %s.\n", + pr_warn("md/raid:%s: Too many read errors, failing device %s.\n", mdname(conf->mddev), bdn); else retry = 1; @@ -2525,21 +2592,21 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) spin_lock_irqsave(&conf->device_lock, flags); clear_bit(In_sync, &rdev->flags); - mddev->degraded = calc_degraded(conf); + mddev->degraded = raid5_calc_degraded(conf); spin_unlock_irqrestore(&conf->device_lock, flags); set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); - set_mask_bits(&mddev->flags, 0, - BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); - printk(KERN_ALERT - "md/raid:%s: Disk failure on %s, disabling device.\n" - "md/raid:%s: Operation continuing on %d devices.\n", - mdname(mddev), - bdevname(rdev->bdev, b), - mdname(mddev), - conf->raid_disks - mddev->degraded); + set_mask_bits(&mddev->sb_flags, 0, + BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); + pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n" + "md/raid:%s: Operation continuing on %d devices.\n", + mdname(mddev), + bdevname(rdev->bdev, b), + mdname(mddev), + conf->raid_disks - mddev->degraded); + r5c_update_on_rdev_error(mddev); } /* @@ -2861,13 +2928,61 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) previous, &dummy1, &sh2); if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx || sh2.qd_idx != sh->qd_idx) { - printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", - mdname(conf->mddev)); + pr_warn("md/raid:%s: compute_blocknr: map not correct\n", + mdname(conf->mddev)); return 0; } return r_sector; } +/* + * There are cases where we want handle_stripe_dirtying() and + * schedule_reconstruction() to delay towrite to some dev of a stripe. + * + * This function checks whether we want to delay the towrite. Specifically, + * we delay the towrite when: + * + * 1. degraded stripe has a non-overwrite to the missing dev, AND this + * stripe has data in journal (for other devices). + * + * In this case, when reading data for the non-overwrite dev, it is + * necessary to handle complex rmw of write back cache (prexor with + * orig_page, and xor with page). To keep read path simple, we would + * like to flush data in journal to RAID disks first, so complex rmw + * is handled in the write patch (handle_stripe_dirtying). + * + * 2. when journal space is critical (R5C_LOG_CRITICAL=1) + * + * It is important to be able to flush all stripes in raid5-cache. + * Therefore, we need reserve some space on the journal device for + * these flushes. If flush operation includes pending writes to the + * stripe, we need to reserve (conf->raid_disk + 1) pages per stripe + * for the flush out. If we exclude these pending writes from flush + * operation, we only need (conf->max_degraded + 1) pages per stripe. + * Therefore, excluding pending writes in these cases enables more + * efficient use of the journal device. + * + * Note: To make sure the stripe makes progress, we only delay + * towrite for stripes with data already in journal (injournal > 0). + * When LOG_CRITICAL, stripes with injournal == 0 will be sent to + * no_space_stripes list. + * + */ +static inline bool delay_towrite(struct r5conf *conf, + struct r5dev *dev, + struct stripe_head_state *s) +{ + /* case 1 above */ + if (!test_bit(R5_OVERWRITE, &dev->flags) && + !test_bit(R5_Insync, &dev->flags) && s->injournal) + return true; + /* case 2 above */ + if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && + s->injournal > 0) + return true; + return false; +} + static void schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, int rcw, int expand) @@ -2877,16 +2992,26 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, int level = conf->level; if (rcw) { + /* + * In some cases, handle_stripe_dirtying initially decided to + * run rmw and allocates extra page for prexor. However, rcw is + * cheaper later on. We need to free the extra page now, + * because we won't be able to do that in ops_complete_prexor(). + */ + r5c_release_extra_page(sh); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (dev->towrite) { + if (dev->towrite && !delay_towrite(conf, dev, s)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantdrain, &dev->flags); if (!expand) clear_bit(R5_UPTODATE, &dev->flags); s->locked++; + } else if (test_bit(R5_InJournal, &dev->flags)) { + set_bit(R5_LOCKED, &dev->flags); + s->locked++; } } /* if we are not expanding this is a proper write request, and @@ -2926,6 +3051,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, set_bit(R5_LOCKED, &dev->flags); clear_bit(R5_UPTODATE, &dev->flags); s->locked++; + } else if (test_bit(R5_InJournal, &dev->flags)) { + set_bit(R5_LOCKED, &dev->flags); + s->locked++; } } if (!s->locked) @@ -3270,13 +3398,6 @@ static int want_replace(struct stripe_head *sh, int disk_idx) return rv; } -/* fetch_block - checks the given member device to see if its data needs - * to be read or computed to satisfy a request. - * - * Returns 1 when no more member devices need to be checked, otherwise returns - * 0 to tell the loop in handle_stripe_fill to continue - */ - static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, int disk_idx, int disks) { @@ -3367,6 +3488,12 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, return 0; } +/* fetch_block - checks the given member device to see if its data needs + * to be read or computed to satisfy a request. + * + * Returns 1 when no more member devices need to be checked, otherwise returns + * 0 to tell the loop in handle_stripe_fill to continue + */ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, int disk_idx, int disks) { @@ -3453,10 +3580,26 @@ static void handle_stripe_fill(struct stripe_head *sh, * midst of changing due to a write */ if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && - !sh->reconstruct_state) + !sh->reconstruct_state) { + + /* + * For degraded stripe with data in journal, do not handle + * read requests yet, instead, flush the stripe to raid + * disks first, this avoids handling complex rmw of write + * back cache (prexor with orig_page, and then xor with + * page) in the read path + */ + if (s->injournal && s->failed) { + if (test_bit(STRIPE_R5C_CACHING, &sh->state)) + r5c_make_stripe_write_out(sh); + goto out; + } + for (i = disks; i--; ) if (fetch_block(sh, s, i, disks)) break; + } +out: set_bit(STRIPE_HANDLE, &sh->state); } @@ -3569,10 +3712,25 @@ unhash: break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); } -static void handle_stripe_dirtying(struct r5conf *conf, - struct stripe_head *sh, - struct stripe_head_state *s, - int disks) +/* + * For RMW in write back cache, we need extra page in prexor to store the + * old data. This page is stored in dev->orig_page. + * + * This function checks whether we have data for prexor. The exact logic + * is: + * R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE) + */ +static inline bool uptodate_for_rmw(struct r5dev *dev) +{ + return (test_bit(R5_UPTODATE, &dev->flags)) && + (!test_bit(R5_InJournal, &dev->flags) || + test_bit(R5_OrigPageUPTDODATE, &dev->flags)); +} + +static int handle_stripe_dirtying(struct r5conf *conf, + struct stripe_head *sh, + struct stripe_head_state *s, + int disks) { int rmw = 0, rcw = 0, i; sector_t recovery_cp = conf->mddev->recovery_cp; @@ -3597,9 +3755,11 @@ static void handle_stripe_dirtying(struct r5conf *conf, } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; - if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && + if (((dev->towrite && !delay_towrite(conf, dev, s)) || + i == sh->pd_idx || i == sh->qd_idx || + test_bit(R5_InJournal, &dev->flags)) && !test_bit(R5_LOCKED, &dev->flags) && - !(test_bit(R5_UPTODATE, &dev->flags) || + !(uptodate_for_rmw(dev) || test_bit(R5_Wantcompute, &dev->flags))) { if (test_bit(R5_Insync, &dev->flags)) rmw++; @@ -3611,15 +3771,16 @@ static void handle_stripe_dirtying(struct r5conf *conf, i != sh->pd_idx && i != sh->qd_idx && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Wantcompute, &dev->flags))) { + test_bit(R5_Wantcompute, &dev->flags))) { if (test_bit(R5_Insync, &dev->flags)) rcw++; else rcw += 2*disks; } } - pr_debug("for sector %llu, rmw=%d rcw=%d\n", - (unsigned long long)sh->sector, rmw, rcw); + + pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n", + (unsigned long long)sh->sector, sh->state, rmw, rcw); set_bit(STRIPE_HANDLE, &sh->state); if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { /* prefer read-modify-write, but need to get some data */ @@ -3629,10 +3790,42 @@ static void handle_stripe_dirtying(struct r5conf *conf, (unsigned long long)sh->sector, rmw); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) && + if (test_bit(R5_InJournal, &dev->flags) && + dev->page == dev->orig_page && + !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) { + /* alloc page for prexor */ + struct page *p = alloc_page(GFP_NOIO); + + if (p) { + dev->orig_page = p; + continue; + } + + /* + * alloc_page() failed, try use + * disk_info->extra_page + */ + if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE, + &conf->cache_state)) { + r5c_use_extra_page(sh); + break; + } + + /* extra_page in use, add to delayed_list */ + set_bit(STRIPE_DELAYED, &sh->state); + s->waiting_extra_page = 1; + return -EAGAIN; + } + } + + for (i = disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + if (((dev->towrite && !delay_towrite(conf, dev, s)) || + i == sh->pd_idx || i == sh->qd_idx || + test_bit(R5_InJournal, &dev->flags)) && !test_bit(R5_LOCKED, &dev->flags) && - !(test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Wantcompute, &dev->flags)) && + !(uptodate_for_rmw(dev) || + test_bit(R5_Wantcompute, &dev->flags)) && test_bit(R5_Insync, &dev->flags)) { if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { @@ -3697,8 +3890,9 @@ static void handle_stripe_dirtying(struct r5conf *conf, */ if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && (s->locked == 0 && (rcw == 0 || rmw == 0) && - !test_bit(STRIPE_BIT_DELAY, &sh->state))) + !test_bit(STRIPE_BIT_DELAY, &sh->state))) schedule_reconstruction(sh, s, rcw == 0, 0); + return 0; } static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, @@ -3782,7 +3976,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, case check_state_compute_run: break; default: - printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", + pr_err("%s: unknown check_state: %d sector: %llu\n", __func__, sh->check_state, (unsigned long long) sh->sector); BUG(); @@ -3946,9 +4140,9 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, case check_state_compute_run: break; default: - printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", - __func__, sh->check_state, - (unsigned long long) sh->sector); + pr_warn("%s: unknown check_state: %d sector: %llu\n", + __func__, sh->check_state, + (unsigned long long) sh->sector); BUG(); } } @@ -4188,6 +4382,11 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (rdev && !test_bit(Faulty, &rdev->flags)) do_recovery = 1; } + + if (test_bit(R5_InJournal, &dev->flags)) + s->injournal++; + if (test_bit(R5_InJournal, &dev->flags) && dev->written) + s->just_cached++; } if (test_bit(STRIPE_SYNCING, &sh->state)) { /* If there is a failed device being replaced, @@ -4416,7 +4615,8 @@ static void handle_stripe(struct stripe_head *sh) struct r5dev *dev = &sh->dev[i]; if (test_bit(R5_LOCKED, &dev->flags) && (i == sh->pd_idx || i == sh->qd_idx || - dev->written)) { + dev->written || test_bit(R5_InJournal, + &dev->flags))) { pr_debug("Writing block %d\n", i); set_bit(R5_Wantwrite, &dev->flags); if (prexor) @@ -4456,6 +4656,10 @@ static void handle_stripe(struct stripe_head *sh) test_bit(R5_Discard, &qdev->flags)))))) handle_stripe_clean_event(conf, sh, disks, &s.return_bi); + if (s.just_cached) + r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi); + r5l_stripe_write_finished(sh); + /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests * or to load a block that is being partially written. @@ -4467,14 +4671,51 @@ static void handle_stripe(struct stripe_head *sh) || s.expanding) handle_stripe_fill(sh, &s, disks); - /* Now to consider new write requests and what else, if anything - * should be read. We do not handle new writes when: + /* + * When the stripe finishes full journal write cycle (write to journal + * and raid disk), this is the clean up procedure so it is ready for + * next operation. + */ + r5c_finish_stripe_write_out(conf, sh, &s); + + /* + * Now to consider new write requests, cache write back and what else, + * if anything should be read. We do not handle new writes when: * 1/ A 'write' operation (copy+xor) is already in flight. * 2/ A 'check' operation is in flight, as it may clobber the parity * block. + * 3/ A r5c cache log write is in flight. */ - if (s.to_write && !sh->reconstruct_state && !sh->check_state) - handle_stripe_dirtying(conf, sh, &s, disks); + + if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) { + if (!r5c_is_writeback(conf->log)) { + if (s.to_write) + handle_stripe_dirtying(conf, sh, &s, disks); + } else { /* write back cache */ + int ret = 0; + + /* First, try handle writes in caching phase */ + if (s.to_write) + ret = r5c_try_caching_write(conf, sh, &s, + disks); + /* + * If caching phase failed: ret == -EAGAIN + * OR + * stripe under reclaim: !caching && injournal + * + * fall back to handle_stripe_dirtying() + */ + if (ret == -EAGAIN || + /* stripe under reclaim: !caching && injournal */ + (!test_bit(STRIPE_R5C_CACHING, &sh->state) && + s.injournal > 0)) { + ret = handle_stripe_dirtying(conf, sh, &s, + disks); + if (ret == -EAGAIN) + goto finish; + } + } + } /* maybe we need to check and possibly fix the parity for this stripe * Any reads will already have been scheduled, so we just see if enough @@ -4645,9 +4886,7 @@ finish: } if (!bio_list_empty(&s.return_bi)) { - if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags) && - (s.failed <= conf->max_degraded || - conf->mddev->external == 0)) { + if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->return_bi, &s.return_bi); spin_unlock_irq(&conf->device_lock); @@ -4703,6 +4942,10 @@ static int raid5_congested(struct mddev *mddev, int bits) if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) return 1; + + /* Also checks whether there is pressure on r5cache log space */ + if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) + return 1; if (conf->quiesce) return 1; if (atomic_read(&conf->empty_inactive_list_nr)) @@ -4813,9 +5056,9 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) return 0; } /* - * use bio_clone_mddev to make a copy of the bio + * use bio_clone_fast to make a copy of the bio */ - align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); + align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set); if (!align_bi) return 0; /* @@ -4843,6 +5086,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) rdev->recovery_offset >= end_sector))) rdev = NULL; } + + if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) { + rcu_read_unlock(); + bio_put(align_bi); + return 0; + } + if (rdev) { sector_t first_bad; int bad_sectors; @@ -5172,6 +5422,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) int remaining; DEFINE_WAIT(w); bool do_prepare; + bool do_flush = false; if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { int ret = r5l_handle_flush_request(conf->log, bi); @@ -5183,6 +5434,11 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) return; } /* ret == -EAGAIN, fallback */ + /* + * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, + * we need to flush journal device + */ + do_flush = bi->bi_opf & REQ_PREFLUSH; } md_write_start(mddev, bi); @@ -5321,6 +5577,12 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) do_prepare = true; goto retry; } + if (do_flush) { + set_bit(STRIPE_R5C_PREFLUSH, &sh->state); + /* we only need flush for one stripe */ + do_flush = false; + } + set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); if ((!sh->batch_head || sh == sh->batch_head) && @@ -5486,9 +5748,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); - wait_event(mddev->sb_wait, mddev->flags == 0 || + wait_event(mddev->sb_wait, mddev->sb_flags == 0 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) return 0; @@ -5584,10 +5846,10 @@ finish: mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, - !test_bit(MD_CHANGE_DEVS, &mddev->flags) + !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) goto ret; @@ -5862,10 +6124,10 @@ static void raid5d(struct md_thread *thread) md_check_recovery(mddev); if (!bio_list_empty(&conf->return_bi) && - !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { struct bio_list tmp = BIO_EMPTY_LIST; spin_lock_irq(&conf->device_lock); - if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { bio_list_merge(&tmp, &conf->return_bi); bio_list_init(&conf->return_bi); } @@ -5912,7 +6174,7 @@ static void raid5d(struct md_thread *thread) break; handled += batch_size; - if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) { + if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) { spin_unlock_irq(&conf->device_lock); md_check_recovery(mddev); spin_lock_irq(&conf->device_lock); @@ -5931,6 +6193,8 @@ static void raid5d(struct md_thread *thread) mutex_unlock(&conf->cache_size_mutex); } + flush_deferred_bios(conf); + r5l_flush_stripe_to_raid(conf->log); async_tx_issue_pending_all(); @@ -6136,10 +6400,10 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) mddev_suspend(mddev); conf->skip_copy = new; if (new) - mddev->queue->backing_dev_info.capabilities |= + mddev->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; else - mddev->queue->backing_dev_info.capabilities &= + mddev->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES; mddev_resume(mddev); } @@ -6242,6 +6506,7 @@ static struct attribute *raid5_attrs[] = { &raid5_group_thread_cnt.attr, &raid5_skip_copy.attr, &raid5_rmw_level.attr, + &r5c_journal_mode.attr, NULL, }; static struct attribute_group raid5_attrs_group = { @@ -6368,6 +6633,8 @@ static void raid5_free_percpu(struct r5conf *conf) static void free_conf(struct r5conf *conf) { + int i; + if (conf->log) r5l_exit_log(conf->log); if (conf->shrinker.nr_deferred) @@ -6376,6 +6643,9 @@ static void free_conf(struct r5conf *conf) free_thread_groups(conf); shrink_stripes(conf); raid5_free_percpu(conf); + for (i = 0; i < conf->pool_size; i++) + if (conf->disks[i].extra_page) + put_page(conf->disks[i].extra_page); kfree(conf->disks); kfree(conf->stripe_hashtbl); kfree(conf); @@ -6387,8 +6657,8 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); if (alloc_scratch_buffer(conf, percpu)) { - pr_err("%s: failed memory allocation for cpu%u\n", - __func__, cpu); + pr_warn("%s: failed memory allocation for cpu%u\n", + __func__, cpu); return -ENOMEM; } return 0; @@ -6458,29 +6728,29 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (mddev->new_level != 5 && mddev->new_level != 4 && mddev->new_level != 6) { - printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", - mdname(mddev), mddev->new_level); + pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n", + mdname(mddev), mddev->new_level); return ERR_PTR(-EIO); } if ((mddev->new_level == 5 && !algorithm_valid_raid5(mddev->new_layout)) || (mddev->new_level == 6 && !algorithm_valid_raid6(mddev->new_layout))) { - printk(KERN_ERR "md/raid:%s: layout %d not supported\n", - mdname(mddev), mddev->new_layout); + pr_warn("md/raid:%s: layout %d not supported\n", + mdname(mddev), mddev->new_layout); return ERR_PTR(-EIO); } if (mddev->new_level == 6 && mddev->raid_disks < 4) { - printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", - mdname(mddev), mddev->raid_disks); + pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n", + mdname(mddev), mddev->raid_disks); return ERR_PTR(-EINVAL); } if (!mddev->new_chunk_sectors || (mddev->new_chunk_sectors << 9) % PAGE_SIZE || !is_power_of_2(mddev->new_chunk_sectors)) { - printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", - mdname(mddev), mddev->new_chunk_sectors << 9); + pr_warn("md/raid:%s: invalid chunk size %d\n", + mdname(mddev), mddev->new_chunk_sectors << 9); return ERR_PTR(-EINVAL); } @@ -6510,6 +6780,18 @@ static struct r5conf *setup_conf(struct mddev *mddev) atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); atomic_set(&conf->active_aligned_reads, 0); + bio_list_init(&conf->pending_bios); + spin_lock_init(&conf->pending_bios_lock); + conf->batch_bio_dispatch = true; + rdev_for_each(rdev, mddev) { + if (test_bit(Journal, &rdev->flags)) + continue; + if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { + conf->batch_bio_dispatch = false; + break; + } + } + conf->bypass_threshold = BYPASS_THRESHOLD; conf->recovery_disabled = mddev->recovery_disabled - 1; @@ -6522,9 +6804,16 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf->disks = kzalloc(max_disks * sizeof(struct disk_info), GFP_KERNEL); + if (!conf->disks) goto abort; + for (i = 0; i < max_disks; i++) { + conf->disks[i].extra_page = alloc_page(GFP_KERNEL); + if (!conf->disks[i].extra_page) + goto abort; + } + conf->mddev = mddev; if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) @@ -6545,6 +6834,13 @@ static struct r5conf *setup_conf(struct mddev *mddev) for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) INIT_LIST_HEAD(conf->temp_inactive_list + i); + atomic_set(&conf->r5c_cached_full_stripes, 0); + INIT_LIST_HEAD(&conf->r5c_full_stripe_list); + atomic_set(&conf->r5c_cached_partial_stripes, 0); + INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); + atomic_set(&conf->r5c_flushing_full_stripes, 0); + atomic_set(&conf->r5c_flushing_partial_stripes, 0); + conf->level = mddev->new_level; conf->chunk_sectors = mddev->new_chunk_sectors; if (raid5_alloc_percpu(conf) != 0) @@ -6571,9 +6867,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (test_bit(In_sync, &rdev->flags)) { char b[BDEVNAME_SIZE]; - printk(KERN_INFO "md/raid:%s: device %s operational as raid" - " disk %d\n", - mdname(mddev), bdevname(rdev->bdev, b), raid_disk); + pr_info("md/raid:%s: device %s operational as raid disk %d\n", + mdname(mddev), bdevname(rdev->bdev, b), raid_disk); } else if (rdev->saved_raid_disk != raid_disk) /* Cannot rely on bitmap to complete recovery */ conf->fullsync = 1; @@ -6607,21 +6902,18 @@ static struct r5conf *setup_conf(struct mddev *mddev) ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); conf->min_nr_stripes = max(NR_STRIPES, stripes); if (conf->min_nr_stripes != NR_STRIPES) - printk(KERN_INFO - "md/raid:%s: force stripe size %d for reshape\n", + pr_info("md/raid:%s: force stripe size %d for reshape\n", mdname(mddev), conf->min_nr_stripes); } memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); if (grow_stripes(conf, conf->min_nr_stripes)) { - printk(KERN_ERR - "md/raid:%s: couldn't allocate %dkB for buffers\n", - mdname(mddev), memory); + pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n", + mdname(mddev), memory); goto abort; } else - printk(KERN_INFO "md/raid:%s: allocated %dkB\n", - mdname(mddev), memory); + pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory); /* * Losing a stripe head costs more than the time to refill it, * it reduces the queue depth and so can hurt throughput. @@ -6633,18 +6925,16 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf->shrinker.batch = 128; conf->shrinker.flags = 0; if (register_shrinker(&conf->shrinker)) { - printk(KERN_ERR - "md/raid:%s: couldn't register shrinker.\n", - mdname(mddev)); + pr_warn("md/raid:%s: couldn't register shrinker.\n", + mdname(mddev)); goto abort; } sprintf(pers_name, "raid%d", mddev->new_level); conf->thread = md_register_thread(raid5d, mddev, pers_name); if (!conf->thread) { - printk(KERN_ERR - "md/raid:%s: couldn't allocate thread.\n", - mdname(mddev)); + pr_warn("md/raid:%s: couldn't allocate thread.\n", + mdname(mddev)); goto abort; } @@ -6697,9 +6987,8 @@ static int raid5_run(struct mddev *mddev) int first = 1; if (mddev->recovery_cp != MaxSector) - printk(KERN_NOTICE "md/raid:%s: not clean" - " -- starting background reconstruction\n", - mdname(mddev)); + pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", + mdname(mddev)); rdev_for_each(rdev, mddev) { long long diff; @@ -6742,15 +7031,14 @@ static int raid5_run(struct mddev *mddev) int new_data_disks; if (journal_dev) { - printk(KERN_ERR "md/raid:%s: don't support reshape with journal - aborting.\n", - mdname(mddev)); + pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n", + mdname(mddev)); return -EINVAL; } if (mddev->new_level != mddev->level) { - printk(KERN_ERR "md/raid:%s: unsupported reshape " - "required - aborting.\n", - mdname(mddev)); + pr_warn("md/raid:%s: unsupported reshape required - aborting.\n", + mdname(mddev)); return -EINVAL; } old_disks = mddev->raid_disks - mddev->delta_disks; @@ -6765,8 +7053,8 @@ static int raid5_run(struct mddev *mddev) chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); new_data_disks = mddev->raid_disks - max_degraded; if (sector_div(here_new, chunk_sectors * new_data_disks)) { - printk(KERN_ERR "md/raid:%s: reshape_position not " - "on a stripe boundary\n", mdname(mddev)); + pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n", + mdname(mddev)); return -EINVAL; } reshape_offset = here_new * chunk_sectors; @@ -6787,10 +7075,8 @@ static int raid5_run(struct mddev *mddev) abs(min_offset_diff) >= mddev->new_chunk_sectors) /* not really in-place - so OK */; else if (mddev->ro == 0) { - printk(KERN_ERR "md/raid:%s: in-place reshape " - "must be started in read-only mode " - "- aborting\n", - mdname(mddev)); + pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n", + mdname(mddev)); return -EINVAL; } } else if (mddev->reshape_backwards @@ -6799,13 +7085,11 @@ static int raid5_run(struct mddev *mddev) : (here_new * chunk_sectors >= here_old * chunk_sectors + (-min_offset_diff))) { /* Reading from the same stripe as writing to - bad */ - printk(KERN_ERR "md/raid:%s: reshape_position too early for " - "auto-recovery - aborting.\n", - mdname(mddev)); + pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", + mdname(mddev)); return -EINVAL; } - printk(KERN_INFO "md/raid:%s: reshape will continue\n", - mdname(mddev)); + pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); /* OK, we should be able to continue; */ } else { BUG_ON(mddev->level != mddev->new_level); @@ -6824,8 +7108,8 @@ static int raid5_run(struct mddev *mddev) if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { if (!journal_dev) { - pr_err("md/raid:%s: journal disk is missing, force array readonly\n", - mdname(mddev)); + pr_warn("md/raid:%s: journal disk is missing, force array readonly\n", + mdname(mddev)); mddev->ro = 1; set_disk_ro(mddev->gendisk, 1); } else if (mddev->recovery_cp == MaxSector) @@ -6852,8 +7136,7 @@ static int raid5_run(struct mddev *mddev) if (conf->disks[i].replacement && conf->reshape_progress != MaxSector) { /* replacements and reshape simply do not mix. */ - printk(KERN_ERR "md: cannot handle concurrent " - "replacement and reshape.\n"); + pr_warn("md: cannot handle concurrent replacement and reshape.\n"); goto abort; } if (test_bit(In_sync, &rdev->flags)) { @@ -6892,11 +7175,10 @@ static int raid5_run(struct mddev *mddev) /* * 0 for a fully functional array, 1 or 2 for a degraded array. */ - mddev->degraded = calc_degraded(conf); + mddev->degraded = raid5_calc_degraded(conf); if (has_failed(conf)) { - printk(KERN_ERR "md/raid:%s: not enough operational devices" - " (%d/%d failed)\n", + pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", mdname(mddev), mddev->degraded, conf->raid_disks); goto abort; } @@ -6908,29 +7190,19 @@ static int raid5_run(struct mddev *mddev) if (mddev->degraded > dirty_parity_disks && mddev->recovery_cp != MaxSector) { if (mddev->ok_start_degraded) - printk(KERN_WARNING - "md/raid:%s: starting dirty degraded array" - " - data corruption possible.\n", - mdname(mddev)); + pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n", + mdname(mddev)); else { - printk(KERN_ERR - "md/raid:%s: cannot start dirty degraded array.\n", - mdname(mddev)); + pr_crit("md/raid:%s: cannot start dirty degraded array.\n", + mdname(mddev)); goto abort; } } - if (mddev->degraded == 0) - printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" - " devices, algorithm %d\n", mdname(mddev), conf->level, - mddev->raid_disks-mddev->degraded, mddev->raid_disks, - mddev->new_layout); - else - printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" - " out of %d devices, algorithm %d\n", - mdname(mddev), conf->level, - mddev->raid_disks - mddev->degraded, - mddev->raid_disks, mddev->new_layout); + pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n", + mdname(mddev), conf->level, + mddev->raid_disks-mddev->degraded, mddev->raid_disks, + mddev->new_layout); print_raid5_conf(conf); @@ -6950,9 +7222,8 @@ static int raid5_run(struct mddev *mddev) mddev->to_remove = NULL; else if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) - printk(KERN_WARNING - "raid5: failed to create sysfs attributes for %s\n", - mdname(mddev)); + pr_warn("raid5: failed to create sysfs attributes for %s\n", + mdname(mddev)); md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); if (mddev->queue) { @@ -6965,8 +7236,8 @@ static int raid5_run(struct mddev *mddev) int data_disks = conf->previous_raid_disks - conf->max_degraded; int stripe = data_disks * ((mddev->chunk_sectors << 9) / PAGE_SIZE); - if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) - mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) + mddev->queue->backing_dev_info->ra_pages = 2 * stripe; chunk_size = mddev->chunk_sectors << 9; blk_queue_io_min(mddev->queue, chunk_size); @@ -6984,6 +7255,15 @@ static int raid5_run(struct mddev *mddev) stripe = (stripe | (stripe-1)) + 1; mddev->queue->limits.discard_alignment = stripe; mddev->queue->limits.discard_granularity = stripe; + + /* + * We use 16-bit counter of active stripes in bi_phys_segments + * (minus one for over-loaded initialization) + */ + blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS); + blk_queue_max_discard_sectors(mddev->queue, + 0xfffe * STRIPE_SECTORS); + /* * unaligned part of discard request will be ignored, so can't * guarantee discard_zeroes_data @@ -7040,9 +7320,10 @@ static int raid5_run(struct mddev *mddev) if (journal_dev) { char b[BDEVNAME_SIZE]; - printk(KERN_INFO"md/raid:%s: using device %s as journal\n", - mdname(mddev), bdevname(journal_dev->bdev, b)); - r5l_init_log(conf, journal_dev); + pr_debug("md/raid:%s: using device %s as journal\n", + mdname(mddev), bdevname(journal_dev->bdev, b)); + if (r5l_init_log(conf, journal_dev)) + goto abort; } return 0; @@ -7051,7 +7332,7 @@ abort: print_raid5_conf(conf); free_conf(conf); mddev->private = NULL; - printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); + pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); return -EIO; } @@ -7085,12 +7366,12 @@ static void print_raid5_conf (struct r5conf *conf) int i; struct disk_info *tmp; - printk(KERN_DEBUG "RAID conf printout:\n"); + pr_debug("RAID conf printout:\n"); if (!conf) { - printk("(conf==NULL)\n"); + pr_debug("(conf==NULL)\n"); return; } - printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, + pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level, conf->raid_disks, conf->raid_disks - conf->mddev->degraded); @@ -7098,7 +7379,7 @@ static void print_raid5_conf (struct r5conf *conf) char b[BDEVNAME_SIZE]; tmp = conf->disks + i; if (tmp->rdev) - printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", + pr_debug(" disk %d, o:%d, dev:%s\n", i, !test_bit(Faulty, &tmp->rdev->flags), bdevname(tmp->rdev->bdev, b)); } @@ -7141,7 +7422,7 @@ static int raid5_spare_active(struct mddev *mddev) } } spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded = calc_degraded(conf); + mddev->degraded = raid5_calc_degraded(conf); spin_unlock_irqrestore(&conf->device_lock, flags); print_raid5_conf(conf); return count; @@ -7246,8 +7527,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) * write requests running. We should be safe */ r5l_init_log(conf, rdev); - printk(KERN_INFO"md/raid:%s: using device %s as journal\n", - mdname(mddev), bdevname(rdev->bdev, b)); + pr_debug("md/raid:%s: using device %s as journal\n", + mdname(mddev), bdevname(rdev->bdev, b)); return 0; } if (mddev->recovery_disabled == conf->recovery_disabled) @@ -7351,10 +7632,10 @@ static int check_stripe_cache(struct mddev *mddev) > conf->min_nr_stripes || ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 > conf->min_nr_stripes) { - printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", - mdname(mddev), - ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) - / STRIPE_SIZE)*4); + pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n", + mdname(mddev), + ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) + / STRIPE_SIZE)*4); return 0; } return 1; @@ -7435,8 +7716,8 @@ static int raid5_start_reshape(struct mddev *mddev) */ if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) < mddev->array_sectors) { - printk(KERN_ERR "md/raid:%s: array size must be reduced " - "before number of disks\n", mdname(mddev)); + pr_warn("md/raid:%s: array size must be reduced before number of disks\n", + mdname(mddev)); return -EINVAL; } @@ -7501,12 +7782,12 @@ static int raid5_start_reshape(struct mddev *mddev) * pre and post number of devices. */ spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded = calc_degraded(conf); + mddev->degraded = raid5_calc_degraded(conf); spin_unlock_irqrestore(&conf->device_lock, flags); } mddev->raid_disks = conf->raid_disks; mddev->reshape_position = conf->reshape_progress; - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); @@ -7565,8 +7846,8 @@ static void end_reshape(struct r5conf *conf) int data_disks = conf->raid_disks - conf->max_degraded; int stripe = data_disks * ((conf->chunk_sectors << 9) / PAGE_SIZE); - if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) - conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) + conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; } } } @@ -7589,7 +7870,7 @@ static void raid5_finish_reshape(struct mddev *mddev) } else { int d; spin_lock_irq(&conf->device_lock); - mddev->degraded = calc_degraded(conf); + mddev->degraded = raid5_calc_degraded(conf); spin_unlock_irq(&conf->device_lock); for (d = conf->raid_disks ; d < conf->raid_disks - mddev->delta_disks; @@ -7624,6 +7905,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) /* '2' tells resync/reshape to pause so that all * active stripes can drain */ + r5c_flush_cache(conf, INT_MAX); conf->quiesce = 2; wait_event_cmd(conf->wait_for_quiescent, atomic_read(&conf->active_stripes) == 0 && @@ -7654,8 +7936,8 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level) /* for raid0 takeover only one zone is supported */ if (raid0_conf->nr_strip_zones > 1) { - printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", - mdname(mddev)); + pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n", + mdname(mddev)); return ERR_PTR(-EINVAL); } @@ -7676,6 +7958,7 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level) static void *raid5_takeover_raid1(struct mddev *mddev) { int chunksect; + void *ret; if (mddev->raid_disks != 2 || mddev->degraded > 1) @@ -7697,7 +7980,11 @@ static void *raid5_takeover_raid1(struct mddev *mddev) mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; mddev->new_chunk_sectors = chunksect; - return setup_conf(mddev); + ret = setup_conf(mddev); + if (!IS_ERR(ret)) + mddev_clear_unsupported_flags(mddev, + UNSUPPORTED_MDDEV_FLAGS); + return ret; } static void *raid5_takeover_raid6(struct mddev *mddev) @@ -7767,7 +8054,7 @@ static int raid5_check_reshape(struct mddev *mddev) conf->chunk_sectors = new_chunk ; mddev->chunk_sectors = new_chunk; } - set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); } return check_reshape(mddev); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 57ec49f0839e..4bb27b97bf6b 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -226,6 +226,8 @@ struct stripe_head { struct r5l_io_unit *log_io; struct list_head log_list; + sector_t log_start; /* first meta block on the journal */ + struct list_head r5c; /* for r5c_cache->stripe_in_journal */ /** * struct stripe_operations * @target - STRIPE_OP_COMPUTE_BLK target @@ -264,6 +266,7 @@ struct stripe_head_state { int syncing, expanding, expanded, replacing; int locked, uptodate, to_read, to_write, failed, written; int to_fill, compute, req_compute, non_overwrite; + int injournal, just_cached; int failed_num[2]; int p_failed, q_failed; int dec_preread_active; @@ -273,6 +276,7 @@ struct stripe_head_state { struct md_rdev *blocked_rdev; int handle_bad_blocks; int log_failed; + int waiting_extra_page; }; /* Flags for struct r5dev.flags */ @@ -313,6 +317,16 @@ enum r5dev_flags { */ R5_Discard, /* Discard the stripe */ R5_SkipCopy, /* Don't copy data from bio to stripe cache */ + R5_InJournal, /* data being written is in the journal device. + * if R5_InJournal is set for parity pd_idx, all the + * data and parity being written are in the journal + * device + */ + R5_OrigPageUPTDODATE, /* with write back cache, we read old data into + * dev->orig_page for prexor. When this flag is + * set, orig_page contains latest data in the + * raid disk. + */ }; /* @@ -345,7 +359,30 @@ enum { STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add * to batch yet. */ - STRIPE_LOG_TRAPPED, /* trapped into log */ + STRIPE_LOG_TRAPPED, /* trapped into log (see raid5-cache.c) + * this bit is used in two scenarios: + * + * 1. write-out phase + * set in first entry of r5l_write_stripe + * clear in second entry of r5l_write_stripe + * used to bypass logic in handle_stripe + * + * 2. caching phase + * set in r5c_try_caching_write() + * clear when journal write is done + * used to initiate r5c_cache_data() + * also used to bypass logic in handle_stripe + */ + STRIPE_R5C_CACHING, /* the stripe is in caching phase + * see more detail in the raid5-cache.c + */ + STRIPE_R5C_PARTIAL_STRIPE, /* in r5c cache (to-be/being handled or + * in conf->r5c_partial_stripe_list) + */ + STRIPE_R5C_FULL_STRIPE, /* in r5c cache (to-be/being handled or + * in conf->r5c_full_stripe_list) + */ + STRIPE_R5C_PREFLUSH, /* need to flush journal device */ }; #define STRIPE_EXPAND_SYNC_FLAGS \ @@ -408,8 +445,86 @@ enum { struct disk_info { struct md_rdev *rdev, *replacement; + struct page *extra_page; /* extra page to use in prexor */ }; +/* + * Stripe cache + */ + +#define NR_STRIPES 256 +#define STRIPE_SIZE PAGE_SIZE +#define STRIPE_SHIFT (PAGE_SHIFT - 9) +#define STRIPE_SECTORS (STRIPE_SIZE>>9) +#define IO_THRESHOLD 1 +#define BYPASS_THRESHOLD 1 +#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) +#define HASH_MASK (NR_HASH - 1) +#define MAX_STRIPE_BATCH 8 + +/* bio's attached to a stripe+device for I/O are linked together in bi_sector + * order without overlap. There may be several bio's per stripe+device, and + * a bio could span several devices. + * When walking this list for a particular stripe+device, we must never proceed + * beyond a bio that extends past this device, as the next bio might no longer + * be valid. + * This function is used to determine the 'next' bio in the list, given the + * sector of the current stripe+device + */ +static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) +{ + int sectors = bio_sectors(bio); + + if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS) + return bio->bi_next; + else + return NULL; +} + +/* + * We maintain a biased count of active stripes in the bottom 16 bits of + * bi_phys_segments, and a count of processed stripes in the upper 16 bits + */ +static inline int raid5_bi_processed_stripes(struct bio *bio) +{ + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; + + return (atomic_read(segments) >> 16) & 0xffff; +} + +static inline int raid5_dec_bi_active_stripes(struct bio *bio) +{ + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; + + return atomic_sub_return(1, segments) & 0xffff; +} + +static inline void raid5_inc_bi_active_stripes(struct bio *bio) +{ + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; + + atomic_inc(segments); +} + +static inline void raid5_set_bi_processed_stripes(struct bio *bio, + unsigned int cnt) +{ + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; + int old, new; + + do { + old = atomic_read(segments); + new = (old & 0xffff) | (cnt << 16); + } while (atomic_cmpxchg(segments, old, new) != old); +} + +static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt) +{ + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; + + atomic_set(segments, cnt); +} + /* NOTE NR_STRIPE_HASH_LOCKS must remain below 64. * This is because we sometimes take all the spinlocks * and creating that much locking depth can cause @@ -432,6 +547,30 @@ struct r5worker_group { int stripes_cnt; }; +enum r5_cache_state { + R5_INACTIVE_BLOCKED, /* release of inactive stripes blocked, + * waiting for 25% to be free + */ + R5_ALLOC_MORE, /* It might help to allocate another + * stripe. + */ + R5_DID_ALLOC, /* A stripe was allocated, don't allocate + * more until at least one has been + * released. This avoids flooding + * the cache. + */ + R5C_LOG_TIGHT, /* log device space tight, need to + * prioritize stripes at last_checkpoint + */ + R5C_LOG_CRITICAL, /* log device is running out of space, + * only process stripes that are already + * occupying the log + */ + R5C_EXTRA_PAGE_IN_USE, /* a stripe is using disk_info.extra_page + * for prexor + */ +}; + struct r5conf { struct hlist_head *stripe_hashtbl; /* only protect corresponding hash list and inactive_list */ @@ -519,23 +658,20 @@ struct r5conf { */ atomic_t active_stripes; struct list_head inactive_list[NR_STRIPE_HASH_LOCKS]; + + atomic_t r5c_cached_full_stripes; + struct list_head r5c_full_stripe_list; + atomic_t r5c_cached_partial_stripes; + struct list_head r5c_partial_stripe_list; + atomic_t r5c_flushing_full_stripes; + atomic_t r5c_flushing_partial_stripes; + atomic_t empty_inactive_list_nr; struct llist_head released_stripes; wait_queue_head_t wait_for_quiescent; wait_queue_head_t wait_for_stripe; wait_queue_head_t wait_for_overlap; unsigned long cache_state; -#define R5_INACTIVE_BLOCKED 1 /* release of inactive stripes blocked, - * waiting for 25% to be free - */ -#define R5_ALLOC_MORE 2 /* It might help to allocate another - * stripe. - */ -#define R5_DID_ALLOC 4 /* A stripe was allocated, don't allocate - * more until at least one has been - * released. This avoids flooding - * the cache. - */ struct shrinker shrinker; int pool_size; /* number of disks in stripeheads in pool */ spinlock_t device_lock; @@ -550,6 +686,10 @@ struct r5conf { int group_cnt; int worker_cnt_per_group; struct r5l_log *log; + + struct bio_list pending_bios; + spinlock_t pending_bios_lock; + bool batch_bio_dispatch; }; @@ -624,6 +764,7 @@ extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, extern struct stripe_head * raid5_get_active_stripe(struct r5conf *conf, sector_t sector, int previous, int noblock, int noquiesce); +extern int raid5_calc_degraded(struct r5conf *conf); extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev); extern void r5l_exit_log(struct r5l_log *log); extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh); @@ -633,4 +774,25 @@ extern void r5l_stripe_write_finished(struct stripe_head *sh); extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); extern void r5l_quiesce(struct r5l_log *log, int state); extern bool r5l_log_disk_error(struct r5conf *conf); +extern bool r5c_is_writeback(struct r5l_log *log); +extern int +r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s, int disks); +extern void +r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s); +extern void r5c_release_extra_page(struct stripe_head *sh); +extern void r5c_use_extra_page(struct stripe_head *sh); +extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space); +extern void r5c_handle_cached_data_endio(struct r5conf *conf, + struct stripe_head *sh, int disks, struct bio_list *return_bi); +extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh, + struct stripe_head_state *s); +extern void r5c_make_stripe_write_out(struct stripe_head *sh); +extern void r5c_flush_cache(struct r5conf *conf, int num); +extern void r5c_check_stripe_cache_usage(struct r5conf *conf); +extern void r5c_check_cached_full_stripe(struct r5conf *conf); +extern struct md_sysfs_entry r5c_journal_mode; +extern void r5c_update_on_rdev_error(struct mddev *mddev); +extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); #endif |