diff options
author | Jiri Kosina <jkosina@suse.cz> | 2014-11-20 16:42:02 +0300 |
---|---|---|
committer | Jiri Kosina <jkosina@suse.cz> | 2014-11-20 16:42:02 +0300 |
commit | a02001086bbfb4da35d1228bebc2f1b442db455f (patch) | |
tree | 62ab47936cef06fd08657ca5b6cd1df98c19be57 /drivers/md | |
parent | eff264efeeb0898408e8c9df72d8a32621035bed (diff) | |
parent | fc14f9c1272f62c3e8d01300f52467c0d9af50f9 (diff) | |
download | linux-a02001086bbfb4da35d1228bebc2f1b442db455f.tar.xz |
Merge Linus' tree to be be to apply submitted patches to newer code than
current trivial.git base
Diffstat (limited to 'drivers/md')
45 files changed, 1167 insertions, 978 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 443d03fbac47..8eeab72b93e2 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -331,7 +331,7 @@ static int bch_allocator_thread(void *arg) mutex_unlock(&ca->set->bucket_lock); blkdev_issue_discard(ca->bdev, bucket_to_sector(ca->set, bucket), - ca->sb.block_size, GFP_KERNEL, 0); + ca->sb.bucket_size, GFP_KERNEL, 0); mutex_lock(&ca->set->bucket_lock); } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d2ebcf323094..04f7bc28ef83 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -477,9 +477,13 @@ struct gc_stat { * CACHE_SET_STOPPING always gets set first when we're closing down a cache set; * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e. * flushing dirty data). + * + * CACHE_SET_RUNNING means all cache devices have been registered and journal + * replay is complete. */ #define CACHE_SET_UNREGISTERING 0 #define CACHE_SET_STOPPING 1 +#define CACHE_SET_RUNNING 2 struct cache_set { struct closure cl; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 545416415305..646fe85261c1 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1182,7 +1182,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, { uint64_t start_time; bool used_mempool = false; - struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, + struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order); if (!out) { struct page *outp; diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 5f6728d5d4dd..ae964624efb2 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -453,7 +453,7 @@ static inline bool bch_bkey_equal_header(const struct bkey *l, { return (KEY_DIRTY(l) == KEY_DIRTY(r) && KEY_PTRS(l) == KEY_PTRS(r) && - KEY_CSUM(l) == KEY_CSUM(l)); + KEY_CSUM(l) == KEY_CSUM(r)); } /* Keylists */ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 7347b6100961..00cde40db572 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -117,9 +117,9 @@ ({ \ int _r, l = (b)->level - 1; \ bool _w = l <= (op)->lock; \ - struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\ + struct btree *_child = bch_btree_node_get((b)->c, op, key, l, \ + _w, b); \ if (!IS_ERR(_child)) { \ - _child->parent = (b); \ _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ rw_unlock(_w, _child); \ } else \ @@ -142,7 +142,6 @@ rw_lock(_w, _b, _b->level); \ if (_b == (c)->root && \ _w == insert_lock(op, _b)) { \ - _b->parent = NULL; \ _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ } \ rw_unlock(_w, _b); \ @@ -202,7 +201,7 @@ void bch_btree_node_read_done(struct btree *b) struct bset *i = btree_bset_first(b); struct btree_iter *iter; - iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); + iter = mempool_alloc(b->c->fill_iter, GFP_NOIO); iter->size = b->c->sb.bucket_size / b->c->sb.block_size; iter->used = 0; @@ -421,7 +420,7 @@ static void do_btree_node_write(struct btree *b) SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_sector_offset(&b->keys, i)); - if (!bio_alloc_pages(b->bio, GFP_NOIO)) { + if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { int j; struct bio_vec *bv; void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); @@ -967,7 +966,8 @@ err: * level and op->lock. */ struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, - struct bkey *k, int level, bool write) + struct bkey *k, int level, bool write, + struct btree *parent) { int i = 0; struct btree *b; @@ -1002,6 +1002,7 @@ retry: BUG_ON(b->level != level); } + b->parent = parent; b->accessed = 1; for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { @@ -1022,15 +1023,16 @@ retry: return b; } -static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) +static void btree_node_prefetch(struct btree *parent, struct bkey *k) { struct btree *b; - mutex_lock(&c->bucket_lock); - b = mca_alloc(c, NULL, k, level); - mutex_unlock(&c->bucket_lock); + mutex_lock(&parent->c->bucket_lock); + b = mca_alloc(parent->c, NULL, k, parent->level - 1); + mutex_unlock(&parent->c->bucket_lock); if (!IS_ERR_OR_NULL(b)) { + b->parent = parent; bch_btree_node_read(b); rw_unlock(true, b); } @@ -1060,15 +1062,16 @@ static void btree_node_free(struct btree *b) mutex_unlock(&b->c->bucket_lock); } -struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, - int level) +struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, + int level, bool wait, + struct btree *parent) { BKEY_PADDED(key) k; struct btree *b = ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); retry: - if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL)) + if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait)) goto err; bkey_put(c, &k.key); @@ -1085,6 +1088,7 @@ retry: } b->accessed = 1; + b->parent = parent; bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb)); mutex_unlock(&c->bucket_lock); @@ -1096,14 +1100,21 @@ err_free: err: mutex_unlock(&c->bucket_lock); - trace_bcache_btree_node_alloc_fail(b); + trace_bcache_btree_node_alloc_fail(c); return b; } +static struct btree *bch_btree_node_alloc(struct cache_set *c, + struct btree_op *op, int level, + struct btree *parent) +{ + return __bch_btree_node_alloc(c, op, level, op != NULL, parent); +} + static struct btree *btree_node_alloc_replacement(struct btree *b, struct btree_op *op) { - struct btree *n = bch_btree_node_alloc(b->c, op, b->level); + struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent); if (!IS_ERR_OR_NULL(n)) { mutex_lock(&n->write_lock); bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); @@ -1403,6 +1414,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, BUG_ON(btree_bset_first(new_nodes[0])->keys); btree_node_free(new_nodes[0]); rw_unlock(true, new_nodes[0]); + new_nodes[0] = NULL; for (i = 0; i < nodes; i++) { if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key))) @@ -1516,7 +1528,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); if (k) { r->b = bch_btree_node_get(b->c, op, k, b->level - 1, - true); + true, b); if (IS_ERR(r->b)) { ret = PTR_ERR(r->b); break; @@ -1811,7 +1823,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); if (k) - btree_node_prefetch(b->c, k, b->level - 1); + btree_node_prefetch(b, k); if (p) ret = btree(check_recurse, p, b, op); @@ -1976,12 +1988,12 @@ static int btree_split(struct btree *b, struct btree_op *op, trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); - n2 = bch_btree_node_alloc(b->c, op, b->level); + n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent); if (IS_ERR(n2)) goto err_free1; if (!b->parent) { - n3 = bch_btree_node_alloc(b->c, op, b->level + 1); + n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL); if (IS_ERR(n3)) goto err_free2; } diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 91dfa5e69685..5c391fa01bed 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -242,9 +242,10 @@ void __bch_btree_node_write(struct btree *, struct closure *); void bch_btree_node_write(struct btree *, struct closure *); void bch_btree_set_root(struct btree *); -struct btree *bch_btree_node_alloc(struct cache_set *, struct btree_op *, int); +struct btree *__bch_btree_node_alloc(struct cache_set *, struct btree_op *, + int, bool, struct btree *); struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *, - struct bkey *, int, bool); + struct bkey *, int, bool, struct btree *); int bch_btree_insert_check_key(struct btree *, struct btree_op *, struct bkey *); diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 3a0de4cf9771..243de0bf15cd 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -474,9 +474,8 @@ out: return false; } -static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) +bool __bch_extent_invalid(struct cache_set *c, const struct bkey *k) { - struct btree *b = container_of(bk, struct btree, keys); char buf[80]; if (!KEY_SIZE(k)) @@ -485,16 +484,22 @@ static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) if (KEY_SIZE(k) > KEY_OFFSET(k)) goto bad; - if (__ptr_invalid(b->c, k)) + if (__ptr_invalid(c, k)) goto bad; return false; bad: bch_extent_to_text(buf, sizeof(buf), k); - cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k)); + cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k)); return true; } +static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) +{ + struct btree *b = container_of(bk, struct btree, keys); + return __bch_extent_invalid(b->c, k); +} + static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k, unsigned ptr) { diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h index e4e23409782d..e2ed54054e7a 100644 --- a/drivers/md/bcache/extents.h +++ b/drivers/md/bcache/extents.h @@ -9,5 +9,6 @@ struct cache_set; void bch_extent_to_text(char *, size_t, const struct bkey *); bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); +bool __bch_extent_invalid(struct cache_set *, const struct bkey *); #endif /* _BCACHE_EXTENTS_H */ diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 59e82021b5bb..fe080ad0e558 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -7,6 +7,7 @@ #include "bcache.h" #include "btree.h" #include "debug.h" +#include "extents.h" #include <trace/events/bcache.h> @@ -189,11 +190,15 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) if (read_bucket(l)) goto bsearch; - if (list_empty(list)) + /* no journal entries on this device? */ + if (l == ca->sb.njournal_buckets) continue; bsearch: + BUG_ON(list_empty(list)); + /* Binary search */ - m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); + m = l; + r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); pr_debug("starting binary search, l %u r %u", l, r); while (l + 1 < r) { @@ -291,15 +296,16 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) for (k = i->j.start; k < bset_bkey_last(&i->j); - k = bkey_next(k)) { - unsigned j; + k = bkey_next(k)) + if (!__bch_extent_invalid(c, k)) { + unsigned j; - for (j = 0; j < KEY_PTRS(k); j++) - if (ptr_available(c, k, j)) - atomic_inc(&PTR_BUCKET(c, k, j)->pin); + for (j = 0; j < KEY_PTRS(k); j++) + if (ptr_available(c, k, j)) + atomic_inc(&PTR_BUCKET(c, k, j)->pin); - bch_initial_mark_key(c, 0, k); - } + bch_initial_mark_key(c, 0, k); + } } } diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 15fff4f68a7c..62e6e98186b5 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -311,7 +311,8 @@ void bch_data_insert(struct closure *cl) { struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); - trace_bcache_write(op->bio, op->writeback, op->bypass); + trace_bcache_write(op->c, op->inode, op->bio, + op->writeback, op->bypass); bch_keylist_init(&op->insert_keys); bio_get(op->bio); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 926ded8ccbf5..4dd2bb7167f0 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -733,8 +733,6 @@ static void bcache_device_detach(struct bcache_device *d) static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, unsigned id) { - BUG_ON(test_bit(CACHE_SET_STOPPING, &c->flags)); - d->id = id; d->c = c; c->devices[id] = d; @@ -844,6 +842,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, q->limits.logical_block_size = block_size; q->limits.physical_block_size = block_size; set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags); + clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags); set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags); blk_queue_flush(q, REQ_FLUSH|REQ_FUA); @@ -927,6 +926,7 @@ static void cached_dev_detach_finish(struct work_struct *w) list_move(&dc->list, &uncached_devices); clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags); + clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags); mutex_unlock(&bch_register_lock); @@ -1041,6 +1041,9 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) */ atomic_set(&dc->count, 1); + if (bch_cached_dev_writeback_start(dc)) + return -ENOMEM; + if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { bch_sectors_dirty_init(dc); atomic_set(&dc->has_dirty, 1); @@ -1070,7 +1073,8 @@ static void cached_dev_free(struct closure *cl) struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); cancel_delayed_work_sync(&dc->writeback_rate_update); - kthread_stop(dc->writeback_thread); + if (!IS_ERR_OR_NULL(dc->writeback_thread)) + kthread_stop(dc->writeback_thread); mutex_lock(&bch_register_lock); @@ -1081,12 +1085,8 @@ static void cached_dev_free(struct closure *cl) mutex_unlock(&bch_register_lock); - if (!IS_ERR_OR_NULL(dc->bdev)) { - if (dc->bdev->bd_disk) - blk_sync_queue(bdev_get_queue(dc->bdev)); - + if (!IS_ERR_OR_NULL(dc->bdev)) blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); - } wake_up(&unregister_wait); @@ -1213,7 +1213,9 @@ void bch_flash_dev_release(struct kobject *kobj) static void flash_dev_free(struct closure *cl) { struct bcache_device *d = container_of(cl, struct bcache_device, cl); + mutex_lock(&bch_register_lock); bcache_device_free(d); + mutex_unlock(&bch_register_lock); kobject_put(&d->kobj); } @@ -1221,7 +1223,9 @@ static void flash_dev_flush(struct closure *cl) { struct bcache_device *d = container_of(cl, struct bcache_device, cl); + mutex_lock(&bch_register_lock); bcache_device_unlink(d); + mutex_unlock(&bch_register_lock); kobject_del(&d->kobj); continue_at(cl, flash_dev_free, system_wq); } @@ -1277,6 +1281,9 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) if (test_bit(CACHE_SET_STOPPING, &c->flags)) return -EINTR; + if (!test_bit(CACHE_SET_RUNNING, &c->flags)) + return -EPERM; + u = uuid_find_empty(c); if (!u) { pr_err("Can't create volume, no room for UUID"); @@ -1346,8 +1353,11 @@ static void cache_set_free(struct closure *cl) bch_journal_free(c); for_each_cache(ca, c, i) - if (ca) + if (ca) { + ca->set = NULL; + c->cache[ca->sb.nr_this_dev] = NULL; kobject_put(&ca->kobj); + } bch_bset_sort_state_free(&c->sort); free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); @@ -1405,9 +1415,11 @@ static void cache_set_flush(struct closure *cl) if (ca->alloc_thread) kthread_stop(ca->alloc_thread); - cancel_delayed_work_sync(&c->journal.work); - /* flush last journal entry if needed */ - c->journal.work.work.func(&c->journal.work.work); + if (c->journal.cur) { + cancel_delayed_work_sync(&c->journal.work); + /* flush last journal entry if needed */ + c->journal.work.work.func(&c->journal.work.work); + } closure_return(cl); } @@ -1586,7 +1598,7 @@ static void run_cache_set(struct cache_set *c) goto err; err = "error reading btree root"; - c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true); + c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL); if (IS_ERR_OR_NULL(c->root)) goto err; @@ -1661,7 +1673,7 @@ static void run_cache_set(struct cache_set *c) goto err; err = "cannot allocate new btree root"; - c->root = bch_btree_node_alloc(c, NULL, 0); + c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL); if (IS_ERR_OR_NULL(c->root)) goto err; @@ -1697,6 +1709,7 @@ static void run_cache_set(struct cache_set *c) flash_devs_run(c); + set_bit(CACHE_SET_RUNNING, &c->flags); return; err: closure_sync(&cl); @@ -1760,6 +1773,7 @@ found: pr_debug("set version = %llu", c->sb.version); } + kobject_get(&ca->kobj); ca->set = c; ca->set->cache[ca->sb.nr_this_dev] = ca; c->cache_by_alloc[c->caches_loaded++] = ca; @@ -1780,8 +1794,10 @@ void bch_cache_release(struct kobject *kobj) struct cache *ca = container_of(kobj, struct cache, kobj); unsigned i; - if (ca->set) + if (ca->set) { + BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca); ca->set->cache[ca->sb.nr_this_dev] = NULL; + } bio_split_pool_free(&ca->bio_split_hook); @@ -1798,10 +1814,8 @@ void bch_cache_release(struct kobject *kobj) if (ca->sb_bio.bi_inline_vecs[0].bv_page) put_page(ca->sb_bio.bi_io_vec[0].bv_page); - if (!IS_ERR_OR_NULL(ca->bdev)) { - blk_sync_queue(bdev_get_queue(ca->bdev)); + if (!IS_ERR_OR_NULL(ca->bdev)) blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); - } kfree(ca); module_put(THIS_MODULE); @@ -1844,7 +1858,7 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) } static void register_cache(struct cache_sb *sb, struct page *sb_page, - struct block_device *bdev, struct cache *ca) + struct block_device *bdev, struct cache *ca) { char name[BDEVNAME_SIZE]; const char *err = "cannot allocate memory"; @@ -1877,10 +1891,12 @@ static void register_cache(struct cache_sb *sb, struct page *sb_page, goto err; pr_info("registered cache device %s", bdevname(bdev, name)); +out: + kobject_put(&ca->kobj); return; err: pr_notice("error opening %s: %s", bdevname(bdev, name), err); - kobject_put(&ca->kobj); + goto out; } /* Global interfaces/init */ @@ -1945,10 +1961,12 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (IS_ERR(bdev)) { if (bdev == ERR_PTR(-EBUSY)) { bdev = lookup_bdev(strim(path)); + mutex_lock(&bch_register_lock); if (!IS_ERR(bdev) && bch_is_open(bdev)) err = "device already registered"; else err = "device busy"; + mutex_unlock(&bch_register_lock); } goto err; } diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index ac7d0d1f70d7..98df7572b5f7 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -416,8 +416,8 @@ do { \ average_frequency, frequency_units); \ __print_time_stat(stats, name, \ average_duration, duration_units); \ - __print_time_stat(stats, name, \ - max_duration, duration_units); \ + sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \ + div_u64((stats)->max_duration, NSEC_PER_ ## duration_units));\ \ sysfs_print(name ## _last_ ## frequency_units, (stats)->last \ ? div_s64(local_clock() - (stats)->last, \ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index f4300e4c0114..f1986bcd1bf0 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -239,7 +239,7 @@ static void read_dirty(struct cached_dev *dc) if (KEY_START(&w->key) != dc->last_read || jiffies_to_msecs(delay) > 50) while (!kthread_should_stop() && delay) - delay = schedule_timeout_uninterruptible(delay); + delay = schedule_timeout_interruptible(delay); dc->last_read = KEY_OFFSET(&w->key); @@ -436,7 +436,7 @@ static int bch_writeback_thread(void *arg) while (delay && !kthread_should_stop() && !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) - delay = schedule_timeout_uninterruptible(delay); + delay = schedule_timeout_interruptible(delay); } } @@ -478,7 +478,7 @@ void bch_sectors_dirty_init(struct cached_dev *dc) dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk); } -int bch_cached_dev_writeback_init(struct cached_dev *dc) +void bch_cached_dev_writeback_init(struct cached_dev *dc) { sema_init(&dc->in_flight, 64); init_rwsem(&dc->writeback_lock); @@ -494,14 +494,20 @@ int bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_rate_d_term = 30; dc->writeback_rate_p_term_inverse = 6000; + INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); +} + +int bch_cached_dev_writeback_start(struct cached_dev *dc) +{ dc->writeback_thread = kthread_create(bch_writeback_thread, dc, "bcache_writeback"); if (IS_ERR(dc->writeback_thread)) return PTR_ERR(dc->writeback_thread); - INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); schedule_delayed_work(&dc->writeback_rate_update, dc->writeback_rate_update_seconds * HZ); + bch_writeback_queue(dc); + return 0; } diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index e2f8598937ac..0a9dab187b79 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -85,6 +85,7 @@ static inline void bch_writeback_add(struct cached_dev *dc) void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); void bch_sectors_dirty_init(struct cached_dev *dc); -int bch_cached_dev_writeback_init(struct cached_dev *); +void bch_cached_dev_writeback_init(struct cached_dev *); +int bch_cached_dev_writeback_start(struct cached_dev *); #endif diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 67f8b31e2054..da3604e73e8a 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -879,7 +879,6 @@ void bitmap_unplug(struct bitmap *bitmap) { unsigned long i; int dirty, need_write; - int wait = 0; if (!bitmap || !bitmap->storage.filemap || test_bit(BITMAP_STALE, &bitmap->flags)) @@ -897,16 +896,13 @@ void bitmap_unplug(struct bitmap *bitmap) clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); write_page(bitmap, bitmap->storage.filemap[i], 0); } - if (dirty) - wait = 1; - } - if (wait) { /* if any writes were performed, we need to wait on them */ - if (bitmap->storage.file) - wait_event(bitmap->write_wait, - atomic_read(&bitmap->pending_writes)==0); - else - md_super_wait(bitmap->mddev); } + if (bitmap->storage.file) + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes)==0); + else + md_super_wait(bitmap->mddev); + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) bitmap_file_kick(bitmap); } diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index ab472c557d18..afe79719ea32 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -465,6 +465,7 @@ static void __relink_lru(struct dm_buffer *b, int dirty) c->n_buffers[dirty]++; b->list_mode = dirty; list_move(&b->lru_list, &c->lru[dirty]); + b->last_accessed = jiffies; } /*---------------------------------------------------------------- @@ -720,7 +721,6 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c) io_schedule(); - set_task_state(current, TASK_RUNNING); remove_wait_queue(&c->free_buffer_wait, &wait); dm_bufio_lock(c); @@ -1434,9 +1434,9 @@ static void drop_buffers(struct dm_bufio_client *c) /* * Test if the buffer is unused and too old, and commit it. - * At if noio is set, we must not do any I/O because we hold - * dm_bufio_clients_lock and we would risk deadlock if the I/O gets rerouted to - * different bufio client. + * And if GFP_NOFS is used, we must not do any I/O because we hold + * dm_bufio_clients_lock and we would risk deadlock if the I/O gets + * rerouted to different bufio client. */ static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp, unsigned long max_jiffies) @@ -1444,7 +1444,7 @@ static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp, if (jiffies - b->last_accessed < max_jiffies) return 0; - if (!(gfp & __GFP_IO)) { + if (!(gfp & __GFP_FS)) { if (test_bit(B_READING, &b->state) || test_bit(B_WRITING, &b->state) || test_bit(B_DIRTY, &b->state)) @@ -1472,9 +1472,9 @@ static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) { freed += __cleanup_old_buffer(b, gfp_mask, 0); if (!--nr_to_scan) - break; + return freed; + dm_bufio_cond_resched(); } - dm_bufio_cond_resched(); } return freed; } @@ -1486,7 +1486,7 @@ dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) unsigned long freed; c = container_of(shrink, struct dm_bufio_client, shrinker); - if (sc->gfp_mask & __GFP_IO) + if (sc->gfp_mask & __GFP_FS) dm_bufio_lock(c); else if (!dm_bufio_trylock(c)) return SHRINK_STOP; @@ -1503,7 +1503,7 @@ dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) unsigned long count; c = container_of(shrink, struct dm_bufio_client, shrinker); - if (sc->gfp_mask & __GFP_IO) + if (sc->gfp_mask & __GFP_FS) dm_bufio_lock(c); else if (!dm_bufio_trylock(c)) return 0; diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index d2899e7eb3aa..06709257adde 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -330,7 +330,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) disk_super->discard_root = cpu_to_le64(cmd->discard_root); disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks)); - disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); + disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE); disk_super->data_block_size = cpu_to_le32(cmd->data_block_size); disk_super->cache_blocks = cpu_to_le32(0); @@ -478,7 +478,7 @@ static int __create_persistent_data_objects(struct dm_cache_metadata *cmd, bool may_format_device) { int r; - cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE, + cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, CACHE_METADATA_CACHE_SIZE, CACHE_MAX_CONCURRENT_LOCKS); if (IS_ERR(cmd->bm)) { diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index cd70a78623a3..7383c90ccdb8 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -9,19 +9,17 @@ #include "dm-cache-block-types.h" #include "dm-cache-policy-internal.h" +#include "persistent-data/dm-space-map-metadata.h" /*----------------------------------------------------------------*/ -#define DM_CACHE_METADATA_BLOCK_SIZE 4096 +#define DM_CACHE_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE /* FIXME: remove this restriction */ /* * The metadata device is currently limited in size. - * - * We have one block of index, which can hold 255 index entries. Each - * index entry contains allocation info about 16k metadata blocks. */ -#define DM_CACHE_METADATA_MAX_SECTORS (255 * (1 << 14) * (DM_CACHE_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) +#define DM_CACHE_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS /* * A metadata device larger than 16GB triggers a warning. diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 2c63326638b6..7130505c2425 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -718,6 +718,22 @@ static int bio_triggers_commit(struct cache *cache, struct bio *bio) return bio->bi_rw & (REQ_FLUSH | REQ_FUA); } +/* + * You must increment the deferred set whilst the prison cell is held. To + * encourage this, we ask for 'cell' to be passed in. + */ +static void inc_ds(struct cache *cache, struct bio *bio, + struct dm_bio_prison_cell *cell) +{ + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + + BUG_ON(!cell); + BUG_ON(pb->all_io_entry); + + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); +} + static void issue(struct cache *cache, struct bio *bio) { unsigned long flags; @@ -737,6 +753,12 @@ static void issue(struct cache *cache, struct bio *bio) spin_unlock_irqrestore(&cache->lock, flags); } +static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell) +{ + inc_ds(cache, bio, cell); + issue(cache, bio); +} + static void defer_writethrough_bio(struct cache *cache, struct bio *bio) { unsigned long flags; @@ -873,8 +895,8 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg) struct cache *cache = mg->cache; if (mg->writeback) { - cell_defer(cache, mg->old_ocell, false); clear_dirty(cache, mg->old_oblock, mg->cblock); + cell_defer(cache, mg->old_ocell, false); cleanup_migration(mg); return; @@ -929,13 +951,13 @@ static void migration_success_post_commit(struct dm_cache_migration *mg) } } else { + clear_dirty(cache, mg->new_oblock, mg->cblock); if (mg->requeue_holder) cell_defer(cache, mg->new_ocell, true); else { bio_endio(mg->new_ocell->holder, 0); cell_defer(cache, mg->new_ocell, false); } - clear_dirty(cache, mg->new_oblock, mg->cblock); cleanup_migration(mg); } } @@ -1015,6 +1037,11 @@ static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock); + + /* + * No need to inc_ds() here, since the cell will be held for the + * duration of the io. + */ generic_make_request(bio); } @@ -1115,8 +1142,7 @@ static void check_for_quiesced_migrations(struct cache *cache, return; INIT_LIST_HEAD(&work); - if (pb->all_io_entry) - dm_deferred_entry_dec(pb->all_io_entry, &work); + dm_deferred_entry_dec(pb->all_io_entry, &work); if (!list_empty(&work)) queue_quiesced_migrations(cache, &work); @@ -1252,6 +1278,11 @@ static void process_flush_bio(struct cache *cache, struct bio *bio) else remap_to_cache(cache, bio, 0); + /* + * REQ_FLUSH is not directed at any particular block so we don't + * need to inc_ds(). REQ_FUA's are split into a write + REQ_FLUSH + * by dm-core. + */ issue(cache, bio); } @@ -1301,15 +1332,6 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio) &cache->stats.read_miss : &cache->stats.write_miss); } -static void issue_cache_bio(struct cache *cache, struct bio *bio, - struct per_bio_data *pb, - dm_oblock_t oblock, dm_cblock_t cblock) -{ - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - remap_to_cache_dirty(cache, bio, oblock, cblock); - issue(cache, bio); -} - static void process_bio(struct cache *cache, struct prealloc *structs, struct bio *bio) { @@ -1318,8 +1340,6 @@ static void process_bio(struct cache *cache, struct prealloc *structs, dm_oblock_t block = get_bio_block(cache, bio); struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell; struct policy_result lookup_result; - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); bool discarded_block = is_discarded_oblock(cache, block); bool passthrough = passthrough_mode(&cache->features); bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache)); @@ -1359,9 +1379,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs, } else { /* FIXME: factor out issue_origin() */ - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); remap_to_origin_clear_discard(cache, bio, block); - issue(cache, bio); + inc_and_issue(cache, bio, new_ocell); } } else { inc_hit_counter(cache, bio); @@ -1369,20 +1388,21 @@ static void process_bio(struct cache *cache, struct prealloc *structs, if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && !is_dirty(cache, lookup_result.cblock)) { - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); - issue(cache, bio); - } else - issue_cache_bio(cache, bio, pb, block, lookup_result.cblock); + inc_and_issue(cache, bio, new_ocell); + + } else { + remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); + inc_and_issue(cache, bio, new_ocell); + } } break; case POLICY_MISS: inc_miss_counter(cache, bio); - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); remap_to_origin_clear_discard(cache, bio, block); - issue(cache, bio); + inc_and_issue(cache, bio, new_ocell); break; case POLICY_NEW: @@ -1501,6 +1521,9 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) bio_list_init(&cache->deferred_flush_bios); spin_unlock_irqrestore(&cache->lock, flags); + /* + * These bios have already been through inc_ds() + */ while ((bio = bio_list_pop(&bios))) submit_bios ? generic_make_request(bio) : bio_io_error(bio); } @@ -1518,6 +1541,9 @@ static void process_deferred_writethrough_bios(struct cache *cache) bio_list_init(&cache->deferred_writethrough_bios); spin_unlock_irqrestore(&cache->lock, flags); + /* + * These bios have already been through inc_ds() + */ while ((bio = bio_list_pop(&bios))) generic_make_request(bio); } @@ -1694,6 +1720,7 @@ static void do_worker(struct work_struct *ws) if (commit_if_needed(cache)) { process_deferred_flush_bios(cache, false); + process_migrations(cache, &cache->need_commit_migrations, migration_failure); /* * FIXME: rollback metadata or just go into a @@ -2406,16 +2433,13 @@ out: return r; } -static int cache_map(struct dm_target *ti, struct bio *bio) +static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell) { - struct cache *cache = ti->private; - int r; dm_oblock_t block = get_bio_block(cache, bio); size_t pb_data_size = get_per_bio_data_size(cache); bool can_migrate = false; bool discarded_block; - struct dm_bio_prison_cell *cell; struct policy_result lookup_result; struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size); @@ -2437,15 +2461,15 @@ static int cache_map(struct dm_target *ti, struct bio *bio) /* * Check to see if that block is currently migrating. */ - cell = alloc_prison_cell(cache); - if (!cell) { + *cell = alloc_prison_cell(cache); + if (!*cell) { defer_bio(cache, bio); return DM_MAPIO_SUBMITTED; } - r = bio_detain(cache, block, bio, cell, + r = bio_detain(cache, block, bio, *cell, (cell_free_fn) free_prison_cell, - cache, &cell); + cache, cell); if (r) { if (r < 0) defer_bio(cache, bio); @@ -2458,11 +2482,12 @@ static int cache_map(struct dm_target *ti, struct bio *bio) r = policy_map(cache->policy, block, false, can_migrate, discarded_block, bio, &lookup_result); if (r == -EWOULDBLOCK) { - cell_defer(cache, cell, true); + cell_defer(cache, *cell, true); return DM_MAPIO_SUBMITTED; } else if (r) { DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r); + cell_defer(cache, *cell, false); bio_io_error(bio); return DM_MAPIO_SUBMITTED; } @@ -2476,52 +2501,44 @@ static int cache_map(struct dm_target *ti, struct bio *bio) * We need to invalidate this block, so * defer for the worker thread. */ - cell_defer(cache, cell, true); + cell_defer(cache, *cell, true); r = DM_MAPIO_SUBMITTED; } else { - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); inc_miss_counter(cache, bio); remap_to_origin_clear_discard(cache, bio, block); - - cell_defer(cache, cell, false); } } else { inc_hit_counter(cache, bio); - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && !is_dirty(cache, lookup_result.cblock)) remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); else remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); - - cell_defer(cache, cell, false); } break; case POLICY_MISS: inc_miss_counter(cache, bio); - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - if (pb->req_nr != 0) { /* * This is a duplicate writethrough io that is no * longer needed because the block has been demoted. */ bio_endio(bio, 0); - cell_defer(cache, cell, false); - return DM_MAPIO_SUBMITTED; - } else { + cell_defer(cache, *cell, false); + r = DM_MAPIO_SUBMITTED; + + } else remap_to_origin_clear_discard(cache, bio, block); - cell_defer(cache, cell, false); - } + break; default: DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__, (unsigned) lookup_result.op); + cell_defer(cache, *cell, false); bio_io_error(bio); r = DM_MAPIO_SUBMITTED; } @@ -2529,6 +2546,21 @@ static int cache_map(struct dm_target *ti, struct bio *bio) return r; } +static int cache_map(struct dm_target *ti, struct bio *bio) +{ + int r; + struct dm_bio_prison_cell *cell; + struct cache *cache = ti->private; + + r = __cache_map(cache, bio, &cell); + if (r == DM_MAPIO_REMAPPED) { + inc_ds(cache, bio, cell); + cell_defer(cache, cell, false); + } + + return r; +} + static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) { struct cache *cache = ti->private; @@ -2808,7 +2840,7 @@ static void cache_status(struct dm_target *ti, status_type_t type, residency = policy_residency(cache->policy); DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ", - (unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT), + (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), (unsigned long long)nr_blocks_metadata, cache->sectors_per_block, @@ -3062,7 +3094,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) */ if (io_opt_sectors < cache->sectors_per_block || do_div(io_opt_sectors, cache->sectors_per_block)) { - blk_limits_io_min(limits, 0); + blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT); blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT); } set_discard_limits(cache, limits); @@ -3072,7 +3104,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 4cba2d808afb..fc93b9330af4 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -59,7 +59,7 @@ struct dm_crypt_io { int error; sector_t sector; struct dm_crypt_io *base_io; -}; +} CRYPTO_MINALIGN_ATTR; struct dm_crypt_request { struct convert_context *ctx; @@ -162,6 +162,8 @@ struct crypt_config { */ unsigned int dmreq_start; + unsigned int per_bio_data_size; + unsigned long flags; unsigned int key_size; unsigned int key_parts; /* independent parts in key buffer */ @@ -524,29 +526,26 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, u8 *data) { struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; - struct { - struct shash_desc desc; - char ctx[crypto_shash_descsize(lmk->hash_tfm)]; - } sdesc; + SHASH_DESC_ON_STACK(desc, lmk->hash_tfm); struct md5_state md5state; __le32 buf[4]; int i, r; - sdesc.desc.tfm = lmk->hash_tfm; - sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + desc->tfm = lmk->hash_tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - r = crypto_shash_init(&sdesc.desc); + r = crypto_shash_init(desc); if (r) return r; if (lmk->seed) { - r = crypto_shash_update(&sdesc.desc, lmk->seed, LMK_SEED_SIZE); + r = crypto_shash_update(desc, lmk->seed, LMK_SEED_SIZE); if (r) return r; } /* Sector is always 512B, block size 16, add data of blocks 1-31 */ - r = crypto_shash_update(&sdesc.desc, data + 16, 16 * 31); + r = crypto_shash_update(desc, data + 16, 16 * 31); if (r) return r; @@ -555,12 +554,12 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000); buf[2] = cpu_to_le32(4024); buf[3] = 0; - r = crypto_shash_update(&sdesc.desc, (u8 *)buf, sizeof(buf)); + r = crypto_shash_update(desc, (u8 *)buf, sizeof(buf)); if (r) return r; /* No MD5 padding here */ - r = crypto_shash_export(&sdesc.desc, &md5state); + r = crypto_shash_export(desc, &md5state); if (r) return r; @@ -677,10 +676,7 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc, struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; u64 sector = cpu_to_le64((u64)dmreq->iv_sector); u8 buf[TCW_WHITENING_SIZE]; - struct { - struct shash_desc desc; - char ctx[crypto_shash_descsize(tcw->crc32_tfm)]; - } sdesc; + SHASH_DESC_ON_STACK(desc, tcw->crc32_tfm); int i, r; /* xor whitening with sector number */ @@ -689,16 +685,16 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc, crypto_xor(&buf[8], (u8 *)§or, 8); /* calculate crc32 for every 32bit part and xor it */ - sdesc.desc.tfm = tcw->crc32_tfm; - sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + desc->tfm = tcw->crc32_tfm; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; for (i = 0; i < 4; i++) { - r = crypto_shash_init(&sdesc.desc); + r = crypto_shash_init(desc); if (r) goto out; - r = crypto_shash_update(&sdesc.desc, &buf[i * 4], 4); + r = crypto_shash_update(desc, &buf[i * 4], 4); if (r) goto out; - r = crypto_shash_final(&sdesc.desc, &buf[i * 4]); + r = crypto_shash_final(desc, &buf[i * 4]); if (r) goto out; } @@ -895,6 +891,15 @@ static void crypt_alloc_req(struct crypt_config *cc, kcryptd_async_done, dmreq_of_req(cc, ctx->req)); } +static void crypt_free_req(struct crypt_config *cc, + struct ablkcipher_request *req, struct bio *base_bio) +{ + struct dm_crypt_io *io = dm_per_bio_data(base_bio, cc->per_bio_data_size); + + if ((struct ablkcipher_request *)(io + 1) != req) + mempool_free(req, cc->req_pool); +} + /* * Encrypt / decrypt data from one bio to another one (can be the same one) */ @@ -1008,12 +1013,9 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) } } -static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc, - struct bio *bio, sector_t sector) +static void crypt_io_init(struct dm_crypt_io *io, struct crypt_config *cc, + struct bio *bio, sector_t sector) { - struct dm_crypt_io *io; - - io = mempool_alloc(cc->io_pool, GFP_NOIO); io->cc = cc; io->base_bio = bio; io->sector = sector; @@ -1021,8 +1023,6 @@ static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc, io->base_io = NULL; io->ctx.req = NULL; atomic_set(&io->io_pending, 0); - - return io; } static void crypt_inc_pending(struct dm_crypt_io *io) @@ -1046,8 +1046,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io) return; if (io->ctx.req) - mempool_free(io->ctx.req, cc->req_pool); - mempool_free(io, cc->io_pool); + crypt_free_req(cc, io->ctx.req, base_bio); + if (io != dm_per_bio_data(base_bio, cc->per_bio_data_size)) + mempool_free(io, cc->io_pool); if (likely(!base_io)) bio_endio(base_bio, error); @@ -1255,8 +1256,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) * between fragments, so switch to a new dm_crypt_io structure. */ if (unlikely(!crypt_finished && remaining)) { - new_io = crypt_io_alloc(io->cc, io->base_bio, - sector); + new_io = mempool_alloc(cc->io_pool, GFP_NOIO); + crypt_io_init(new_io, io->cc, io->base_bio, sector); crypt_inc_pending(new_io); crypt_convert_init(cc, &new_io->ctx, NULL, io->base_bio, sector); @@ -1325,7 +1326,7 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, if (error < 0) io->error = -EIO; - mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); + crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio); if (!atomic_dec_and_test(&ctx->cc_pending)) return; @@ -1681,6 +1682,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) unsigned int key_size, opt_params; unsigned long long tmpll; int ret; + size_t iv_size_padding; struct dm_arg_set as; const char *opt_string; char dummy; @@ -1717,17 +1719,33 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) cc->dmreq_start = sizeof(struct ablkcipher_request); cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc)); - cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment()); - cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) & - ~(crypto_tfm_ctx_alignment() - 1); + cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request)); + + if (crypto_ablkcipher_alignmask(any_tfm(cc)) < CRYPTO_MINALIGN) { + /* Allocate the padding exactly */ + iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request)) + & crypto_ablkcipher_alignmask(any_tfm(cc)); + } else { + /* + * If the cipher requires greater alignment than kmalloc + * alignment, we don't know the exact position of the + * initialization vector. We must assume worst case. + */ + iv_size_padding = crypto_ablkcipher_alignmask(any_tfm(cc)); + } cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start + - sizeof(struct dm_crypt_request) + cc->iv_size); + sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size); if (!cc->req_pool) { ti->error = "Cannot allocate crypt request mempool"; goto bad; } + cc->per_bio_data_size = ti->per_bio_data_size = + ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + + sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size, + ARCH_KMALLOC_MINALIGN); + cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0); if (!cc->page_pool) { ti->error = "Cannot allocate page mempool"; @@ -1824,7 +1842,9 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } - io = crypt_io_alloc(cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); + io = dm_per_bio_data(bio, cc->per_bio_data_size); + crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector)); + io->ctx.req = (struct ablkcipher_request *)(io + 1); if (bio_data_dir(io->base_bio) == READ) { if (kcryptd_io_read(io, GFP_NOWAIT)) diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index db404a0f7e2c..c09359db3a90 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -33,7 +33,6 @@ struct dm_io_client { struct io { unsigned long error_bits; atomic_t count; - struct completion *wait; struct dm_io_client *client; io_notify_fn callback; void *context; @@ -112,28 +111,27 @@ static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io, * We need an io object to keep track of the number of bios that * have been dispatched for a particular io. *---------------------------------------------------------------*/ -static void dec_count(struct io *io, unsigned int region, int error) +static void complete_io(struct io *io) { - if (error) - set_bit(region, &io->error_bits); + unsigned long error_bits = io->error_bits; + io_notify_fn fn = io->callback; + void *context = io->context; - if (atomic_dec_and_test(&io->count)) { - if (io->vma_invalidate_size) - invalidate_kernel_vmap_range(io->vma_invalidate_address, - io->vma_invalidate_size); + if (io->vma_invalidate_size) + invalidate_kernel_vmap_range(io->vma_invalidate_address, + io->vma_invalidate_size); - if (io->wait) - complete(io->wait); + mempool_free(io, io->client->pool); + fn(error_bits, context); +} - else { - unsigned long r = io->error_bits; - io_notify_fn fn = io->callback; - void *context = io->context; +static void dec_count(struct io *io, unsigned int region, int error) +{ + if (error) + set_bit(region, &io->error_bits); - mempool_free(io, io->client->pool); - fn(r, context); - } - } + if (atomic_dec_and_test(&io->count)) + complete_io(io); } static void endio(struct bio *bio, int error) @@ -376,41 +374,51 @@ static void dispatch_io(int rw, unsigned int num_regions, dec_count(io, 0, 0); } +struct sync_io { + unsigned long error_bits; + struct completion wait; +}; + +static void sync_io_complete(unsigned long error, void *context) +{ + struct sync_io *sio = context; + + sio->error_bits = error; + complete(&sio->wait); +} + static int sync_io(struct dm_io_client *client, unsigned int num_regions, struct dm_io_region *where, int rw, struct dpages *dp, unsigned long *error_bits) { - /* - * gcc <= 4.3 can't do the alignment for stack variables, so we must - * align it on our own. - * volatile prevents the optimizer from removing or reusing - * "io_" field from the stack frame (allowed in ANSI C). - */ - volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1]; - struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io)); - DECLARE_COMPLETION_ONSTACK(wait); + struct io *io; + struct sync_io sio; if (num_regions > 1 && (rw & RW_MASK) != WRITE) { WARN_ON(1); return -EIO; } + init_completion(&sio.wait); + + io = mempool_alloc(client->pool, GFP_NOIO); io->error_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ - io->wait = &wait; io->client = client; + io->callback = sync_io_complete; + io->context = &sio; io->vma_invalidate_address = dp->vma_invalidate_address; io->vma_invalidate_size = dp->vma_invalidate_size; dispatch_io(rw, num_regions, where, dp, io, 1); - wait_for_completion_io(&wait); + wait_for_completion_io(&sio.wait); if (error_bits) - *error_bits = io->error_bits; + *error_bits = sio.error_bits; - return io->error_bits ? -EIO : 0; + return sio.error_bits ? -EIO : 0; } static int async_io(struct dm_io_client *client, unsigned int num_regions, @@ -428,7 +436,6 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, io = mempool_alloc(client->pool, GFP_NOIO); io->error_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ - io->wait = NULL; io->client = client; io->callback = fn; io->context = context; @@ -481,9 +488,9 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp, * New collapsed (a)synchronous interface. * * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug - * the queue with blk_unplug() some time later or set REQ_SYNC in -io_req->bi_rw. If you fail to do one of these, the IO will be submitted to - * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c. + * the queue with blk_unplug() some time later or set REQ_SYNC in io_req->bi_rw. + * If you fail to do one of these, the IO will be submitted to the disk after + * q->unplug_delay, which defaults to 3ms in blk-settings.c. */ int dm_io(struct dm_io_request *io_req, unsigned num_regions, struct dm_io_region *where, unsigned long *sync_error_bits) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 51521429fb59..0be9381365d7 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1418,7 +1418,7 @@ static void retrieve_deps(struct dm_table *table, deps->count = count; count = 0; list_for_each_entry (dd, dm_table_get_devices(table), list) - deps->dev[count++] = huge_encode_dev(dd->dm_dev.bdev->bd_dev); + deps->dev[count++] = huge_encode_dev(dd->dm_dev->bdev->bd_dev); param->data_size = param->data_start + needed; } diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c index b428c0ae63d5..39ad9664d397 100644 --- a/drivers/md/dm-log-userspace-transfer.c +++ b/drivers/md/dm-log-userspace-transfer.c @@ -272,7 +272,7 @@ int dm_ulog_tfr_init(void) r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback); if (r) { - cn_del_callback(&ulog_cn_id); + kfree(prealloced_cn_msg); return r; } diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index f4167b013d99..7b6b0f0f831a 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -317,8 +317,10 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes) struct priority_group *pg; unsigned bypassed = 1; - if (!m->nr_valid_paths) + if (!m->nr_valid_paths) { + m->queue_io = 0; goto failed; + } /* Were we instructed to switch PG? */ if (m->next_pg) { @@ -373,8 +375,6 @@ static int __must_push_back(struct multipath *m) dm_noflush_suspending(m->ti))); } -#define pg_ready(m) (!(m)->queue_io && !(m)->pg_init_required) - /* * Map cloned requests */ @@ -402,11 +402,11 @@ static int multipath_map(struct dm_target *ti, struct request *clone, if (!__must_push_back(m)) r = -EIO; /* Failed */ goto out_unlock; - } - if (!pg_ready(m)) { + } else if (m->queue_io || m->pg_init_required) { __pg_init_all_paths(m); goto out_unlock; } + if (set_mapinfo(m, map_context) < 0) /* ENOMEM, requeue */ goto out_unlock; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 4880b69e2e9e..07c0fa0fa284 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2010-2011 Neil Brown - * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved. + * Copyright (C) 2010-2014 Red Hat, Inc. All rights reserved. * * This file is released under the GPL. */ @@ -18,6 +18,8 @@ #define DM_MSG_PREFIX "raid" +static bool devices_handle_discard_safely = false; + /* * The following flags are used by dm-raid.c to set up the array state. * They must be cleared before md_run is called. @@ -475,6 +477,8 @@ too_many: * will form the "stripe" * [[no]sync] Force or prevent recovery of the * entire array + * [devices_handle_discard_safely] Allow discards on RAID4/5/6; useful if RAID + * member device(s) properly support TRIM/UNMAP * [rebuild <idx>] Rebuild the drive indicated by the index * [daemon_sleep <ms>] Time between bitmap daemon work to * clear bits @@ -785,8 +789,7 @@ struct dm_raid_superblock { __le32 layout; __le32 stripe_sectors; - __u8 pad[452]; /* Round struct to 512 bytes. */ - /* Always set to 0 when writing. */ + /* Remainder of a logical block is zero-filled when writing (see super_sync()). */ } __packed; static int read_disk_sb(struct md_rdev *rdev, int size) @@ -823,7 +826,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev) test_bit(Faulty, &(rs->dev[i].rdev.flags))) failed_devices |= (1ULL << i); - memset(sb, 0, sizeof(*sb)); + memset(sb + 1, 0, rdev->sb_size - sizeof(*sb)); sb->magic = cpu_to_le32(DM_RAID_MAGIC); sb->features = cpu_to_le32(0); /* No features yet */ @@ -858,7 +861,11 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) uint64_t events_sb, events_refsb; rdev->sb_start = 0; - rdev->sb_size = sizeof(*sb); + rdev->sb_size = bdev_logical_block_size(rdev->meta_bdev); + if (rdev->sb_size < sizeof(*sb) || rdev->sb_size > PAGE_SIZE) { + DMERR("superblock size of a logical block is no longer valid"); + return -EINVAL; + } ret = read_disk_sb(rdev, rdev->sb_size); if (ret) @@ -1150,6 +1157,53 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) } /* + * Enable/disable discard support on RAID set depending on + * RAID level and discard properties of underlying RAID members. + */ +static void configure_discard_support(struct dm_target *ti, struct raid_set *rs) +{ + int i; + bool raid456; + + /* Assume discards not supported until after checks below. */ + ti->discards_supported = false; + + /* RAID level 4,5,6 require discard_zeroes_data for data integrity! */ + raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6); + + for (i = 0; i < rs->md.raid_disks; i++) { + struct request_queue *q; + + if (!rs->dev[i].rdev.bdev) + continue; + + q = bdev_get_queue(rs->dev[i].rdev.bdev); + if (!q || !blk_queue_discard(q)) + return; + + if (raid456) { + if (!q->limits.discard_zeroes_data) + return; + if (!devices_handle_discard_safely) { + DMERR("raid456 discard support disabled due to discard_zeroes_data uncertainty."); + DMERR("Set dm-raid.devices_handle_discard_safely=Y to override."); + return; + } + } + } + + /* All RAID members properly support discards */ + ti->discards_supported = true; + + /* + * RAID1 and RAID10 personalities require bio splitting, + * RAID0/4/5/6 don't and process large discard bios properly. + */ + ti->split_discard_bios = !!(rs->md.level == 1 || rs->md.level == 10); + ti->num_discard_bios = 1; +} + +/* * Construct a RAID4/5/6 mapping: * Args: * <raid_type> <#raid_params> <raid_params> \ @@ -1231,6 +1285,11 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->private = rs; ti->num_flush_bios = 1; + /* + * Disable/enable discard support on RAID set. + */ + configure_discard_support(ti, rs); + mutex_lock(&rs->md.reconfig_mutex); ret = md_run(&rs->md); rs->md.in_sync = 0; /* Assume already marked dirty */ @@ -1652,7 +1711,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 5, 2}, + .version = {1, 6, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, @@ -1683,6 +1742,10 @@ static void __exit dm_raid_exit(void) module_init(dm_raid_init); module_exit(dm_raid_exit); +module_param(devices_handle_discard_safely, bool, 0644); +MODULE_PARM_DESC(devices_handle_discard_safely, + "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); + MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); MODULE_ALIAS("dm-raid1"); MODULE_ALIAS("dm-raid10"); diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 28a90122a5a8..87f86c77b094 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -548,7 +548,7 @@ void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, * A race condition can at worst result in the merged flag being * misrepresented, so we don't have to disable preemption here. */ - last = __this_cpu_ptr(stats->last); + last = raw_cpu_ptr(stats->last); stats_aux->merged = (bi_sector == (ACCESS_ONCE(last->last_sector) && ((bi_rw & (REQ_WRITE | REQ_DISCARD)) == diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index d1600d2aa2e2..f8b37d4c05d8 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -159,8 +159,10 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) sc->stripes_shift = __ffs(stripes); r = dm_set_target_max_io_len(ti, chunk_size); - if (r) + if (r) { + kfree(sc); return r; + } ti->num_flush_bios = stripes; ti->num_discard_bios = stripes; diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index 09a688b3d48c..50fca469cafd 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -137,13 +137,23 @@ static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr *bit *= sctx->region_table_entry_bits; } +static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr) +{ + unsigned long region_index; + unsigned bit; + + switch_get_position(sctx, region_nr, ®ion_index, &bit); + + return (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) & + ((1 << sctx->region_table_entry_bits) - 1); +} + /* * Find which path to use at given offset. */ static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset) { - unsigned long region_index; - unsigned bit, path_nr; + unsigned path_nr; sector_t p; p = offset; @@ -152,9 +162,7 @@ static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset) else sector_div(p, sctx->region_size); - switch_get_position(sctx, p, ®ion_index, &bit); - path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) & - ((1 << sctx->region_table_entry_bits) - 1); + path_nr = switch_region_table_read(sctx, p); /* This can only happen if the processor uses non-atomic stores. */ if (unlikely(path_nr >= sctx->nr_paths)) @@ -363,7 +371,7 @@ static __always_inline unsigned long parse_hex(const char **string) } static int process_set_region_mappings(struct switch_ctx *sctx, - unsigned argc, char **argv) + unsigned argc, char **argv) { unsigned i; unsigned long region_index = 0; @@ -372,6 +380,51 @@ static int process_set_region_mappings(struct switch_ctx *sctx, unsigned long path_nr; const char *string = argv[i]; + if ((*string & 0xdf) == 'R') { + unsigned long cycle_length, num_write; + + string++; + if (unlikely(*string == ',')) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + cycle_length = parse_hex(&string); + if (unlikely(*string != ',')) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + string++; + if (unlikely(!*string)) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + num_write = parse_hex(&string); + if (unlikely(*string)) { + DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); + return -EINVAL; + } + + if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) { + DMWARN("invalid set_region_mappings cycle length: %lu > %lu", + cycle_length - 1, region_index); + return -EINVAL; + } + if (unlikely(region_index + num_write < region_index) || + unlikely(region_index + num_write >= sctx->nr_regions)) { + DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu", + region_index, num_write, sctx->nr_regions); + return -EINVAL; + } + + while (num_write--) { + region_index++; + path_nr = switch_region_table_read(sctx, region_index - cycle_length); + switch_region_table_write(sctx, region_index, path_nr); + } + + continue; + } + if (*string == ':') region_index++; else { @@ -500,7 +553,7 @@ static int switch_iterate_devices(struct dm_target *ti, static struct target_type switch_target = { .name = "switch", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = switch_ctr, .dtr = switch_dtr, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 5f59f1e3e5b1..b2bd1ebf4562 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -210,15 +210,16 @@ int dm_table_create(struct dm_table **result, fmode_t mode, return 0; } -static void free_devices(struct list_head *devices) +static void free_devices(struct list_head *devices, struct mapped_device *md) { struct list_head *tmp, *next; list_for_each_safe(tmp, next, devices) { struct dm_dev_internal *dd = list_entry(tmp, struct dm_dev_internal, list); - DMWARN("dm_table_destroy: dm_put_device call missing for %s", - dd->dm_dev.name); + DMWARN("%s: dm_table_destroy: dm_put_device call missing for %s", + dm_device_name(md), dd->dm_dev->name); + dm_put_table_device(md, dd->dm_dev); kfree(dd); } } @@ -247,7 +248,7 @@ void dm_table_destroy(struct dm_table *t) vfree(t->highs); /* free the device list */ - free_devices(&t->devices); + free_devices(&t->devices, t->md); dm_free_md_mempools(t->mempools); @@ -262,53 +263,13 @@ static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev) struct dm_dev_internal *dd; list_for_each_entry (dd, l, list) - if (dd->dm_dev.bdev->bd_dev == dev) + if (dd->dm_dev->bdev->bd_dev == dev) return dd; return NULL; } /* - * Open a device so we can use it as a map destination. - */ -static int open_dev(struct dm_dev_internal *d, dev_t dev, - struct mapped_device *md) -{ - static char *_claim_ptr = "I belong to device-mapper"; - struct block_device *bdev; - - int r; - - BUG_ON(d->dm_dev.bdev); - - bdev = blkdev_get_by_dev(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - - r = bd_link_disk_holder(bdev, dm_disk(md)); - if (r) { - blkdev_put(bdev, d->dm_dev.mode | FMODE_EXCL); - return r; - } - - d->dm_dev.bdev = bdev; - return 0; -} - -/* - * Close a device that we've been using. - */ -static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) -{ - if (!d->dm_dev.bdev) - return; - - bd_unlink_disk_holder(d->dm_dev.bdev, dm_disk(md)); - blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL); - d->dm_dev.bdev = NULL; -} - -/* * If possible, this checks an area of a destination device is invalid. */ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, @@ -386,19 +347,17 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, struct mapped_device *md) { int r; - struct dm_dev_internal dd_new, dd_old; - - dd_new = dd_old = *dd; + struct dm_dev *old_dev, *new_dev; - dd_new.dm_dev.mode |= new_mode; - dd_new.dm_dev.bdev = NULL; + old_dev = dd->dm_dev; - r = open_dev(&dd_new, dd->dm_dev.bdev->bd_dev, md); + r = dm_get_table_device(md, dd->dm_dev->bdev->bd_dev, + dd->dm_dev->mode | new_mode, &new_dev); if (r) return r; - dd->dm_dev.mode |= new_mode; - close_dev(&dd_old, md); + dd->dm_dev = new_dev; + dm_put_table_device(md, old_dev); return 0; } @@ -440,27 +399,22 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, if (!dd) return -ENOMEM; - dd->dm_dev.mode = mode; - dd->dm_dev.bdev = NULL; - - if ((r = open_dev(dd, dev, t->md))) { + if ((r = dm_get_table_device(t->md, dev, mode, &dd->dm_dev))) { kfree(dd); return r; } - format_dev_t(dd->dm_dev.name, dev); - atomic_set(&dd->count, 0); list_add(&dd->list, &t->devices); - } else if (dd->dm_dev.mode != (mode | dd->dm_dev.mode)) { + } else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) { r = upgrade_mode(dd, mode, t->md); if (r) return r; } atomic_inc(&dd->count); - *result = &dd->dm_dev; + *result = dd->dm_dev; return 0; } EXPORT_SYMBOL(dm_get_device); @@ -505,11 +459,23 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, */ void dm_put_device(struct dm_target *ti, struct dm_dev *d) { - struct dm_dev_internal *dd = container_of(d, struct dm_dev_internal, - dm_dev); + int found = 0; + struct list_head *devices = &ti->table->devices; + struct dm_dev_internal *dd; + list_for_each_entry(dd, devices, list) { + if (dd->dm_dev == d) { + found = 1; + break; + } + } + if (!found) { + DMWARN("%s: device %s not in table devices list", + dm_device_name(ti->table->md), d->name); + return; + } if (atomic_dec_and_test(&dd->count)) { - close_dev(dd, ti->table->md); + dm_put_table_device(ti->table->md, d); list_del(&dd->list); kfree(dd); } @@ -906,7 +872,7 @@ static int dm_table_set_type(struct dm_table *t) /* Non-request-stackable devices can't be used for request-based dm */ devices = dm_table_get_devices(t); list_for_each_entry(dd, devices, list) { - if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev.bdev))) { + if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev->bdev))) { DMWARN("table load rejected: including" " non-request-stackable devices"); return -EINVAL; @@ -1043,7 +1009,7 @@ static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t, struct gendisk *prev_disk = NULL, *template_disk = NULL; list_for_each_entry(dd, devices, list) { - template_disk = dd->dm_dev.bdev->bd_disk; + template_disk = dd->dm_dev->bdev->bd_disk; if (!blk_get_integrity(template_disk)) goto no_integrity; if (!match_all && !blk_integrity_is_initialized(template_disk)) @@ -1386,6 +1352,14 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, return q && !blk_queue_add_random(q); } +static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags); +} + static bool dm_table_all_devices_attribute(struct dm_table *t, iterate_devices_callout_fn func) { @@ -1430,6 +1404,43 @@ static bool dm_table_supports_write_same(struct dm_table *t) return true; } +static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && blk_queue_discard(q); +} + +static bool dm_table_supports_discards(struct dm_table *t) +{ + struct dm_target *ti; + unsigned i = 0; + + /* + * Unless any target used by the table set discards_supported, + * require at least one underlying device to support discards. + * t->devices includes internal dm devices such as mirror logs + * so we need to use iterate_devices here, which targets + * supporting discard selectively must provide. + */ + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (!ti->num_discard_bios) + continue; + + if (ti->discards_supported) + return 1; + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, device_discard_capable, NULL)) + return 1; + } + + return 0; +} + void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { @@ -1464,6 +1475,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (!dm_table_supports_write_same(t)) q->limits.max_write_same_sectors = 0; + if (dm_table_all_devices_attribute(t, queue_supports_sg_merge)) + queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); + else + queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); + dm_table_set_integrity(t); /* @@ -1579,7 +1595,7 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) int r = 0; list_for_each_entry(dd, devices, list) { - struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); + struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); char b[BDEVNAME_SIZE]; if (likely(q)) @@ -1587,7 +1603,7 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) else DMWARN_LIMIT("%s: any_congested: nonexistent device %s", dm_device_name(t->md), - bdevname(dd->dm_dev.bdev, b)); + bdevname(dd->dm_dev->bdev, b)); } list_for_each_entry(cb, &t->target_callbacks, list) @@ -1636,39 +1652,3 @@ void dm_table_run_md_queue_async(struct dm_table *t) } EXPORT_SYMBOL(dm_table_run_md_queue_async); -static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - struct request_queue *q = bdev_get_queue(dev->bdev); - - return q && blk_queue_discard(q); -} - -bool dm_table_supports_discards(struct dm_table *t) -{ - struct dm_target *ti; - unsigned i = 0; - - /* - * Unless any target used by the table set discards_supported, - * require at least one underlying device to support discards. - * t->devices includes internal dm devices such as mirror logs - * so we need to use iterate_devices here, which targets - * supporting discard selectively must provide. - */ - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); - - if (!ti->num_discard_bios) - continue; - - if (ti->discards_supported) - return 1; - - if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, device_discard_capable, NULL)) - return 1; - } - - return 0; -} diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index fc9c848a60c9..0f86d802b533 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -227,6 +227,7 @@ struct thin_c { struct list_head list; struct dm_dev *pool_dev; struct dm_dev *origin_dev; + sector_t origin_size; dm_thin_id dev_id; struct pool *pool; @@ -554,11 +555,16 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio, struct dm_thin_new_mapping { struct list_head list; - bool quiesced:1; - bool prepared:1; bool pass_discard:1; bool definitely_not_shared:1; + /* + * Track quiescing, copying and zeroing preparation actions. When this + * counter hits zero the block is prepared and can be inserted into the + * btree. + */ + atomic_t prepare_actions; + int err; struct thin_c *tc; dm_block_t virt_block; @@ -575,43 +581,41 @@ struct dm_thin_new_mapping { bio_end_io_t *saved_bi_end_io; }; -static void __maybe_add_mapping(struct dm_thin_new_mapping *m) +static void __complete_mapping_preparation(struct dm_thin_new_mapping *m) { struct pool *pool = m->tc->pool; - if (m->quiesced && m->prepared) { + if (atomic_dec_and_test(&m->prepare_actions)) { list_add_tail(&m->list, &pool->prepared_mappings); wake_worker(pool); } } -static void copy_complete(int read_err, unsigned long write_err, void *context) +static void complete_mapping_preparation(struct dm_thin_new_mapping *m) { unsigned long flags; - struct dm_thin_new_mapping *m = context; struct pool *pool = m->tc->pool; - m->err = read_err || write_err ? -EIO : 0; - spin_lock_irqsave(&pool->lock, flags); - m->prepared = true; - __maybe_add_mapping(m); + __complete_mapping_preparation(m); spin_unlock_irqrestore(&pool->lock, flags); } +static void copy_complete(int read_err, unsigned long write_err, void *context) +{ + struct dm_thin_new_mapping *m = context; + + m->err = read_err || write_err ? -EIO : 0; + complete_mapping_preparation(m); +} + static void overwrite_endio(struct bio *bio, int err) { - unsigned long flags; struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); struct dm_thin_new_mapping *m = h->overwrite_mapping; - struct pool *pool = m->tc->pool; m->err = err; - - spin_lock_irqsave(&pool->lock, flags); - m->prepared = true; - __maybe_add_mapping(m); - spin_unlock_irqrestore(&pool->lock, flags); + complete_mapping_preparation(m); } /*----------------------------------------------------------------*/ @@ -821,10 +825,31 @@ static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool) return m; } +static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m, + sector_t begin, sector_t end) +{ + int r; + struct dm_io_region to; + + to.bdev = tc->pool_dev->bdev; + to.sector = begin; + to.count = end - begin; + + r = dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m); + if (r < 0) { + DMERR_LIMIT("dm_kcopyd_zero() failed"); + copy_complete(1, 1, m); + } +} + +/* + * A partial copy also needs to zero the uncopied region. + */ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, struct dm_dev *origin, dm_block_t data_origin, dm_block_t data_dest, - struct dm_bio_prison_cell *cell, struct bio *bio) + struct dm_bio_prison_cell *cell, struct bio *bio, + sector_t len) { int r; struct pool *pool = tc->pool; @@ -835,8 +860,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, m->data_block = data_dest; m->cell = cell; + /* + * quiesce action + copy action + an extra reference held for the + * duration of this function (we may need to inc later for a + * partial zero). + */ + atomic_set(&m->prepare_actions, 3); + if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list)) - m->quiesced = true; + complete_mapping_preparation(m); /* already quiesced */ /* * IO to pool_dev remaps to the pool target's data_dev. @@ -857,20 +889,38 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, from.bdev = origin->bdev; from.sector = data_origin * pool->sectors_per_block; - from.count = pool->sectors_per_block; + from.count = len; to.bdev = tc->pool_dev->bdev; to.sector = data_dest * pool->sectors_per_block; - to.count = pool->sectors_per_block; + to.count = len; r = dm_kcopyd_copy(pool->copier, &from, 1, &to, 0, copy_complete, m); if (r < 0) { - mempool_free(m, pool->mapping_pool); DMERR_LIMIT("dm_kcopyd_copy() failed"); - cell_error(pool, cell); + copy_complete(1, 1, m); + + /* + * We allow the zero to be issued, to simplify the + * error path. Otherwise we'd need to start + * worrying about decrementing the prepare_actions + * counter. + */ + } + + /* + * Do we need to zero a tail region? + */ + if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) { + atomic_inc(&m->prepare_actions); + ll_zero(tc, m, + data_dest * pool->sectors_per_block + len, + (data_dest + 1) * pool->sectors_per_block); } } + + complete_mapping_preparation(m); /* drop our ref */ } static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, @@ -878,15 +928,8 @@ static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, struct dm_bio_prison_cell *cell, struct bio *bio) { schedule_copy(tc, virt_block, tc->pool_dev, - data_origin, data_dest, cell, bio); -} - -static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, - dm_block_t data_dest, - struct dm_bio_prison_cell *cell, struct bio *bio) -{ - schedule_copy(tc, virt_block, tc->origin_dev, - virt_block, data_dest, cell, bio); + data_origin, data_dest, cell, bio, + tc->pool->sectors_per_block); } static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, @@ -896,8 +939,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, struct pool *pool = tc->pool; struct dm_thin_new_mapping *m = get_next_mapping(pool); - m->quiesced = true; - m->prepared = false; + atomic_set(&m->prepare_actions, 1); /* no need to quiesce */ m->tc = tc; m->virt_block = virt_block; m->data_block = data_block; @@ -919,21 +961,33 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); inc_all_io_entry(pool, bio); remap_and_issue(tc, bio, data_block); - } else { - int r; - struct dm_io_region to; - to.bdev = tc->pool_dev->bdev; - to.sector = data_block * pool->sectors_per_block; - to.count = pool->sectors_per_block; + } else + ll_zero(tc, m, + data_block * pool->sectors_per_block, + (data_block + 1) * pool->sectors_per_block); +} - r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m); - if (r < 0) { - mempool_free(m, pool->mapping_pool); - DMERR_LIMIT("dm_kcopyd_zero() failed"); - cell_error(pool, cell); - } - } +static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, + dm_block_t data_dest, + struct dm_bio_prison_cell *cell, struct bio *bio) +{ + struct pool *pool = tc->pool; + sector_t virt_block_begin = virt_block * pool->sectors_per_block; + sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block; + + if (virt_block_end <= tc->origin_size) + schedule_copy(tc, virt_block, tc->origin_dev, + virt_block, data_dest, cell, bio, + pool->sectors_per_block); + + else if (virt_block_begin < tc->origin_size) + schedule_copy(tc, virt_block, tc->origin_dev, + virt_block, data_dest, cell, bio, + tc->origin_size - virt_block_begin); + + else + schedule_zero(tc, virt_block, data_dest, cell, bio); } /* @@ -1315,7 +1369,18 @@ static void process_bio(struct thin_c *tc, struct bio *bio) inc_all_io_entry(pool, bio); cell_defer_no_holder(tc, cell); - remap_to_origin_and_issue(tc, bio); + if (bio_end_sector(bio) <= tc->origin_size) + remap_to_origin_and_issue(tc, bio); + + else if (bio->bi_iter.bi_sector < tc->origin_size) { + zero_fill_bio(bio); + bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT; + remap_to_origin_and_issue(tc, bio); + + } else { + zero_fill_bio(bio); + bio_endio(bio, 0); + } } else provision_block(tc, bio, block, cell); break; @@ -1871,6 +1936,14 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } + /* + * We must hold the virtual cell before doing the lookup, otherwise + * there's a race with discard. + */ + build_virtual_key(tc->td, block, &key); + if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result)) + return DM_MAPIO_SUBMITTED; + r = dm_thin_find_block(td, block, 0, &result); /* @@ -1894,13 +1967,10 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * shared flag will be set in their case. */ thin_defer_bio(tc, bio); + cell_defer_no_holder_no_free(tc, &cell1); return DM_MAPIO_SUBMITTED; } - build_virtual_key(tc->td, block, &key); - if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result)) - return DM_MAPIO_SUBMITTED; - build_data_key(tc->td, result.block, &key); if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) { cell_defer_no_holder_no_free(tc, &cell1); @@ -1921,6 +1991,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * of doing so. */ handle_unserviceable_bio(tc->pool, bio); + cell_defer_no_holder_no_free(tc, &cell1); return DM_MAPIO_SUBMITTED; } /* fall through */ @@ -1931,6 +2002,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * provide the hint to load the metadata into cache. */ thin_defer_bio(tc, bio); + cell_defer_no_holder_no_free(tc, &cell1); return DM_MAPIO_SUBMITTED; default: @@ -1940,6 +2012,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * pool is switched to fail-io mode. */ bio_io_error(bio); + cell_defer_no_holder_no_free(tc, &cell1); return DM_MAPIO_SUBMITTED; } } @@ -3112,7 +3185,7 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) */ if (io_opt_sectors < pool->sectors_per_block || do_div(io_opt_sectors, pool->sectors_per_block)) { - blk_limits_io_min(limits, 0); + blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT); blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); } @@ -3141,7 +3214,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 12, 0}, + .version = {1, 13, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -3361,8 +3434,7 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err) spin_lock_irqsave(&pool->lock, flags); list_for_each_entry_safe(m, tmp, &work, list) { list_del(&m->list); - m->quiesced = true; - __maybe_add_mapping(m); + __complete_mapping_preparation(m); } spin_unlock_irqrestore(&pool->lock, flags); } @@ -3401,6 +3473,16 @@ static void thin_postsuspend(struct dm_target *ti) noflush_work(tc, do_noflush_stop); } +static int thin_preresume(struct dm_target *ti) +{ + struct thin_c *tc = ti->private; + + if (tc->origin_dev) + tc->origin_size = get_dev_size(tc->origin_dev->bdev); + + return 0; +} + /* * <nr mapped sectors> <highest mapped sector> */ @@ -3483,12 +3565,13 @@ static int thin_iterate_devices(struct dm_target *ti, static struct target_type thin_target = { .name = "thin", - .version = {1, 12, 0}, + .version = {1, 13, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, .map = thin_map, .end_io = thin_endio, + .preresume = thin_preresume, .presuspend = thin_presuspend, .postsuspend = thin_postsuspend, .status = thin_status, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 32b958dbc499..58f3927fd7cc 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -142,6 +142,9 @@ struct mapped_device { */ struct dm_table *map; + struct list_head table_devices; + struct mutex table_devices_lock; + unsigned long flags; struct request_queue *queue; @@ -212,6 +215,12 @@ struct dm_md_mempools { struct bio_set *bs; }; +struct table_device { + struct list_head list; + atomic_t count; + struct dm_dev dm_dev; +}; + #define RESERVED_BIO_BASED_IOS 16 #define RESERVED_REQUEST_BASED_IOS 256 #define RESERVED_MAX_IOS 1024 @@ -670,6 +679,120 @@ static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) } /* + * Open a table device so we can use it as a map destination. + */ +static int open_table_device(struct table_device *td, dev_t dev, + struct mapped_device *md) +{ + static char *_claim_ptr = "I belong to device-mapper"; + struct block_device *bdev; + + int r; + + BUG_ON(td->dm_dev.bdev); + + bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + r = bd_link_disk_holder(bdev, dm_disk(md)); + if (r) { + blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL); + return r; + } + + td->dm_dev.bdev = bdev; + return 0; +} + +/* + * Close a table device that we've been using. + */ +static void close_table_device(struct table_device *td, struct mapped_device *md) +{ + if (!td->dm_dev.bdev) + return; + + bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); + blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); + td->dm_dev.bdev = NULL; +} + +static struct table_device *find_table_device(struct list_head *l, dev_t dev, + fmode_t mode) { + struct table_device *td; + + list_for_each_entry(td, l, list) + if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) + return td; + + return NULL; +} + +int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, + struct dm_dev **result) { + int r; + struct table_device *td; + + mutex_lock(&md->table_devices_lock); + td = find_table_device(&md->table_devices, dev, mode); + if (!td) { + td = kmalloc(sizeof(*td), GFP_KERNEL); + if (!td) { + mutex_unlock(&md->table_devices_lock); + return -ENOMEM; + } + + td->dm_dev.mode = mode; + td->dm_dev.bdev = NULL; + + if ((r = open_table_device(td, dev, md))) { + mutex_unlock(&md->table_devices_lock); + kfree(td); + return r; + } + + format_dev_t(td->dm_dev.name, dev); + + atomic_set(&td->count, 0); + list_add(&td->list, &md->table_devices); + } + atomic_inc(&td->count); + mutex_unlock(&md->table_devices_lock); + + *result = &td->dm_dev; + return 0; +} +EXPORT_SYMBOL_GPL(dm_get_table_device); + +void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) +{ + struct table_device *td = container_of(d, struct table_device, dm_dev); + + mutex_lock(&md->table_devices_lock); + if (atomic_dec_and_test(&td->count)) { + close_table_device(td, md); + list_del(&td->list); + kfree(td); + } + mutex_unlock(&md->table_devices_lock); +} +EXPORT_SYMBOL(dm_put_table_device); + +static void free_table_devices(struct list_head *devices) +{ + struct list_head *tmp, *next; + + list_for_each_safe(tmp, next, devices) { + struct table_device *td = list_entry(tmp, struct table_device, list); + + DMWARN("dm_destroy: %s still exists with %d references", + td->dm_dev.name, atomic_read(&td->count)); + kfree(td); + } +} + +/* * Get the geometry associated with a dm device */ int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) @@ -1249,13 +1372,13 @@ static void clone_bio(struct dm_target_io *tio, struct bio *bio, } static struct dm_target_io *alloc_tio(struct clone_info *ci, - struct dm_target *ti, int nr_iovecs, + struct dm_target *ti, unsigned target_bio_nr) { struct dm_target_io *tio; struct bio *clone; - clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, ci->md->bs); + clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); tio = container_of(clone, struct dm_target_io, clone); tio->io = ci->io; @@ -1269,17 +1392,12 @@ static void __clone_and_map_simple_bio(struct clone_info *ci, struct dm_target *ti, unsigned target_bio_nr, unsigned *len) { - struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr); + struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr); struct bio *clone = &tio->clone; tio->len_ptr = len; - /* - * Discard requests require the bio's inline iovecs be initialized. - * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush - * and discard, so no need for concern about wasted bvec allocations. - */ - __bio_clone_fast(clone, ci->bio); + __bio_clone_fast(clone, ci->bio); if (len) bio_setup_sector(clone, ci->sector, *len); @@ -1322,7 +1440,7 @@ static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti num_target_bios = ti->num_write_bios(ti, bio); for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { - tio = alloc_tio(ci, ti, 0, target_bio_nr); + tio = alloc_tio(ci, ti, target_bio_nr); tio->len_ptr = len; clone_bio(tio, bio, sector, *len); __map_bio(tio); @@ -1949,12 +2067,14 @@ static struct mapped_device *alloc_dev(int minor) md->type = DM_TYPE_NONE; mutex_init(&md->suspend_lock); mutex_init(&md->type_lock); + mutex_init(&md->table_devices_lock); spin_lock_init(&md->deferred_lock); atomic_set(&md->holders, 1); atomic_set(&md->open_count, 0); atomic_set(&md->event_nr, 0); atomic_set(&md->uevent_seq, 0); INIT_LIST_HEAD(&md->uevent_list); + INIT_LIST_HEAD(&md->table_devices); spin_lock_init(&md->uevent_lock); md->queue = blk_alloc_queue(GFP_KERNEL); @@ -2040,6 +2160,7 @@ static void free_dev(struct mapped_device *md) blk_integrity_unregister(md->disk); del_gendisk(md->disk); cleanup_srcu_struct(&md->io_barrier); + free_table_devices(&md->table_devices); free_minor(minor); spin_lock(&_minor_lock); @@ -2900,7 +3021,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u if (!pools->io_pool) goto out; - pools->bs = bioset_create(pool_size, front_pad); + pools->bs = bioset_create_nobvec(pool_size, front_pad); if (!pools->bs) goto out; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index ed76126aac54..988c7fb7b145 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -44,7 +44,7 @@ struct dm_dev_internal { struct list_head list; atomic_t count; - struct dm_dev dm_dev; + struct dm_dev *dm_dev; }; struct dm_table; @@ -72,7 +72,6 @@ int dm_table_any_busy_target(struct dm_table *t); unsigned dm_table_get_type(struct dm_table *t); struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); -bool dm_table_supports_discards(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); @@ -189,6 +188,9 @@ int dm_cancel_deferred_remove(struct mapped_device *md); int dm_request_based(struct mapped_device *md); sector_t dm_get_size(struct mapped_device *md); struct request_queue *dm_get_md_queue(struct mapped_device *md); +int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, + struct dm_dev **result); +void dm_put_table_device(struct mapped_device *md, struct dm_dev *d); struct dm_stats *dm_get_stats(struct mapped_device *md); int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 56f534b4a2d2..64713b77df1c 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -10,10 +10,10 @@ it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. - + You should have received a copy of the GNU General Public License (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include <linux/blkdev.h> @@ -25,7 +25,7 @@ #include "linear.h" /* - * find which device holds a particular offset + * find which device holds a particular offset */ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) { @@ -355,7 +355,6 @@ static void linear_status (struct seq_file *seq, struct mddev *mddev) seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); } - static struct md_personality linear_personality = { .name = "linear", @@ -379,7 +378,6 @@ static void linear_exit (void) unregister_md_personality (&linear_personality); } - module_init(linear_init); module_exit(linear_exit); MODULE_LICENSE("GPL"); diff --git a/drivers/md/md.c b/drivers/md/md.c index 32fc19c540d4..9233c71138f1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1,6 +1,6 @@ /* md.c : Multiple Devices driver for Linux - Copyright (C) 1998, 1999, 2000 Ingo Molnar + Copyright (C) 1998, 1999, 2000 Ingo Molnar completely rewritten, based on the MD driver code from Marc Zyngier @@ -66,8 +66,6 @@ static void autostart_arrays(int part); static LIST_HEAD(pers_list); static DEFINE_SPINLOCK(pers_lock); -static void md_print_devices(void); - static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; static struct workqueue_struct *md_misc_wq; @@ -75,8 +73,6 @@ static struct workqueue_struct *md_misc_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); -#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } - /* * Default number of read corrections we'll attempt on an rdev * before ejecting it from the array. We divide the read error @@ -218,7 +214,6 @@ static void md_new_event_inintr(struct mddev *mddev) static LIST_HEAD(all_mddevs); static DEFINE_SPINLOCK(all_mddevs_lock); - /* * iterates through all used mddevs in the system. * We take care to grab the all_mddevs_lock whenever navigating @@ -228,7 +223,7 @@ static DEFINE_SPINLOCK(all_mddevs_lock); */ #define for_each_mddev(_mddev,_tmp) \ \ - for (({ spin_lock(&all_mddevs_lock); \ + for (({ spin_lock(&all_mddevs_lock); \ _tmp = all_mddevs.next; \ _mddev = NULL;}); \ ({ if (_tmp != &all_mddevs) \ @@ -241,7 +236,6 @@ static DEFINE_SPINLOCK(all_mddevs_lock); _tmp = _tmp->next;}) \ ) - /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. @@ -488,7 +482,7 @@ void mddev_init(struct mddev *mddev) } EXPORT_SYMBOL_GPL(mddev_init); -static struct mddev * mddev_find(dev_t unit) +static struct mddev *mddev_find(dev_t unit) { struct mddev *mddev, *new = NULL; @@ -530,7 +524,7 @@ static struct mddev * mddev_find(dev_t unit) kfree(new); return NULL; } - + is_free = 1; list_for_each_entry(mddev, &all_mddevs, all_mddevs) if (mddev->unit == dev) { @@ -562,7 +556,7 @@ static struct mddev * mddev_find(dev_t unit) goto retry; } -static inline int __must_check mddev_lock(struct mddev * mddev) +static inline int __must_check mddev_lock(struct mddev *mddev) { return mutex_lock_interruptible(&mddev->reconfig_mutex); } @@ -570,7 +564,7 @@ static inline int __must_check mddev_lock(struct mddev * mddev) /* Sometimes we need to take the lock in a situation where * failure due to interrupts is not acceptable. */ -static inline void mddev_lock_nointr(struct mddev * mddev) +static inline void mddev_lock_nointr(struct mddev *mddev) { mutex_lock(&mddev->reconfig_mutex); } @@ -580,14 +574,14 @@ static inline int mddev_is_locked(struct mddev *mddev) return mutex_is_locked(&mddev->reconfig_mutex); } -static inline int mddev_trylock(struct mddev * mddev) +static inline int mddev_trylock(struct mddev *mddev) { return mutex_trylock(&mddev->reconfig_mutex); } static struct attribute_group md_redundancy_group; -static void mddev_unlock(struct mddev * mddev) +static void mddev_unlock(struct mddev *mddev) { if (mddev->to_remove) { /* These cannot be removed under reconfig_mutex as @@ -630,17 +624,6 @@ static void mddev_unlock(struct mddev * mddev) spin_unlock(&pers_lock); } -static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr) -{ - struct md_rdev *rdev; - - rdev_for_each(rdev, mddev) - if (rdev->desc_nr == nr) - return rdev; - - return NULL; -} - static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) { struct md_rdev *rdev; @@ -693,11 +676,8 @@ static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) return MD_NEW_SIZE_SECTORS(num_sectors); } -static int alloc_disk_sb(struct md_rdev * rdev) +static int alloc_disk_sb(struct md_rdev *rdev) { - if (rdev->sb_page) - MD_BUG(); - rdev->sb_page = alloc_page(GFP_KERNEL); if (!rdev->sb_page) { printk(KERN_ALERT "md: out of memory.\n"); @@ -766,14 +746,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, void md_super_wait(struct mddev *mddev) { /* wait for all superblock writes that were scheduled to complete */ - DEFINE_WAIT(wq); - for(;;) { - prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); - if (atomic_read(&mddev->pending_writes)==0) - break; - schedule(); - } - finish_wait(&mddev->sb_wait, &wq); + wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); } int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, @@ -801,17 +774,13 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, } EXPORT_SYMBOL_GPL(sync_page_io); -static int read_disk_sb(struct md_rdev * rdev, int size) +static int read_disk_sb(struct md_rdev *rdev, int size) { char b[BDEVNAME_SIZE]; - if (!rdev->sb_page) { - MD_BUG(); - return -EINVAL; - } + if (rdev->sb_loaded) return 0; - if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true)) goto fail; rdev->sb_loaded = 1; @@ -825,7 +794,7 @@ fail: static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) { - return sb1->set_uuid0 == sb2->set_uuid0 && + return sb1->set_uuid0 == sb2->set_uuid0 && sb1->set_uuid1 == sb2->set_uuid1 && sb1->set_uuid2 == sb2->set_uuid2 && sb1->set_uuid3 == sb2->set_uuid3; @@ -861,14 +830,13 @@ abort: return ret; } - static u32 md_csum_fold(u32 csum) { csum = (csum & 0xffff) + (csum >> 16); return (csum & 0xffff) + (csum >> 16); } -static unsigned int calc_sb_csum(mdp_super_t * sb) +static unsigned int calc_sb_csum(mdp_super_t *sb) { u64 newcsum = 0; u32 *sb32 = (u32*)sb; @@ -882,7 +850,6 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) newcsum += sb32[i]; csum = (newcsum & 0xffffffff) + (newcsum>>32); - #ifdef CONFIG_ALPHA /* This used to use csum_partial, which was wrong for several * reasons including that different results are returned on @@ -899,7 +866,6 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) return csum; } - /* * Handle superblock details. * We want to be able to handle multiple superblock formats @@ -965,7 +931,7 @@ int md_check_no_bitmap(struct mddev *mddev) EXPORT_SYMBOL(md_check_no_bitmap); /* - * load_super for 0.90.0 + * load_super for 0.90.0 */ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) { @@ -1044,7 +1010,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor ev2 = md_event(refsb); if (ev1 > ev2) ret = 1; - else + else ret = 0; } rdev->sectors = rdev->sb_start; @@ -1118,7 +1084,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) if (sb->state & (1<<MD_SB_CLEAN)) mddev->recovery_cp = MaxSector; else { - if (sb->events_hi == sb->cp_events_hi && + if (sb->events_hi == sb->cp_events_hi && sb->events_lo == sb->cp_events_lo) { mddev->recovery_cp = sb->recovery_cp; } else @@ -1146,7 +1112,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) ++ev1; if (sb->disks[rdev->desc_nr].state & ( (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) - if (ev1 < mddev->events) + if (ev1 < mddev->events) return -EINVAL; } else if (mddev->bitmap) { /* if adding to array with a bitmap, then we can accept an @@ -1197,7 +1163,6 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) struct md_rdev *rdev2; int next_spare = mddev->raid_disks; - /* make rdev->sb match mddev data.. * * 1/ zero out disks @@ -1366,7 +1331,7 @@ super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) * version 1 superblock */ -static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) +static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) { __le32 disk_csum; u32 csum; @@ -1430,7 +1395,6 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ ret = read_disk_sb(rdev, 4096); if (ret) return ret; - sb = page_address(rdev->sb_page); if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || @@ -1817,7 +1781,7 @@ retry: for (i=0; i<max_dev;i++) sb->dev_roles[i] = cpu_to_le16(0xfffe); - + rdev_for_each(rdev2, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) @@ -2033,18 +1997,13 @@ void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) } EXPORT_SYMBOL(md_integrity_add_rdev); -static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev) +static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) { char b[BDEVNAME_SIZE]; struct kobject *ko; char *s; int err; - if (rdev->mddev) { - MD_BUG(); - return -EINVAL; - } - /* prevent duplicates */ if (find_rdev(mddev, rdev->bdev->bd_dev)) return -EEXIST; @@ -2067,16 +2026,21 @@ static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev) * If it is -1, assign a free number, else * check number is not in use */ + rcu_read_lock(); if (rdev->desc_nr < 0) { int choice = 0; - if (mddev->pers) choice = mddev->raid_disks; - while (find_rdev_nr(mddev, choice)) + if (mddev->pers) + choice = mddev->raid_disks; + while (find_rdev_nr_rcu(mddev, choice)) choice++; rdev->desc_nr = choice; } else { - if (find_rdev_nr(mddev, rdev->desc_nr)) + if (find_rdev_nr_rcu(mddev, rdev->desc_nr)) { + rcu_read_unlock(); return -EBUSY; + } } + rcu_read_unlock(); if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { printk(KERN_WARNING "md: %s: array is limited to %d devices\n", mdname(mddev), mddev->max_disks); @@ -2118,13 +2082,10 @@ static void md_delayed_delete(struct work_struct *ws) kobject_put(&rdev->kobj); } -static void unbind_rdev_from_array(struct md_rdev * rdev) +static void unbind_rdev_from_array(struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; - if (!rdev->mddev) { - MD_BUG(); - return; - } + bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); @@ -2169,20 +2130,17 @@ static void unlock_rdev(struct md_rdev *rdev) { struct block_device *bdev = rdev->bdev; rdev->bdev = NULL; - if (!bdev) - MD_BUG(); blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } void md_autodetect_dev(dev_t dev); -static void export_rdev(struct md_rdev * rdev) +static void export_rdev(struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; + printk(KERN_INFO "md: export_rdev(%s)\n", bdevname(rdev->bdev,b)); - if (rdev->mddev) - MD_BUG(); md_rdev_clear(rdev); #ifndef MODULE if (test_bit(AutoDetected, &rdev->flags)) @@ -2192,7 +2150,7 @@ static void export_rdev(struct md_rdev * rdev) kobject_put(&rdev->kobj); } -static void kick_rdev_from_array(struct md_rdev * rdev) +static void kick_rdev_from_array(struct md_rdev *rdev) { unbind_rdev_from_array(rdev); export_rdev(rdev); @@ -2200,153 +2158,18 @@ static void kick_rdev_from_array(struct md_rdev * rdev) static void export_array(struct mddev *mddev) { - struct md_rdev *rdev, *tmp; + struct md_rdev *rdev; - rdev_for_each_safe(rdev, tmp, mddev) { - if (!rdev->mddev) { - MD_BUG(); - continue; - } + while (!list_empty(&mddev->disks)) { + rdev = list_first_entry(&mddev->disks, struct md_rdev, + same_set); kick_rdev_from_array(rdev); } - if (!list_empty(&mddev->disks)) - MD_BUG(); mddev->raid_disks = 0; mddev->major_version = 0; } -static void print_desc(mdp_disk_t *desc) -{ - printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, - desc->major,desc->minor,desc->raid_disk,desc->state); -} - -static void print_sb_90(mdp_super_t *sb) -{ - int i; - - printk(KERN_INFO - "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", - sb->major_version, sb->minor_version, sb->patch_version, - sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, - sb->ctime); - printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", - sb->level, sb->size, sb->nr_disks, sb->raid_disks, - sb->md_minor, sb->layout, sb->chunk_size); - printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" - " FD:%d SD:%d CSUM:%08x E:%08lx\n", - sb->utime, sb->state, sb->active_disks, sb->working_disks, - sb->failed_disks, sb->spare_disks, - sb->sb_csum, (unsigned long)sb->events_lo); - - printk(KERN_INFO); - for (i = 0; i < MD_SB_DISKS; i++) { - mdp_disk_t *desc; - - desc = sb->disks + i; - if (desc->number || desc->major || desc->minor || - desc->raid_disk || (desc->state && (desc->state != 4))) { - printk(" D %2d: ", i); - print_desc(desc); - } - } - printk(KERN_INFO "md: THIS: "); - print_desc(&sb->this_disk); -} - -static void print_sb_1(struct mdp_superblock_1 *sb) -{ - __u8 *uuid; - - uuid = sb->set_uuid; - printk(KERN_INFO - "md: SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n" - "md: Name: \"%s\" CT:%llu\n", - le32_to_cpu(sb->major_version), - le32_to_cpu(sb->feature_map), - uuid, - sb->set_name, - (unsigned long long)le64_to_cpu(sb->ctime) - & MD_SUPERBLOCK_1_TIME_SEC_MASK); - - uuid = sb->device_uuid; - printk(KERN_INFO - "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" - " RO:%llu\n" - "md: Dev:%08x UUID: %pU\n" - "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" - "md: (MaxDev:%u) \n", - le32_to_cpu(sb->level), - (unsigned long long)le64_to_cpu(sb->size), - le32_to_cpu(sb->raid_disks), - le32_to_cpu(sb->layout), - le32_to_cpu(sb->chunksize), - (unsigned long long)le64_to_cpu(sb->data_offset), - (unsigned long long)le64_to_cpu(sb->data_size), - (unsigned long long)le64_to_cpu(sb->super_offset), - (unsigned long long)le64_to_cpu(sb->recovery_offset), - le32_to_cpu(sb->dev_number), - uuid, - sb->devflags, - (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, - (unsigned long long)le64_to_cpu(sb->events), - (unsigned long long)le64_to_cpu(sb->resync_offset), - le32_to_cpu(sb->sb_csum), - le32_to_cpu(sb->max_dev) - ); -} - -static void print_rdev(struct md_rdev *rdev, int major_version) -{ - char b[BDEVNAME_SIZE]; - printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", - bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors, - test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), - rdev->desc_nr); - if (rdev->sb_loaded) { - printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); - switch (major_version) { - case 0: - print_sb_90(page_address(rdev->sb_page)); - break; - case 1: - print_sb_1(page_address(rdev->sb_page)); - break; - } - } else - printk(KERN_INFO "md: no rdev superblock!\n"); -} - -static void md_print_devices(void) -{ - struct list_head *tmp; - struct md_rdev *rdev; - struct mddev *mddev; - char b[BDEVNAME_SIZE]; - - printk("\n"); - printk("md: **********************************\n"); - printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); - printk("md: **********************************\n"); - for_each_mddev(mddev, tmp) { - - if (mddev->bitmap) - bitmap_print_sb(mddev->bitmap); - else - printk("%s: ", mdname(mddev)); - rdev_for_each(rdev, mddev) - printk("<%s>", bdevname(rdev->bdev,b)); - printk("\n"); - - rdev_for_each(rdev, mddev) - print_rdev(rdev, mddev->major_version); - } - printk("md: **********************************\n"); - printk("\n"); -} - - -static void sync_sbs(struct mddev * mddev, int nospares) +static void sync_sbs(struct mddev *mddev, int nospares) { /* Update each superblock (in-memory image), but * if we are allowed to, skip spares which already @@ -2369,7 +2192,7 @@ static void sync_sbs(struct mddev * mddev, int nospares) } } -static void md_update_sb(struct mddev * mddev, int force_change) +static void md_update_sb(struct mddev *mddev, int force_change) { struct md_rdev *rdev; int sync_req; @@ -2390,7 +2213,7 @@ repeat: mddev->curr_resync_completed > rdev->recovery_offset) rdev->recovery_offset = mddev->curr_resync_completed; - } + } if (!mddev->persistent) { clear_bit(MD_CHANGE_CLEAN, &mddev->flags); clear_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -2453,15 +2276,12 @@ repeat: mddev->can_decrease_events = nospares; } - if (!mddev->events) { - /* - * oops, this 64-bit counter should never wrap. - * Either we are in around ~1 trillion A.C., assuming - * 1 reboot per second, or we have a bug: - */ - MD_BUG(); - mddev->events --; - } + /* + * This 64-bit counter should never wrap. + * Either we are in around ~1 trillion A.C., assuming + * 1 reboot per second, or we have a bug... + */ + WARN_ON(mddev->events == 0); rdev_for_each(rdev, mddev) { if (rdev->badblocks.changed) @@ -2668,10 +2488,12 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) set_bit(In_sync, &rdev->flags); err = 0; } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) { - clear_bit(In_sync, &rdev->flags); - rdev->saved_raid_disk = rdev->raid_disk; - rdev->raid_disk = -1; - err = 0; + if (rdev->mddev->pers == NULL) { + clear_bit(In_sync, &rdev->flags); + rdev->saved_raid_disk = rdev->raid_disk; + rdev->raid_disk = -1; + err = 0; + } } else if (cmd_match(buf, "write_error")) { set_bit(WriteErrorSeen, &rdev->flags); err = 0; @@ -2829,7 +2651,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) return len; } - static struct rdev_sysfs_entry rdev_slot = __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); @@ -2980,20 +2801,20 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) rdev->sectors = sectors; if (sectors > oldsectors && my_mddev->external) { - /* need to check that all other rdevs with the same ->bdev - * do not overlap. We need to unlock the mddev to avoid - * a deadlock. We have already changed rdev->sectors, and if - * we have to change it back, we will have the lock again. + /* Need to check that all other rdevs with the same + * ->bdev do not overlap. 'rcu' is sufficient to walk + * the rdev lists safely. + * This check does not provide a hard guarantee, it + * just helps avoid dangerous mistakes. */ struct mddev *mddev; int overlap = 0; struct list_head *tmp; - mddev_unlock(my_mddev); + rcu_read_lock(); for_each_mddev(mddev, tmp) { struct md_rdev *rdev2; - mddev_lock_nointr(mddev); rdev_for_each(rdev2, mddev) if (rdev->bdev == rdev2->bdev && rdev != rdev2 && @@ -3003,13 +2824,12 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) overlap = 1; break; } - mddev_unlock(mddev); if (overlap) { mddev_put(mddev); break; } } - mddev_lock_nointr(my_mddev); + rcu_read_unlock(); if (overlap) { /* Someone else could have slipped in a size * change here, but doing so is just silly. @@ -3027,7 +2847,6 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) static struct rdev_sysfs_entry rdev_size = __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); - static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) { unsigned long long recovery_start = rdev->recovery_offset; @@ -3063,7 +2882,6 @@ static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_ static struct rdev_sysfs_entry rdev_recovery_start = __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); - static ssize_t badblocks_show(struct badblocks *bb, char *page, int unack); static ssize_t @@ -3084,7 +2902,6 @@ static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) static struct rdev_sysfs_entry rdev_bad_blocks = __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); - static ssize_t ubb_show(struct md_rdev *rdev, char *page) { return badblocks_show(&rdev->badblocks, page, 1); @@ -3241,7 +3058,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; if (!size) { - printk(KERN_WARNING + printk(KERN_WARNING "md: %s has zero or unknown size, marking faulty!\n", bdevname(rdev->bdev,b)); err = -EINVAL; @@ -3260,7 +3077,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe goto abort_free; } if (err < 0) { - printk(KERN_WARNING + printk(KERN_WARNING "md: could not read %s's sb, not importing!\n", bdevname(rdev->bdev,b)); goto abort_free; @@ -3281,8 +3098,7 @@ abort_free: * Check a full RAID array for plausibility */ - -static void analyze_sbs(struct mddev * mddev) +static void analyze_sbs(struct mddev *mddev) { int i; struct md_rdev *rdev, *freshest, *tmp; @@ -3300,12 +3116,11 @@ static void analyze_sbs(struct mddev * mddev) default: printk( KERN_ERR \ "md: fatal superblock inconsistency in %s" - " -- removing from array\n", + " -- removing from array\n", bdevname(rdev->bdev,b)); kick_rdev_from_array(rdev); } - super_types[mddev->major_version]. validate_super(mddev, freshest); @@ -3344,7 +3159,7 @@ static void analyze_sbs(struct mddev * mddev) /* Read a fixed-point number. * Numbers in sysfs attributes should be in "standard" units where * possible, so time should be in seconds. - * However we internally use a a much smaller unit such as + * However we internally use a a much smaller unit such as * milliseconds or jiffies. * This function takes a decimal number with a possible fractional * component, and produces an integer which is the result of @@ -3381,7 +3196,6 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) return 0; } - static void md_safemode_timeout(unsigned long data); static ssize_t @@ -3524,7 +3338,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) /* Looks like we have a winner */ mddev_suspend(mddev); mddev->pers->stop(mddev); - + if (mddev->pers->sync_request == NULL && pers->sync_request != NULL) { /* need to add the md_redundancy_group */ @@ -3533,7 +3347,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) "md: cannot register extra attributes for %s\n", mdname(mddev)); mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); - } + } if (mddev->pers->sync_request != NULL && pers->sync_request == NULL) { /* need to remove the md_redundancy_group */ @@ -3611,7 +3425,6 @@ level_store(struct mddev *mddev, const char *buf, size_t len) static struct md_sysfs_entry md_level = __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); - static ssize_t layout_show(struct mddev *mddev, char *page) { @@ -3654,7 +3467,6 @@ layout_store(struct mddev *mddev, const char *buf, size_t len) static struct md_sysfs_entry md_layout = __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); - static ssize_t raid_disks_show(struct mddev *mddev, char *page) { @@ -3859,9 +3671,9 @@ array_state_show(struct mddev *mddev, char *page) return sprintf(page, "%s\n", array_states[st]); } -static int do_md_stop(struct mddev * mddev, int ro, struct block_device *bdev); -static int md_set_readonly(struct mddev * mddev, struct block_device *bdev); -static int do_md_run(struct mddev * mddev); +static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev); +static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); +static int do_md_run(struct mddev *mddev); static int restart_array(struct mddev *mddev); static ssize_t @@ -4012,7 +3824,6 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) minor != MINOR(dev)) return -EOVERFLOW; - if (mddev->persistent) { rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); @@ -4108,7 +3919,6 @@ size_store(struct mddev *mddev, const char *buf, size_t len) static struct md_sysfs_entry md_size = __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); - /* Metadata version. * This is one of * 'none' for arrays with no metadata (good luck...) @@ -4490,7 +4300,7 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) unsigned long long new = simple_strtoull(buf, &e, 10); unsigned long long old = mddev->suspend_lo; - if (mddev->pers == NULL || + if (mddev->pers == NULL || mddev->pers->quiesce == NULL) return -EINVAL; if (buf == e || (*e && *e != '\n')) @@ -4510,7 +4320,6 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) static struct md_sysfs_entry md_suspend_lo = __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); - static ssize_t suspend_hi_show(struct mddev *mddev, char *page) { @@ -4698,7 +4507,6 @@ static struct attribute_group md_redundancy_group = { .attrs = md_redundancy_attrs, }; - static ssize_t md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { @@ -5111,7 +4919,7 @@ int md_run(struct mddev *mddev) } else if (mddev->ro == 2) /* auto-readonly not meaningful */ mddev->ro = 0; - atomic_set(&mddev->writes_pending,0); + atomic_set(&mddev->writes_pending,0); atomic_set(&mddev->max_corr_read_errors, MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); mddev->safemode = 0; @@ -5125,9 +4933,9 @@ int md_run(struct mddev *mddev) if (rdev->raid_disk >= 0) if (sysfs_link_rdev(mddev, rdev)) /* failure here is OK */; - + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - + if (mddev->flags & MD_UPDATE_SB_FLAGS) md_update_sb(mddev, 0); @@ -5307,12 +5115,13 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) mddev_lock_nointr(mddev); mutex_lock(&mddev->open_mutex); - if (atomic_read(&mddev->openers) > !!bdev || + if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || mddev->sync_thread || (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { printk("md: %s still in use.\n",mdname(mddev)); if (did_freeze) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); } err = -EBUSY; @@ -5327,6 +5136,8 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) mddev->ro = 1; set_disk_ro(mddev->gendisk, 1); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); sysfs_notify_dirent_safe(mddev->sysfs_state); err = 0; } @@ -5339,7 +5150,7 @@ out: * 0 - completely stop and dis-assemble array * 2 - stop but do not disassemble array */ -static int do_md_stop(struct mddev * mddev, int mode, +static int do_md_stop(struct mddev *mddev, int mode, struct block_device *bdev) { struct gendisk *disk = mddev->gendisk; @@ -5362,7 +5173,7 @@ static int do_md_stop(struct mddev * mddev, int mode, mddev_lock_nointr(mddev); mutex_lock(&mddev->open_mutex); - if (atomic_read(&mddev->openers) > !!bdev || + if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || mddev->sysfs_active || mddev->sync_thread || (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { @@ -5370,6 +5181,7 @@ static int do_md_stop(struct mddev * mddev, int mode, mutex_unlock(&mddev->open_mutex); if (did_freeze) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); } return -EBUSY; @@ -5512,12 +5324,12 @@ static void autorun_devices(int part) "md: cannot allocate memory for md drive.\n"); break; } - if (mddev_lock(mddev)) + if (mddev_lock(mddev)) printk(KERN_WARNING "md: %s locked, cannot run\n", mdname(mddev)); else if (mddev->raid_disks || mddev->major_version || !list_empty(&mddev->disks)) { - printk(KERN_WARNING + printk(KERN_WARNING "md: %s already running, cannot run %s\n", mdname(mddev), bdevname(rdev0->bdev,b)); mddev_unlock(mddev); @@ -5545,7 +5357,7 @@ static void autorun_devices(int part) } #endif /* !MODULE */ -static int get_version(void __user * arg) +static int get_version(void __user *arg) { mdu_version_t ver; @@ -5559,7 +5371,7 @@ static int get_version(void __user * arg) return 0; } -static int get_array_info(struct mddev * mddev, void __user * arg) +static int get_array_info(struct mddev *mddev, void __user *arg) { mdu_array_info_t info; int nr,working,insync,failed,spare; @@ -5574,7 +5386,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg) else { working++; if (test_bit(In_sync, &rdev->flags)) - insync++; + insync++; else spare++; } @@ -5614,7 +5426,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg) return 0; } -static int get_bitmap_file(struct mddev * mddev, void __user * arg) +static int get_bitmap_file(struct mddev *mddev, void __user * arg) { mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ char *ptr, *buf = NULL; @@ -5652,7 +5464,7 @@ out: return err; } -static int get_disk_info(struct mddev * mddev, void __user * arg) +static int get_disk_info(struct mddev *mddev, void __user * arg) { mdu_disk_info_t info; struct md_rdev *rdev; @@ -5688,7 +5500,7 @@ static int get_disk_info(struct mddev * mddev, void __user * arg) return 0; } -static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) +static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) { char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; struct md_rdev *rdev; @@ -5702,7 +5514,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) /* expecting a device which has a superblock */ rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); if (IS_ERR(rdev)) { - printk(KERN_WARNING + printk(KERN_WARNING "md: md_import_device returned %ld\n", PTR_ERR(rdev)); return PTR_ERR(rdev); @@ -5714,9 +5526,9 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) err = super_types[mddev->major_version] .load_super(rdev, rdev0, mddev->minor_version); if (err < 0) { - printk(KERN_WARNING + printk(KERN_WARNING "md: %s has different UUID to %s\n", - bdevname(rdev->bdev,b), + bdevname(rdev->bdev,b), bdevname(rdev0->bdev,b2)); export_rdev(rdev); return -EINVAL; @@ -5736,7 +5548,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) if (mddev->pers) { int err; if (!mddev->pers->hot_add_disk) { - printk(KERN_WARNING + printk(KERN_WARNING "%s: personality does not support diskops!\n", mdname(mddev)); return -EINVAL; @@ -5747,7 +5559,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) else rdev = md_import_device(dev, -1, -1); if (IS_ERR(rdev)) { - printk(KERN_WARNING + printk(KERN_WARNING "md: md_import_device returned %ld\n", PTR_ERR(rdev)); return PTR_ERR(rdev); @@ -5821,7 +5633,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) int err; rdev = md_import_device(dev, -1, 0); if (IS_ERR(rdev)) { - printk(KERN_WARNING + printk(KERN_WARNING "md: error, md_import_device() returned %ld\n", PTR_ERR(rdev)); return PTR_ERR(rdev); @@ -5856,7 +5668,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) return 0; } -static int hot_remove_disk(struct mddev * mddev, dev_t dev) +static int hot_remove_disk(struct mddev *mddev, dev_t dev) { char b[BDEVNAME_SIZE]; struct md_rdev *rdev; @@ -5882,7 +5694,7 @@ busy: return -EBUSY; } -static int hot_add_disk(struct mddev * mddev, dev_t dev) +static int hot_add_disk(struct mddev *mddev, dev_t dev) { char b[BDEVNAME_SIZE]; int err; @@ -5898,7 +5710,7 @@ static int hot_add_disk(struct mddev * mddev, dev_t dev) return -EINVAL; } if (!mddev->pers->hot_add_disk) { - printk(KERN_WARNING + printk(KERN_WARNING "%s: personality does not support diskops!\n", mdname(mddev)); return -EINVAL; @@ -5906,7 +5718,7 @@ static int hot_add_disk(struct mddev * mddev, dev_t dev) rdev = md_import_device(dev, -1, 0); if (IS_ERR(rdev)) { - printk(KERN_WARNING + printk(KERN_WARNING "md: error, md_import_device() returned %ld\n", PTR_ERR(rdev)); return -EINVAL; @@ -5920,7 +5732,7 @@ static int hot_add_disk(struct mddev * mddev, dev_t dev) rdev->sectors = rdev->sb_start; if (test_bit(Faulty, &rdev->flags)) { - printk(KERN_WARNING + printk(KERN_WARNING "md: can not hot-add faulty %s disk to %s!\n", bdevname(rdev->bdev,b), mdname(mddev)); err = -EINVAL; @@ -5961,14 +5773,13 @@ static int set_bitmap_file(struct mddev *mddev, int fd) int err = 0; if (mddev->pers) { - if (!mddev->pers->quiesce) + if (!mddev->pers->quiesce || !mddev->thread) return -EBUSY; if (mddev->recovery || mddev->sync_thread) return -EBUSY; /* we should be able to change the bitmap.. */ } - if (fd >= 0) { struct inode *inode; if (mddev->bitmap) @@ -6039,7 +5850,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd) * The minor and patch _version numbers are also kept incase the * super_block handler wishes to interpret them. */ -static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) +static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) { if (info->raid_disks == 0) { @@ -6048,7 +5859,7 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) info->major_version >= ARRAY_SIZE(super_types) || super_types[info->major_version].name == NULL) { /* maybe try to auto-load a module? */ - printk(KERN_INFO + printk(KERN_INFO "md: superblock version %d not known\n", info->major_version); return -EINVAL; @@ -6196,7 +6007,6 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) return rv; } - /* * update_array_info is used to change the configuration of an * on-line array. @@ -6263,7 +6073,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = update_raid_disks(mddev, info->raid_disks); if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { - if (mddev->pers->quiesce == NULL) + if (mddev->pers->quiesce == NULL || mddev->thread == NULL) return -EINVAL; if (mddev->recovery || mddev->sync_thread) return -EBUSY; @@ -6347,7 +6157,6 @@ static inline bool md_ioctl_valid(unsigned int cmd) case GET_DISK_INFO: case HOT_ADD_DISK: case HOT_REMOVE_DISK: - case PRINT_RAID_DEBUG: case RAID_AUTORUN: case RAID_VERSION: case RESTART_ARRAY_RW: @@ -6391,18 +6200,13 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, switch (cmd) { case RAID_VERSION: err = get_version(argp); - goto done; - - case PRINT_RAID_DEBUG: - err = 0; - md_print_devices(); - goto done; + goto out; #ifndef MODULE case RAID_AUTORUN: err = 0; autostart_arrays(arg); - goto done; + goto out; #endif default:; } @@ -6415,7 +6219,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, if (!mddev) { BUG(); - goto abort; + goto out; } /* Some actions do not requires the mutex */ @@ -6425,18 +6229,18 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = -ENODEV; else err = get_array_info(mddev, argp); - goto abort; + goto out; case GET_DISK_INFO: if (!mddev->raid_disks && !mddev->external) err = -ENODEV; else err = get_disk_info(mddev, argp); - goto abort; + goto out; case SET_DISK_FAULTY: err = set_disk_faulty(mddev, new_decode_dev(arg)); - goto abort; + goto out; } if (cmd == ADD_NEW_DISK) @@ -6454,10 +6258,10 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, * and writes */ mutex_lock(&mddev->open_mutex); - if (atomic_read(&mddev->openers) > 1) { + if (mddev->pers && atomic_read(&mddev->openers) > 1) { mutex_unlock(&mddev->open_mutex); err = -EBUSY; - goto abort; + goto out; } set_bit(MD_STILL_CLOSED, &mddev->flags); mutex_unlock(&mddev->open_mutex); @@ -6465,10 +6269,10 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, } err = mddev_lock(mddev); if (err) { - printk(KERN_INFO + printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n", err, cmd); - goto abort; + goto out; } if (cmd == SET_ARRAY_INFO) { @@ -6477,38 +6281,38 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, memset(&info, 0, sizeof(info)); else if (copy_from_user(&info, argp, sizeof(info))) { err = -EFAULT; - goto abort_unlock; + goto unlock; } if (mddev->pers) { err = update_array_info(mddev, &info); if (err) { printk(KERN_WARNING "md: couldn't update" " array info. %d\n", err); - goto abort_unlock; + goto unlock; } - goto done_unlock; + goto unlock; } if (!list_empty(&mddev->disks)) { printk(KERN_WARNING "md: array %s already has disks!\n", mdname(mddev)); err = -EBUSY; - goto abort_unlock; + goto unlock; } if (mddev->raid_disks) { printk(KERN_WARNING "md: array %s already initialised!\n", mdname(mddev)); err = -EBUSY; - goto abort_unlock; + goto unlock; } err = set_array_info(mddev, &info); if (err) { printk(KERN_WARNING "md: couldn't set" " array info. %d\n", err); - goto abort_unlock; + goto unlock; } - goto done_unlock; + goto unlock; } /* @@ -6521,7 +6325,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE && cmd != GET_BITMAP_FILE) { err = -ENODEV; - goto abort_unlock; + goto unlock; } /* @@ -6530,23 +6334,23 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, switch (cmd) { case GET_BITMAP_FILE: err = get_bitmap_file(mddev, argp); - goto done_unlock; + goto unlock; case RESTART_ARRAY_RW: err = restart_array(mddev); - goto done_unlock; + goto unlock; case STOP_ARRAY: err = do_md_stop(mddev, 0, bdev); - goto done_unlock; + goto unlock; case STOP_ARRAY_RO: err = md_set_readonly(mddev, bdev); - goto done_unlock; + goto unlock; case HOT_REMOVE_DISK: err = hot_remove_disk(mddev, new_decode_dev(arg)); - goto done_unlock; + goto unlock; case ADD_NEW_DISK: /* We can support ADD_NEW_DISK on read-only arrays @@ -6562,14 +6366,14 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, break; else err = add_new_disk(mddev, &info); - goto done_unlock; + goto unlock; } break; case BLKROSET: if (get_user(ro, (int __user *)(arg))) { err = -EFAULT; - goto done_unlock; + goto unlock; } err = -EINVAL; @@ -6577,11 +6381,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, * does not matter, no writes are coming */ if (ro) - goto done_unlock; + goto unlock; /* are we are already prepared for writes? */ if (mddev->ro != 1) - goto done_unlock; + goto unlock; /* transitioning to readauto need only happen for * arrays that call md_write_start @@ -6593,17 +6397,14 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, set_disk_ro(mddev->gendisk, 0); } } - goto done_unlock; + goto unlock; } /* * The remaining ioctls are changing the state of the * superblock, so we do not allow them on read-only arrays. - * However non-MD ioctls (e.g. get-size) will still come through - * here and hit the 'default' below, so only disallow - * 'md' ioctls, and switch to rw mode if started auto-readonly. */ - if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { + if (mddev->ro && mddev->pers) { if (mddev->ro == 2) { mddev->ro = 0; sysfs_notify_dirent_safe(mddev->sysfs_state); @@ -6621,7 +6422,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, } } else { err = -EROFS; - goto abort_unlock; + goto unlock; } } @@ -6633,38 +6434,32 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = -EFAULT; else err = add_new_disk(mddev, &info); - goto done_unlock; + goto unlock; } case HOT_ADD_DISK: err = hot_add_disk(mddev, new_decode_dev(arg)); - goto done_unlock; + goto unlock; case RUN_ARRAY: err = do_md_run(mddev); - goto done_unlock; + goto unlock; case SET_BITMAP_FILE: err = set_bitmap_file(mddev, (int)arg); - goto done_unlock; + goto unlock; default: err = -EINVAL; - goto abort_unlock; + goto unlock; } -done_unlock: -abort_unlock: +unlock: if (mddev->hold_active == UNTIL_IOCTL && err != -EINVAL) mddev->hold_active = 0; mddev_unlock(mddev); - - return err; -done: - if (err) - MD_BUG(); -abort: +out: return err; } #ifdef CONFIG_COMPAT @@ -6726,7 +6521,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) static void md_release(struct gendisk *disk, fmode_t mode) { - struct mddev *mddev = disk->private_data; + struct mddev *mddev = disk->private_data; BUG_ON(!mddev); atomic_dec(&mddev->openers); @@ -6761,7 +6556,7 @@ static const struct block_device_operations md_fops = .revalidate_disk= md_revalidate, }; -static int md_thread(void * arg) +static int md_thread(void *arg) { struct md_thread *thread = arg; @@ -6810,6 +6605,7 @@ void md_wakeup_thread(struct md_thread *thread) wake_up(&thread->wqueue); } } +EXPORT_SYMBOL(md_wakeup_thread); struct md_thread *md_register_thread(void (*run) (struct md_thread *), struct mddev *mddev, const char *name) @@ -6835,6 +6631,7 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *), } return thread; } +EXPORT_SYMBOL(md_register_thread); void md_unregister_thread(struct md_thread **threadp) { @@ -6852,14 +6649,10 @@ void md_unregister_thread(struct md_thread **threadp) kthread_stop(thread->tsk); kfree(thread); } +EXPORT_SYMBOL(md_unregister_thread); void md_error(struct mddev *mddev, struct md_rdev *rdev) { - if (!mddev) { - MD_BUG(); - return; - } - if (!rdev || test_bit(Faulty, &rdev->flags)) return; @@ -6876,6 +6669,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev) queue_work(md_misc_wq, &mddev->event_work); md_new_event_inintr(mddev); } +EXPORT_SYMBOL(md_error); /* seq_file implementation /proc/mdstat */ @@ -6898,8 +6692,7 @@ static void status_unused(struct seq_file *seq) seq_printf(seq, "\n"); } - -static void status_resync(struct seq_file *seq, struct mddev * mddev) +static void status_resync(struct seq_file *seq, struct mddev *mddev) { sector_t max_sectors, resync, res; unsigned long dt, db; @@ -6919,13 +6712,7 @@ static void status_resync(struct seq_file *seq, struct mddev * mddev) else max_sectors = mddev->dev_sectors; - /* - * Should not happen. - */ - if (!max_sectors) { - MD_BUG(); - return; - } + WARN_ON(max_sectors == 0); /* Pick 'scale' such that (resync>>scale)*1000 will fit * in a sector_t, and (max_sectors>>scale) will fit in a * u32, as those are the requirements for sector_div. @@ -7021,7 +6808,7 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct list_head *tmp; struct mddev *next_mddev, *mddev = v; - + ++*pos; if (v == (void*)2) return NULL; @@ -7036,7 +6823,7 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) else { next_mddev = (void*)2; *pos = 0x10000; - } + } spin_unlock(&all_mddevs_lock); if (v != (void*)1) @@ -7132,7 +6919,7 @@ static int md_seq_show(struct seq_file *seq, void *v) if (mddev->pers) { mddev->pers->status(seq, mddev); - seq_printf(seq, "\n "); + seq_printf(seq, "\n "); if (mddev->pers->sync_request) { if (mddev->curr_resync > 2) { status_resync(seq, mddev); @@ -7150,7 +6937,7 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } mddev_unlock(mddev); - + return 0; } @@ -7204,12 +6991,14 @@ static const struct file_operations md_seq_fops = { int register_md_personality(struct md_personality *p) { + printk(KERN_INFO "md: %s personality registered for level %d\n", + p->name, p->level); spin_lock(&pers_lock); list_add_tail(&p->list, &pers_list); - printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); spin_unlock(&pers_lock); return 0; } +EXPORT_SYMBOL(register_md_personality); int unregister_md_personality(struct md_personality *p) { @@ -7219,10 +7008,11 @@ int unregister_md_personality(struct md_personality *p) spin_unlock(&pers_lock); return 0; } +EXPORT_SYMBOL(unregister_md_personality); static int is_mddev_idle(struct mddev *mddev, int init) { - struct md_rdev * rdev; + struct md_rdev *rdev; int idle; int curr_events; @@ -7276,7 +7066,7 @@ void md_done_sync(struct mddev *mddev, int blocks, int ok) // stop recovery, signal do_sync .... } } - +EXPORT_SYMBOL(md_done_sync); /* md_write_start(mddev, bi) * If we need to update some array metadata (e.g. 'active' flag @@ -7317,6 +7107,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi) wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_PENDING, &mddev->flags)); } +EXPORT_SYMBOL(md_write_start); void md_write_end(struct mddev *mddev) { @@ -7327,6 +7118,7 @@ void md_write_end(struct mddev *mddev) mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); } } +EXPORT_SYMBOL(md_write_end); /* md_allow_write(mddev) * Calling this ensures that the array is marked 'active' so that writes @@ -7376,7 +7168,7 @@ void md_do_sync(struct md_thread *thread) struct mddev *mddev2; unsigned int currspeed = 0, window; - sector_t max_sectors,j, io_sectors; + sector_t max_sectors,j, io_sectors, recovery_done; unsigned long mark[SYNC_MARKS]; unsigned long update_time; sector_t mark_cnt[SYNC_MARKS]; @@ -7652,7 +7444,8 @@ void md_do_sync(struct md_thread *thread) */ cond_resched(); - currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 + recovery_done = io_sectors - atomic_read(&mddev->recovery_active); + currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2 /((jiffies-mddev->resync_mark)/HZ +1) +1; if (currspeed > speed_min(mddev)) { @@ -7783,6 +7576,33 @@ no_add: return spares; } +static void md_start_sync(struct work_struct *ws) +{ + struct mddev *mddev = container_of(ws, struct mddev, del_work); + + mddev->sync_thread = md_register_thread(md_do_sync, + mddev, + "resync"); + if (!mddev->sync_thread) { + printk(KERN_ERR "%s: could not start resync" + " thread...\n", + mdname(mddev)); + /* leave the spares where they are, it shouldn't hurt */ + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + if (test_and_clear_bit(MD_RECOVERY_RECOVER, + &mddev->recovery)) + if (mddev->sysfs_action) + sysfs_notify_dirent_safe(mddev->sysfs_action); + } else + md_wakeup_thread(mddev->sync_thread); + sysfs_notify_dirent_safe(mddev->sysfs_action); + md_new_event(mddev); +} + /* * This routine is regularly called by all per-raid-array threads to * deal with generic issues like resync and super-block update. @@ -7899,7 +7719,7 @@ void md_check_recovery(struct mddev *mddev) if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) - goto unlock; + goto not_running; /* no recovery is running. * remove any failed drives, then * add spares if possible. @@ -7911,7 +7731,7 @@ void md_check_recovery(struct mddev *mddev) if (mddev->pers->check_reshape == NULL || mddev->pers->check_reshape(mddev) != 0) /* Cannot proceed */ - goto unlock; + goto not_running; set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); } else if ((spares = remove_and_add_spares(mddev, NULL))) { @@ -7924,7 +7744,7 @@ void md_check_recovery(struct mddev *mddev) clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) /* nothing to be done ... */ - goto unlock; + goto not_running; if (mddev->pers->sync_request) { if (spares) { @@ -7934,27 +7754,11 @@ void md_check_recovery(struct mddev *mddev) */ bitmap_write_all(mddev->bitmap); } - mddev->sync_thread = md_register_thread(md_do_sync, - mddev, - "resync"); - if (!mddev->sync_thread) { - printk(KERN_ERR "%s: could not start resync" - " thread...\n", - mdname(mddev)); - /* leave the spares where they are, it shouldn't hurt */ - clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); - clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); - clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); - } else - md_wakeup_thread(mddev->sync_thread); - sysfs_notify_dirent_safe(mddev->sysfs_action); - md_new_event(mddev); + INIT_WORK(&mddev->del_work, md_start_sync); + queue_work(md_misc_wq, &mddev->del_work); + goto unlock; } - unlock: - wake_up(&mddev->sb_wait); - + not_running: if (!mddev->sync_thread) { clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); if (test_and_clear_bit(MD_RECOVERY_RECOVER, @@ -7962,9 +7766,12 @@ void md_check_recovery(struct mddev *mddev) if (mddev->sysfs_action) sysfs_notify_dirent_safe(mddev->sysfs_action); } + unlock: + wake_up(&mddev->sb_wait); mddev_unlock(mddev); } } +EXPORT_SYMBOL(md_check_recovery); void md_reap_sync_thread(struct mddev *mddev) { @@ -8007,6 +7814,7 @@ void md_reap_sync_thread(struct mddev *mddev) if (mddev->event_work.func) queue_work(md_misc_wq, &mddev->event_work); } +EXPORT_SYMBOL(md_reap_sync_thread); void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) { @@ -8592,7 +8400,7 @@ static int __init md_init(void) goto err_mdp; mdp_major = ret; - blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, + blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE, md_probe, NULL, NULL); blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, md_probe, NULL, NULL); @@ -8640,7 +8448,6 @@ void md_autodetect_dev(dev_t dev) } } - static void autostart_arrays(int part) { struct md_rdev *rdev; @@ -8664,10 +8471,9 @@ static void autostart_arrays(int part) if (IS_ERR(rdev)) continue; - if (test_bit(Faulty, &rdev->flags)) { - MD_BUG(); + if (test_bit(Faulty, &rdev->flags)) continue; - } + set_bit(AutoDetected, &rdev->flags); list_add(&rdev->same_set, &pending_raid_disks); i_passed++; @@ -8687,7 +8493,7 @@ static __exit void md_exit(void) struct list_head *tmp; int delay = 1; - blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS); + blk_unregister_region(MKDEV(MD_MAJOR,0), 512); blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); unregister_blkdev(MD_MAJOR,"md"); @@ -8735,20 +8541,8 @@ static int set_ro(const char *val, struct kernel_param *kp) module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); - module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); -EXPORT_SYMBOL(register_md_personality); -EXPORT_SYMBOL(unregister_md_personality); -EXPORT_SYMBOL(md_error); -EXPORT_SYMBOL(md_done_sync); -EXPORT_SYMBOL(md_write_start); -EXPORT_SYMBOL(md_write_end); -EXPORT_SYMBOL(md_register_thread); -EXPORT_SYMBOL(md_unregister_thread); -EXPORT_SYMBOL(md_wakeup_thread); -EXPORT_SYMBOL(md_check_recovery); -EXPORT_SYMBOL(md_reap_sync_thread); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MD RAID framework"); MODULE_ALIAS("md"); diff --git a/drivers/md/md.h b/drivers/md/md.h index a49d991f3fe1..03cec5bdcaae 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -1,15 +1,15 @@ /* md.h : kernel internal structure of the Linux MD driver Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman - + This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. - + You should have received a copy of the GNU General Public License (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef _MD_MD_H @@ -56,7 +56,7 @@ struct md_rdev { __u64 sb_events; sector_t data_offset; /* start of data in array */ sector_t new_data_offset;/* only relevant while reshaping */ - sector_t sb_start; /* offset of the super block (in 512byte sectors) */ + sector_t sb_start; /* offset of the super block (in 512byte sectors) */ int sb_size; /* bytes in the superblock */ int preferred_minor; /* autorun support */ @@ -239,7 +239,7 @@ struct mddev { minor_version, patch_version; int persistent; - int external; /* metadata is + int external; /* metadata is * managed externally */ char metadata_type[17]; /* externally set*/ int chunk_sectors; @@ -248,7 +248,7 @@ struct mddev { char clevel[16]; int raid_disks; int max_disks; - sector_t dev_sectors; /* used size of + sector_t dev_sectors; /* used size of * component devices */ sector_t array_sectors; /* exported array size */ int external_size; /* size managed @@ -312,7 +312,7 @@ struct mddev { int parallel_resync; int ok_start_degraded; - /* recovery/resync flags + /* recovery/resync flags * NEEDED: we might need to start a resync/recover * RUNNING: a thread is running, or about to be started * SYNC: actually doing a resync, not a recovery @@ -392,20 +392,20 @@ struct mddev { unsigned int safemode; /* if set, update "clean" superblock * when no writes pending. - */ + */ unsigned int safemode_delay; struct timer_list safemode_timer; - atomic_t writes_pending; + atomic_t writes_pending; struct request_queue *queue; /* for plugging ... */ - struct bitmap *bitmap; /* the bitmap for the device */ + struct bitmap *bitmap; /* the bitmap for the device */ struct { struct file *file; /* the bitmap file */ loff_t offset; /* offset from superblock of * start of bitmap. May be * negative, but not '0' * For external metadata, offset - * from start of device. + * from start of device. */ unsigned long space; /* space available at this offset */ loff_t default_offset; /* this is the offset to use when @@ -421,7 +421,7 @@ struct mddev { int external; } bitmap_info; - atomic_t max_corr_read_errors; /* max read retries */ + atomic_t max_corr_read_errors; /* max read retries */ struct list_head all_mddevs; struct attribute_group *to_remove; @@ -439,7 +439,6 @@ struct mddev { void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); }; - static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) { int faulty = test_bit(Faulty, &rdev->flags); @@ -449,7 +448,7 @@ static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) { - atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); + atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); } struct md_personality @@ -463,7 +462,7 @@ struct md_personality int (*stop)(struct mddev *mddev); void (*status)(struct seq_file *seq, struct mddev *mddev); /* error_handler must set ->faulty and clear ->in_sync - * if appropriate, and should abort recovery if needed + * if appropriate, and should abort recovery if needed */ void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev); int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); @@ -493,7 +492,6 @@ struct md_personality void *(*takeover) (struct mddev *mddev); }; - struct md_sysfs_entry { struct attribute attr; ssize_t (*show)(struct mddev *, char *); @@ -560,7 +558,7 @@ struct md_thread { void (*run) (struct md_thread *thread); struct mddev *mddev; wait_queue_head_t wqueue; - unsigned long flags; + unsigned long flags; struct task_struct *tsk; unsigned long timeout; void *private; @@ -594,7 +592,7 @@ extern void md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, sector_t sector, int size, struct page *page); extern void md_super_wait(struct mddev *mddev); -extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, +extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int rw, bool metadata_op); extern void md_do_sync(struct md_thread *thread); extern void md_new_event(struct mddev *mddev); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 849ad39f547b..399272f9c042 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -31,13 +31,12 @@ #define NR_RESERVED_BUFS 32 - static int multipath_map (struct mpconf *conf) { int i, disks = conf->raid_disks; /* - * Later we do read balancing on the read side + * Later we do read balancing on the read side * now we use the first available disk. */ @@ -68,7 +67,6 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh) md_wakeup_thread(mddev->thread); } - /* * multipath_end_bh_io() is called when we have finished servicing a multipathed * operation and are ready to return a success/failure code to the buffer @@ -98,8 +96,8 @@ static void multipath_end_request(struct bio *bio, int error) */ char b[BDEVNAME_SIZE]; md_error (mp_bh->mddev, rdev); - printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", - bdevname(rdev->bdev,b), + printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", + bdevname(rdev->bdev,b), (unsigned long long)bio->bi_iter.bi_sector); multipath_reschedule_retry(mp_bh); } else @@ -145,12 +143,12 @@ static void multipath_status (struct seq_file *seq, struct mddev *mddev) { struct mpconf *conf = mddev->private; int i; - + seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); for (i = 0; i < conf->raid_disks; i++) seq_printf (seq, "%s", - conf->multipaths[i].rdev && + conf->multipaths[i].rdev && test_bit(In_sync, &conf->multipaths[i].rdev->flags) ? "U" : "_"); seq_printf (seq, "]"); } @@ -195,7 +193,7 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev) * first check if this is a queued request for a device * which has just failed. */ - printk(KERN_ALERT + printk(KERN_ALERT "multipath: only one IO path left and IO error.\n"); /* leave it active... it's all we have */ return; @@ -242,7 +240,6 @@ static void print_multipath_conf (struct mpconf *conf) } } - static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct mpconf *conf = mddev->private; @@ -325,8 +322,6 @@ abort: return err; } - - /* * This is a kernel thread which: * @@ -356,7 +351,7 @@ static void multipathd(struct md_thread *thread) bio = &mp_bh->bio; bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector; - + if ((mp_bh->path = multipath_map (conf))<0) { printk(KERN_ALERT "multipath: %s: unrecoverable IO read" " error for block %llu\n", @@ -414,7 +409,7 @@ static int multipath_run (struct mddev *mddev) conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); mddev->private = conf; if (!conf) { - printk(KERN_ERR + printk(KERN_ERR "multipath: couldn't allocate memory for %s\n", mdname(mddev)); goto out; @@ -423,7 +418,7 @@ static int multipath_run (struct mddev *mddev) conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks, GFP_KERNEL); if (!conf->multipaths) { - printk(KERN_ERR + printk(KERN_ERR "multipath: couldn't allocate memory for %s\n", mdname(mddev)); goto out_free_conf; @@ -469,7 +464,7 @@ static int multipath_run (struct mddev *mddev) conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, sizeof(struct multipath_bh)); if (conf->pool == NULL) { - printk(KERN_ERR + printk(KERN_ERR "multipath: couldn't allocate memory for %s\n", mdname(mddev)); goto out_free_conf; @@ -485,7 +480,7 @@ static int multipath_run (struct mddev *mddev) } } - printk(KERN_INFO + printk(KERN_INFO "multipath: array %s active with %d out of %d IO paths\n", mdname(mddev), conf->raid_disks - mddev->degraded, mddev->raid_disks); @@ -512,7 +507,6 @@ out: return -EIO; } - static int multipath_stop (struct mddev *mddev) { struct mpconf *conf = mddev->private; diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index 37d367bb9aa8..bf2b80d5c470 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -42,6 +42,12 @@ struct btree_node { } __packed; +/* + * Locks a block using the btree node validator. + */ +int bn_read_lock(struct dm_btree_info *info, dm_block_t b, + struct dm_block **result); + void inc_children(struct dm_transaction_manager *tm, struct btree_node *n, struct dm_btree_value_type *vt); diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c index cf9fd676ae44..1b5e13ec7f96 100644 --- a/drivers/md/persistent-data/dm-btree-spine.c +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -92,7 +92,7 @@ struct dm_block_validator btree_node_validator = { /*----------------------------------------------------------------*/ -static int bn_read_lock(struct dm_btree_info *info, dm_block_t b, +int bn_read_lock(struct dm_btree_info *info, dm_block_t b, struct dm_block **result) { return dm_tm_read_lock(info->tm, b, &btree_node_validator, result); diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 416060c25709..200ac12a1d40 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -847,22 +847,26 @@ EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key); * FIXME: We shouldn't use a recursive algorithm when we have limited stack * space. Also this only works for single level trees. */ -static int walk_node(struct ro_spine *s, dm_block_t block, +static int walk_node(struct dm_btree_info *info, dm_block_t block, int (*fn)(void *context, uint64_t *keys, void *leaf), void *context) { int r; unsigned i, nr; + struct dm_block *node; struct btree_node *n; uint64_t keys; - r = ro_step(s, block); - n = ro_node(s); + r = bn_read_lock(info, block, &node); + if (r) + return r; + + n = dm_block_data(node); nr = le32_to_cpu(n->header.nr_entries); for (i = 0; i < nr; i++) { if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) { - r = walk_node(s, value64(n, i), fn, context); + r = walk_node(info, value64(n, i), fn, context); if (r) goto out; } else { @@ -874,7 +878,7 @@ static int walk_node(struct ro_spine *s, dm_block_t block, } out: - ro_pop(s); + dm_tm_unlock(info->tm, node); return r; } @@ -882,15 +886,7 @@ int dm_btree_walk(struct dm_btree_info *info, dm_block_t root, int (*fn)(void *context, uint64_t *keys, void *leaf), void *context) { - int r; - struct ro_spine spine; - BUG_ON(info->levels > 1); - - init_ro_spine(&spine, info); - r = walk_node(&spine, root, fn, context); - exit_ro_spine(&spine); - - return r; + return walk_node(info, root, fn, context); } EXPORT_SYMBOL_GPL(dm_btree_walk); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 407a99e46f69..ba6b85de96d2 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -1,10 +1,9 @@ /* raid0.c : Multiple Devices driver for Linux - Copyright (C) 1994-96 Marc ZYNGIER + Copyright (C) 1994-96 Marc ZYNGIER <zyngier@ufr-info-p7.ibp.fr> or <maz@gloups.fdn.fr> - Copyright (C) 1999, 2000 Ingo Molnar, Red Hat - + Copyright (C) 1999, 2000 Ingo Molnar, Red Hat RAID-0 management functions. @@ -12,10 +11,10 @@ it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. - + You should have received a copy of the GNU General Public License (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include <linux/blkdev.h> @@ -685,6 +684,12 @@ static void *raid0_takeover(struct mddev *mddev) * raid10 - assuming we have all necessary active disks * raid1 - with (N -1) mirror drives faulty */ + + if (mddev->bitmap) { + printk(KERN_ERR "md/raid0: %s: cannot takeover array with bitmap\n", + mdname(mddev)); + return ERR_PTR(-EBUSY); + } if (mddev->level == 4) return raid0_takeover_raid45(mddev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 56e24c072b62..40b35be34f8d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -494,7 +494,6 @@ static void raid1_end_write_request(struct bio *bio, int error) bio_put(to_put); } - /* * This routine returns the disk from which the requested read should * be done. There is a per-array 'next expected sequential IO' sector @@ -540,11 +539,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect has_nonrot_disk = 0; choose_next_idle = 0; - if (conf->mddev->recovery_cp < MaxSector && - (this_sector + sectors >= conf->next_resync)) - choose_first = 1; - else - choose_first = 0; + choose_first = (conf->mddev->recovery_cp < this_sector + sectors); for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { sector_t dist; @@ -831,7 +826,7 @@ static void flush_pending_writes(struct r1conf *conf) * there is no normal IO happeing. It must arrange to call * lower_barrier when the particular background IO completes. */ -static void raise_barrier(struct r1conf *conf) +static void raise_barrier(struct r1conf *conf, sector_t sector_nr) { spin_lock_irq(&conf->resync_lock); @@ -841,6 +836,7 @@ static void raise_barrier(struct r1conf *conf) /* block any new IO from starting */ conf->barrier++; + conf->next_resync = sector_nr; /* For these conditions we must wait: * A: while the array is in frozen state @@ -849,14 +845,17 @@ static void raise_barrier(struct r1conf *conf) * C: next_resync + RESYNC_SECTORS > start_next_window, meaning * next resync will reach to the window which normal bios are * handling. + * D: while there are any active requests in the current window. */ wait_event_lock_irq(conf->wait_barrier, !conf->array_frozen && conf->barrier < RESYNC_DEPTH && + conf->current_window_requests == 0 && (conf->start_next_window >= conf->next_resync + RESYNC_SECTORS), conf->resync_lock); + conf->nr_pending++; spin_unlock_irq(&conf->resync_lock); } @@ -866,6 +865,7 @@ static void lower_barrier(struct r1conf *conf) BUG_ON(conf->barrier <= 0); spin_lock_irqsave(&conf->resync_lock, flags); conf->barrier--; + conf->nr_pending--; spin_unlock_irqrestore(&conf->resync_lock, flags); wake_up(&conf->wait_barrier); } @@ -877,12 +877,10 @@ static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) if (conf->array_frozen || !bio) wait = true; else if (conf->barrier && bio_data_dir(bio) == WRITE) { - if (conf->next_resync < RESYNC_WINDOW_SECTORS) - wait = true; - else if ((conf->next_resync - RESYNC_WINDOW_SECTORS - >= bio_end_sector(bio)) || - (conf->next_resync + NEXT_NORMALIO_DISTANCE - <= bio->bi_iter.bi_sector)) + if ((conf->mddev->curr_resync_completed + >= bio_end_sector(bio)) || + (conf->next_resync + NEXT_NORMALIO_DISTANCE + <= bio->bi_iter.bi_sector)) wait = false; else wait = true; @@ -902,25 +900,25 @@ static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) * However if there are already pending * requests (preventing the barrier from * rising completely), and the - * pre-process bio queue isn't empty, + * per-process bio queue isn't empty, * then don't wait, as we need to empty - * that queue to get the nr_pending - * count down. + * that queue to allow conf->start_next_window + * to increase. */ wait_event_lock_irq(conf->wait_barrier, !conf->array_frozen && (!conf->barrier || - ((conf->start_next_window < - conf->next_resync + RESYNC_SECTORS) && - current->bio_list && - !bio_list_empty(current->bio_list))), + ((conf->start_next_window < + conf->next_resync + RESYNC_SECTORS) && + current->bio_list && + !bio_list_empty(current->bio_list))), conf->resync_lock); conf->nr_waiting--; } if (bio && bio_data_dir(bio) == WRITE) { - if (conf->next_resync + NEXT_NORMALIO_DISTANCE - <= bio->bi_iter.bi_sector) { + if (bio->bi_iter.bi_sector >= + conf->mddev->curr_resync_completed) { if (conf->start_next_window == MaxSector) conf->start_next_window = conf->next_resync + @@ -1002,8 +1000,7 @@ static void unfreeze_array(struct r1conf *conf) spin_unlock_irq(&conf->resync_lock); } - -/* duplicate the data pages for behind I/O +/* duplicate the data pages for behind I/O */ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio) { @@ -1186,6 +1183,7 @@ read_again: atomic_read(&bitmap->behind_writes) == 0); } r1_bio->read_disk = rdisk; + r1_bio->start_next_window = 0; read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, @@ -1471,7 +1469,6 @@ static void status(struct seq_file *seq, struct mddev *mddev) seq_printf(seq, "]"); } - static void error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; @@ -1501,12 +1498,12 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) mddev->degraded++; set_bit(Faulty, &rdev->flags); spin_unlock_irqrestore(&conf->device_lock, flags); - /* - * if recovery is running, make sure it aborts. - */ - set_bit(MD_RECOVERY_INTR, &mddev->recovery); } else set_bit(Faulty, &rdev->flags); + /* + * if recovery is running, make sure it aborts. + */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_CHANGE_DEVS, &mddev->flags); printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" @@ -1548,8 +1545,13 @@ static void close_sync(struct r1conf *conf) mempool_destroy(conf->r1buf_pool); conf->r1buf_pool = NULL; + spin_lock_irq(&conf->resync_lock); conf->next_resync = 0; conf->start_next_window = MaxSector; + conf->current_window_requests += + conf->next_window_requests; + conf->next_window_requests = 0; + spin_unlock_irq(&conf->resync_lock); } static int raid1_spare_active(struct mddev *mddev) @@ -1560,7 +1562,7 @@ static int raid1_spare_active(struct mddev *mddev) unsigned long flags; /* - * Find all failed disks within the RAID1 configuration + * Find all failed disks within the RAID1 configuration * and mark them readable. * Called under mddev lock, so rcu protection not needed. */ @@ -1601,7 +1603,6 @@ static int raid1_spare_active(struct mddev *mddev) return count; } - static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct r1conf *conf = mddev->private; @@ -1730,7 +1731,6 @@ abort: return err; } - static void end_sync_read(struct bio *bio, int error) { struct r1bio *r1_bio = bio->bi_private; @@ -1942,7 +1942,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) return 1; } -static int process_checks(struct r1bio *r1_bio) +static void process_checks(struct r1bio *r1_bio) { /* We have read all readable devices. If we haven't * got the block, then there is no hope left. @@ -2034,7 +2034,6 @@ static int process_checks(struct r1bio *r1_bio) bio_copy_data(sbio, pbio); } - return 0; } static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) @@ -2052,8 +2051,8 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) return; if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) - if (process_checks(r1_bio) < 0) - return; + process_checks(r1_bio); + /* * schedule writes */ @@ -2150,7 +2149,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, d--; rdev = conf->mirrors[d].rdev; if (rdev && - test_bit(In_sync, &rdev->flags)) + !test_bit(Faulty, &rdev->flags)) r1_sync_page_io(rdev, sect, s, conf->tmppage, WRITE); } @@ -2162,7 +2161,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk, d--; rdev = conf->mirrors[d].rdev; if (rdev && - test_bit(In_sync, &rdev->flags)) { + !test_bit(Faulty, &rdev->flags)) { if (r1_sync_page_io(rdev, sect, s, conf->tmppage, READ)) { atomic_add(s, &rdev->corrected_errors); @@ -2453,7 +2452,6 @@ static void raid1d(struct md_thread *thread) blk_finish_plug(&plug); } - static int init_resync(struct r1conf *conf) { int buffs; @@ -2541,9 +2539,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp bitmap_cond_end_sync(mddev->bitmap, sector_nr); r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); - raise_barrier(conf); - conf->next_resync = sector_nr; + raise_barrier(conf, sector_nr); rcu_read_lock(); /* @@ -2718,7 +2715,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp /* remove last page from this bio */ bio->bi_vcnt--; bio->bi_iter.bi_size -= len; - bio->bi_flags &= ~(1<< BIO_SEG_VALID); + __clear_bit(BIO_SEG_VALID, &bio->bi_flags); } goto bio_full; } @@ -2943,9 +2940,9 @@ static int run(struct mddev *mddev) printk(KERN_NOTICE "md/raid1:%s: not clean" " -- starting background reconstruction\n", mdname(mddev)); - printk(KERN_INFO + printk(KERN_INFO "md/raid1:%s: active with %d out of %d mirrors\n", - mdname(mddev), mddev->raid_disks - mddev->degraded, + mdname(mddev), mddev->raid_disks - mddev->degraded, mddev->raid_disks); /* diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 9bebca7bff2f..33bda55ef9f7 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -90,7 +90,6 @@ struct r1conf { */ int recovery_disabled; - /* poolinfo contains information about the content of the * mempools - it changes when the array grows or shrinks */ @@ -103,7 +102,6 @@ struct r1conf { */ struct page *tmppage; - /* When taking over an array from a different personality, we store * the new thread here until we fully activate the array. */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index cb882aae9e20..32e282f4c83c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -366,7 +366,6 @@ static void raid10_end_read_request(struct bio *bio, int error) struct md_rdev *rdev; struct r10conf *conf = r10_bio->mddev->private; - slot = r10_bio->read_slot; dev = r10_bio->devs[slot].devnum; rdev = r10_bio->devs[slot].rdev; @@ -1559,7 +1558,6 @@ static void make_request(struct mddev *mddev, struct bio *bio) md_write_start(mddev, bio); - do { /* @@ -1684,13 +1682,12 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) spin_unlock_irqrestore(&conf->device_lock, flags); return; } - if (test_and_clear_bit(In_sync, &rdev->flags)) { + if (test_and_clear_bit(In_sync, &rdev->flags)) mddev->degraded++; - /* - * if recovery is running, make sure it aborts. - */ - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - } + /* + * If recovery is running, make sure it aborts. + */ + set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -1783,7 +1780,6 @@ static int raid10_spare_active(struct mddev *mddev) return count; } - static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct r10conf *conf = mddev->private; @@ -1930,7 +1926,6 @@ abort: return err; } - static void end_sync_read(struct bio *bio, int error) { struct r10bio *r10_bio = bio->bi_private; @@ -2296,7 +2291,6 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) } } - /* * Used by fix_read_error() to decay the per rdev read_errors. * We halve the read error count for every hour that has elapsed @@ -2853,7 +2847,6 @@ static void raid10d(struct md_thread *thread) blk_finish_plug(&plug); } - static int init_resync(struct r10conf *conf) { int buffs; @@ -2954,6 +2947,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, */ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { end_reshape(conf); + close_sync(conf); return 0; } @@ -3082,6 +3076,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, } r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + r10_bio->state = 0; raise_barrier(conf, rb2 != NULL); atomic_set(&r10_bio->remaining, 0); @@ -3270,6 +3265,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, if (sync_blocks < max_sync) max_sync = sync_blocks; r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + r10_bio->state = 0; r10_bio->mddev = mddev; atomic_set(&r10_bio->remaining, 0); @@ -3386,7 +3382,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, /* remove last page from this bio */ bio2->bi_vcnt--; bio2->bi_iter.bi_size -= len; - bio2->bi_flags &= ~(1<< BIO_SEG_VALID); + __clear_bit(BIO_SEG_VALID, &bio2->bi_flags); } goto bio_full; } @@ -3774,7 +3770,6 @@ static int run(struct mddev *mddev) blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); } - if (md_integrity_register(mddev)) goto out_free_conf; @@ -3832,6 +3827,8 @@ static int stop(struct mddev *mddev) mempool_destroy(conf->r10bio_pool); safe_put_page(conf->tmppage); kfree(conf->mirrors); + kfree(conf->mirrors_old); + kfree(conf->mirrors_new); kfree(conf); mddev->private = NULL; return 0; @@ -4119,7 +4116,7 @@ static int raid10_start_reshape(struct mddev *mddev) memcpy(conf->mirrors_new, conf->mirrors, sizeof(struct raid10_info)*conf->prev.raid_disks); smp_mb(); - kfree(conf->mirrors_old); /* FIXME and elsewhere */ + kfree(conf->mirrors_old); conf->mirrors_old = conf->mirrors; conf->mirrors = conf->mirrors_new; conf->mirrors_new = NULL; @@ -4385,6 +4382,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, read_more: /* Now schedule reads for blocks from sector_nr to last */ r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); + r10_bio->state = 0; raise_barrier(conf, sectors_done != 0); atomic_set(&r10_bio->remaining, 0); r10_bio->mddev = mddev; @@ -4399,6 +4397,7 @@ read_more: * on all the target devices. */ // FIXME + mempool_free(r10_bio, conf->r10buf_pool); set_bit(MD_RECOVERY_INTR, &mddev->recovery); return sectors_done; } @@ -4411,8 +4410,8 @@ read_more: read_bio->bi_private = r10_bio; read_bio->bi_end_io = end_sync_read; read_bio->bi_rw = READ; - read_bio->bi_flags &= ~(BIO_POOL_MASK - 1); - read_bio->bi_flags |= 1 << BIO_UPTODATE; + read_bio->bi_flags &= (~0UL << BIO_RESET_BITS); + __set_bit(BIO_UPTODATE, &read_bio->bi_flags); read_bio->bi_vcnt = 0; read_bio->bi_iter.bi_size = 0; r10_bio->master_bio = read_bio; @@ -4469,7 +4468,7 @@ read_more: /* Remove last page from this bio */ bio2->bi_vcnt--; bio2->bi_iter.bi_size -= len; - bio2->bi_flags &= ~(1<<BIO_SEG_VALID); + __clear_bit(BIO_SEG_VALID, &bio2->bi_flags); } goto bio_full; } @@ -4571,7 +4570,6 @@ static void end_reshape(struct r10conf *conf) conf->fullsync = 0; } - static int handle_reshape_read_error(struct mddev *mddev, struct r10bio *r10_bio) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6234b2e84587..9c66e5997fc8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -64,6 +64,10 @@ #define cpu_to_group(cpu) cpu_to_node(cpu) #define ANY_GROUP NUMA_NO_NODE +static bool devices_handle_discard_safely = false; +module_param(devices_handle_discard_safely, bool, 0644); +MODULE_PARM_DESC(devices_handle_discard_safely, + "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions"); static struct workqueue_struct *raid5_wq; /* * Stripe cache @@ -459,7 +463,6 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) hlist_add_head(&sh->hash, hp); } - /* find an idle stripe, make sure it is unhashed, and return it. */ static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) { @@ -527,9 +530,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) BUG_ON(stripe_operations_active(sh)); pr_debug("init_stripe called, stripe %llu\n", - (unsigned long long)sh->sector); - - remove_hash(sh); + (unsigned long long)sector); retry: seq = read_seqcount_begin(&conf->gen_lock); sh->generation = conf->generation - previous; @@ -538,7 +539,6 @@ retry: stripe_set_idx(sector, conf, previous, sh); sh->state = 0; - for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -1346,7 +1346,6 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) } } - static void ops_complete_prexor(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; @@ -2415,7 +2414,6 @@ static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, return new_sector; } - static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) { struct r5conf *conf = sh->raid_conf; @@ -2433,7 +2431,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) sector_t r_sector; struct stripe_head sh2; - chunk_offset = sector_div(new_sector, sectors_per_chunk); stripe = new_sector; @@ -2537,7 +2534,6 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) return r_sector; } - static void schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, int rcw, int expand) @@ -2922,7 +2918,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) && !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || (sh->raid_conf->level == 6 && s->failed && s->to_write && - s->to_write < sh->raid_conf->raid_disks - 2 && + s->to_write - s->non_overwrite < sh->raid_conf->raid_disks - 2 && (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) { /* we would like to get this block, possibly by computing it, * otherwise read it if the backing disk is insync @@ -3009,7 +3005,6 @@ static void handle_stripe_fill(struct stripe_head *sh, set_bit(STRIPE_HANDLE, &sh->state); } - /* handle_stripe_clean_event * any written block on an uptodate or failed drive can be returned. * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but @@ -3300,7 +3295,6 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, } } - static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) @@ -3817,6 +3811,8 @@ static void handle_stripe(struct stripe_head *sh) set_bit(R5_Wantwrite, &dev->flags); if (prexor) continue; + if (s.failed > 1) + continue; if (!test_bit(R5_Insync, &dev->flags) || ((i == sh->pd_idx || i == sh->qd_idx) && s.failed == 0)) @@ -3933,7 +3929,6 @@ static void handle_stripe(struct stripe_head *sh) } } - /* Finish reconstruct operations initiated by the expansion process */ if (sh->reconstruct_state == reconstruct_state_result) { struct stripe_head *sh_src @@ -4131,7 +4126,6 @@ static int raid5_mergeable_bvec(struct request_queue *q, return max; } - static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) { sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev); @@ -4161,7 +4155,6 @@ static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) md_wakeup_thread(conf->mddev->thread); } - static struct bio *remove_bio_from_retry(struct r5conf *conf) { struct bio *bi; @@ -4185,7 +4178,6 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf) return bi; } - /* * The "raid5_align_endio" should check if the read succeeded and if it * did, call bio_endio on the original bio (having bio_put the new bio @@ -4218,7 +4210,6 @@ static void raid5_align_endio(struct bio *bi, int error) return; } - pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); add_bio_to_retry(raid_bi, conf); @@ -4243,7 +4234,6 @@ static int bio_fits_rdev(struct bio *bi) return 1; } - static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) { struct r5conf *conf = mddev->private; @@ -4295,7 +4285,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) rcu_read_unlock(); raid_bio->bi_next = (void*)rdev; align_bi->bi_bdev = rdev->bdev; - align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); + __clear_bit(BIO_SEG_VALID, &align_bi->bi_flags); if (!bio_fits_rdev(align_bi) || is_badblock(rdev, align_bi->bi_iter.bi_sector, @@ -5440,7 +5430,6 @@ raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, raid5_show_skip_copy, raid5_store_skip_copy); - static ssize_t stripe_cache_active_show(struct mddev *mddev, char *page) { @@ -5892,7 +5881,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) return ERR_PTR(-ENOMEM); } - static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded) { switch (algo) { @@ -5905,7 +5893,7 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded return 1; break; case ALGORITHM_PARITY_0_6: - if (raid_disk == 0 || + if (raid_disk == 0 || raid_disk == raid_disks - 1) return 1; break; @@ -6159,7 +6147,6 @@ static int run(struct mddev *mddev) "reshape"); } - /* Ok, everything is just fine now */ if (mddev->to_remove == &raid5_attrs_group) mddev->to_remove = NULL; @@ -6206,7 +6193,7 @@ static int run(struct mddev *mddev) mddev->queue->limits.discard_granularity = stripe; /* * unaligned part of discard request will be ignored, so can't - * guarantee discard_zerors_data + * guarantee discard_zeroes_data */ mddev->queue->limits.discard_zeroes_data = 0; @@ -6231,6 +6218,18 @@ static int run(struct mddev *mddev) !bdev_get_queue(rdev->bdev)-> limits.discard_zeroes_data) discard_supported = false; + /* Unfortunately, discard_zeroes_data is not currently + * a guarantee - just a hint. So we only allow DISCARD + * if the sysadmin has confirmed that only safe devices + * are in use by setting a module parameter. + */ + if (!devices_handle_discard_safely) { + if (discard_supported) { + pr_info("md/raid456: discard support disabled due to uncertainty.\n"); + pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n"); + } + discard_supported = false; + } } if (discard_supported && @@ -6796,7 +6795,6 @@ static void raid5_quiesce(struct mddev *mddev, int state) } } - static void *raid45_takeover_raid0(struct mddev *mddev, int level) { struct r0conf *raid0_conf = mddev->private; @@ -6823,7 +6821,6 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level) return setup_conf(mddev); } - static void *raid5_takeover_raid1(struct mddev *mddev) { int chunksect; @@ -6884,7 +6881,6 @@ static void *raid5_takeover_raid6(struct mddev *mddev) return setup_conf(mddev); } - static int raid5_check_reshape(struct mddev *mddev) { /* For a 2-drive array, the layout and chunk size can be changed @@ -7033,7 +7029,6 @@ static void *raid6_takeover(struct mddev *mddev) return setup_conf(mddev); } - static struct md_personality raid6_personality = { .name = "raid6", diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index bc72cd4be5f8..d59f5ca743cd 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -155,7 +155,7 @@ */ /* - * Operations state - intermediate states that are visible outside of + * Operations state - intermediate states that are visible outside of * STRIPE_ACTIVE. * In general _idle indicates nothing is running, _run indicates a data * processing operation is active, and _result means the data processing result @@ -364,7 +364,6 @@ enum { * HANDLE gets cleared if stripe_handle leaves nothing locked. */ - struct disk_info { struct md_rdev *rdev, *replacement; }; @@ -528,7 +527,6 @@ struct r5conf { #define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */ #define ALGORITHM_ROTATING_N_CONTINUE 10 /*DDF PRL=6 RLQ=3 */ - /* For every RAID5 algorithm we define a RAID6 algorithm * with exactly the same layout for data and parity, and * with the Q block always on the last device (N-1). |