diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Kconfig | 2 | ||||
-rw-r--r-- | drivers/md/bcache/closure.h | 3 | ||||
-rw-r--r-- | drivers/md/bcache/io.c | 1 | ||||
-rw-r--r-- | drivers/md/bcache/journal.c | 2 | ||||
-rw-r--r-- | drivers/md/bcache/request.c | 14 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 28 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy-mq.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy-smq.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-cache-target.c | 37 | ||||
-rw-r--r-- | drivers/md/dm-thin-metadata.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-thin.c | 53 | ||||
-rw-r--r-- | drivers/md/dm.c | 39 | ||||
-rw-r--r-- | drivers/md/md-cluster.c | 12 | ||||
-rw-r--r-- | drivers/md/md-cluster.h | 2 | ||||
-rw-r--r-- | drivers/md/md.c | 6 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-btree-internal.h | 6 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-btree-remove.c | 23 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-btree-spine.c | 37 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-btree.c | 9 | ||||
-rw-r--r-- | drivers/md/raid1.c | 19 | ||||
-rw-r--r-- | drivers/md/raid10.c | 5 | ||||
-rw-r--r-- | drivers/md/raid5.c | 38 | ||||
-rw-r--r-- | drivers/md/raid5.h | 3 |
23 files changed, 236 insertions, 113 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index b59727309072..bfec3bdfe598 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -259,7 +259,7 @@ config DM_CRYPT the ciphers you're going to use in the cryptoapi configuration. For further information on dm-crypt and userspace tools see: - <http://code.google.com/p/cryptsetup/wiki/DMCrypt> + <https://gitlab.com/cryptsetup/cryptsetup/wikis/DMCrypt> To compile this code as a module, choose M here: the module will be called dm-crypt. diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index a08e3eeac3c5..79a6d63e8ed3 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -320,7 +320,6 @@ static inline void closure_wake_up(struct closure_waitlist *list) do { \ set_closure_fn(_cl, _fn, _wq); \ closure_sub(_cl, CLOSURE_RUNNING + 1); \ - return; \ } while (0) /** @@ -349,7 +348,6 @@ do { \ do { \ set_closure_fn(_cl, _fn, _wq); \ closure_queue(_cl); \ - return; \ } while (0) /** @@ -365,7 +363,6 @@ do { \ do { \ set_closure_fn(_cl, _destructor, NULL); \ closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ - return; \ } while (0) /** diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index cb64e64a4789..bf6a9ca18403 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -105,6 +105,7 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) } while (n != bio); continue_at(&s->cl, bch_bio_submit_split_done, NULL); + return; submit: generic_make_request(bio); } diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index ce64fc851251..418607a6ba33 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -592,12 +592,14 @@ static void journal_write_unlocked(struct closure *cl) if (!w->need_write) { closure_return_with_destructor(cl, journal_write_unlock); + return; } else if (journal_full(&c->journal)) { journal_reclaim(c); spin_unlock(&c->journal.lock); btree_flush_write(c); continue_at(cl, journal_write, system_wq); + return; } c->journal.blocks_free -= set_blocks(w->data, block_bytes(c)); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 4afb2d26b148..f292790997d7 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -88,8 +88,10 @@ static void bch_data_insert_keys(struct closure *cl) if (journal_ref) atomic_dec_bug(journal_ref); - if (!op->insert_data_done) + if (!op->insert_data_done) { continue_at(cl, bch_data_insert_start, op->wq); + return; + } bch_keylist_free(&op->insert_keys); closure_return(cl); @@ -216,8 +218,10 @@ static void bch_data_insert_start(struct closure *cl) /* 1 for the device pointer and 1 for the chksum */ if (bch_keylist_realloc(&op->insert_keys, 3 + (op->csum ? 1 : 0), - op->c)) + op->c)) { continue_at(cl, bch_data_insert_keys, op->wq); + return; + } k = op->insert_keys.top; bkey_init(k); @@ -255,6 +259,7 @@ static void bch_data_insert_start(struct closure *cl) op->insert_data_done = true; continue_at(cl, bch_data_insert_keys, op->wq); + return; err: /* bch_alloc_sectors() blocks if s->writeback = true */ BUG_ON(op->writeback); @@ -576,8 +581,10 @@ static void cache_lookup(struct closure *cl) ret = bch_btree_map_keys(&s->op, s->iop.c, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0), cache_lookup_fn, MAP_END_KEY); - if (ret == -EAGAIN) + if (ret == -EAGAIN) { continue_at(cl, cache_lookup, bcache_wq); + return; + } closure_return(cl); } @@ -1085,6 +1092,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) continue_at_nobarrier(&s->cl, flash_dev_nodata, bcache_wq); + return; } else if (rw) { bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &KEY(d->id, bio->bi_iter.bi_sector, 0), diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index ed2346ddf4c9..e51de52eeb94 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -494,7 +494,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) bitmap_super_t *sb; unsigned long chunksize, daemon_sleep, write_behind; - bitmap->storage.sb_page = alloc_page(GFP_KERNEL); + bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (bitmap->storage.sb_page == NULL) return -ENOMEM; bitmap->storage.sb_page->index = 0; @@ -541,6 +541,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) sb->state = cpu_to_le32(bitmap->flags); bitmap->events_cleared = bitmap->mddev->events; sb->events_cleared = cpu_to_le64(bitmap->mddev->events); + bitmap->mddev->bitmap_info.nodes = 0; kunmap_atomic(sb); @@ -558,6 +559,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) unsigned long sectors_reserved = 0; int err = -EINVAL; struct page *sb_page; + loff_t offset = bitmap->mddev->bitmap_info.offset; if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) { chunksize = 128 * 1024 * 1024; @@ -584,9 +586,9 @@ re_read: bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t); /* to 4k blocks */ bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); - bitmap->mddev->bitmap_info.offset += bitmap->cluster_slot * (bm_blocks << 3); + offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3)); pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, - bitmap->cluster_slot, (unsigned long long)bitmap->mddev->bitmap_info.offset); + bitmap->cluster_slot, offset); } if (bitmap->storage.file) { @@ -597,7 +599,7 @@ re_read: bitmap, bytes, sb_page); } else { err = read_sb_page(bitmap->mddev, - bitmap->mddev->bitmap_info.offset, + offset, sb_page, 0, sizeof(bitmap_super_t)); } @@ -611,8 +613,16 @@ re_read: daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; write_behind = le32_to_cpu(sb->write_behind); sectors_reserved = le32_to_cpu(sb->sectors_reserved); - nodes = le32_to_cpu(sb->nodes); - strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64); + /* XXX: This is a hack to ensure that we don't use clustering + * in case: + * - dm-raid is in use and + * - the nodes written in bitmap_sb is erroneous. + */ + if (!bitmap->mddev->sync_super) { + nodes = le32_to_cpu(sb->nodes); + strlcpy(bitmap->mddev->bitmap_info.cluster_name, + sb->cluster_name, 64); + } /* verify that the bitmap-specific fields are valid */ if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) @@ -671,7 +681,7 @@ out: kunmap_atomic(sb); /* Assiging chunksize is required for "re_read" */ bitmap->mddev->bitmap_info.chunksize = chunksize; - if (nodes && (bitmap->cluster_slot < 0)) { + if (err == 0 && nodes && (bitmap->cluster_slot < 0)) { err = md_setup_cluster(bitmap->mddev, nodes); if (err) { pr_err("%s: Could not setup cluster service (%d)\n", @@ -1866,10 +1876,6 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot, if (IS_ERR(bitmap)) return PTR_ERR(bitmap); - rv = bitmap_read_sb(bitmap); - if (rv) - goto err; - rv = bitmap_init_from_disk(bitmap, 0); if (rv) goto err; diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 32814371b8d3..aa1b41ca40f7 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -1471,5 +1471,3 @@ module_exit(mq_exit); MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("mq cache policy"); - -MODULE_ALIAS("dm-cache-default"); diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index b6f22651dd35..200366c62231 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -1686,7 +1686,7 @@ static struct dm_cache_policy *smq_create(dm_cblock_t cache_size, if (from_cblock(cache_size)) { mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size)); - if (!mq->cache_hit_bits && mq->cache_hit_bits) { + if (!mq->cache_hit_bits) { DMERR("couldn't allocate cache hit bitset"); goto bad_cache_hit_bits; } @@ -1789,3 +1789,5 @@ module_exit(smq_exit); MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("smq cache policy"); + +MODULE_ALIAS("dm-cache-default"); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 1b4e1756b169..1fe93cfea7d3 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1947,6 +1947,7 @@ static int commit_if_needed(struct cache *cache) static void process_deferred_bios(struct cache *cache) { + bool prealloc_used = false; unsigned long flags; struct bio_list bios; struct bio *bio; @@ -1966,6 +1967,7 @@ static void process_deferred_bios(struct cache *cache) * this bio might require one, we pause until there are some * prepared mappings to process. */ + prealloc_used = true; if (prealloc_data_structs(cache, &structs)) { spin_lock_irqsave(&cache->lock, flags); bio_list_merge(&cache->deferred_bios, &bios); @@ -1983,11 +1985,13 @@ static void process_deferred_bios(struct cache *cache) process_bio(cache, &structs, bio); } - prealloc_free_structs(cache, &structs); + if (prealloc_used) + prealloc_free_structs(cache, &structs); } static void process_deferred_cells(struct cache *cache) { + bool prealloc_used = false; unsigned long flags; struct dm_bio_prison_cell *cell, *tmp; struct list_head cells; @@ -2007,6 +2011,7 @@ static void process_deferred_cells(struct cache *cache) * this bio might require one, we pause until there are some * prepared mappings to process. */ + prealloc_used = true; if (prealloc_data_structs(cache, &structs)) { spin_lock_irqsave(&cache->lock, flags); list_splice(&cells, &cache->deferred_cells); @@ -2017,7 +2022,8 @@ static void process_deferred_cells(struct cache *cache) process_cell(cache, &structs, cell); } - prealloc_free_structs(cache, &structs); + if (prealloc_used) + prealloc_free_structs(cache, &structs); } static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) @@ -2062,7 +2068,7 @@ static void process_deferred_writethrough_bios(struct cache *cache) static void writeback_some_dirty_blocks(struct cache *cache) { - int r = 0; + bool prealloc_used = false; dm_oblock_t oblock; dm_cblock_t cblock; struct prealloc structs; @@ -2072,15 +2078,12 @@ static void writeback_some_dirty_blocks(struct cache *cache) memset(&structs, 0, sizeof(structs)); while (spare_migration_bandwidth(cache)) { - if (prealloc_data_structs(cache, &structs)) - break; - - r = policy_writeback_work(cache->policy, &oblock, &cblock, busy); - if (r) - break; + if (policy_writeback_work(cache->policy, &oblock, &cblock, busy)) + break; /* no work to do */ - r = get_cell(cache, oblock, &structs, &old_ocell); - if (r) { + prealloc_used = true; + if (prealloc_data_structs(cache, &structs) || + get_cell(cache, oblock, &structs, &old_ocell)) { policy_set_dirty(cache->policy, oblock); break; } @@ -2088,7 +2091,8 @@ static void writeback_some_dirty_blocks(struct cache *cache) writeback(cache, &structs, oblock, cblock, old_ocell); } - prealloc_free_structs(cache, &structs); + if (prealloc_used) + prealloc_free_structs(cache, &structs); } /*---------------------------------------------------------------- @@ -3496,7 +3500,7 @@ static void cache_resume(struct dm_target *ti) * <#demotions> <#promotions> <#dirty> * <#features> <features>* * <#core args> <core args> - * <policy name> <#policy args> <policy args>* <cache metadata mode> + * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check> */ static void cache_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) @@ -3582,6 +3586,11 @@ static void cache_status(struct dm_target *ti, status_type_t type, else DMEMIT("rw "); + if (dm_cache_metadata_needs_check(cache->cmd)) + DMEMIT("needs_check "); + else + DMEMIT("- "); + break; case STATUSTYPE_TABLE: @@ -3820,7 +3829,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 7, 0}, + .version = {1, 8, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 48dfe3c4d6aa..6ba47cfb1443 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1293,8 +1293,8 @@ static int __release_metadata_snap(struct dm_pool_metadata *pmd) return r; disk_super = dm_block_data(copy); - dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->data_mapping_root)); - dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->device_details_root)); + dm_btree_del(&pmd->info, le64_to_cpu(disk_super->data_mapping_root)); + dm_btree_del(&pmd->details_info, le64_to_cpu(disk_super->device_details_root)); dm_sm_dec_block(pmd->metadata_sm, held_root); return dm_tm_unlock(pmd->tm, copy); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index c33f61a4cc28..d2bbe8cc1e97 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -18,6 +18,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/vmalloc.h> #include <linux/sort.h> #include <linux/rbtree.h> @@ -268,7 +269,7 @@ struct pool { process_mapping_fn process_prepared_mapping; process_mapping_fn process_prepared_discard; - struct dm_bio_prison_cell *cell_sort_array[CELL_SORT_ARRAY_SIZE]; + struct dm_bio_prison_cell **cell_sort_array; }; static enum pool_mode get_pool_mode(struct pool *pool); @@ -665,16 +666,21 @@ static void requeue_io(struct thin_c *tc) requeue_deferred_cells(tc); } -static void error_retry_list(struct pool *pool) +static void error_retry_list_with_code(struct pool *pool, int error) { struct thin_c *tc; rcu_read_lock(); list_for_each_entry_rcu(tc, &pool->active_thins, list) - error_thin_bio_list(tc, &tc->retry_on_resume_list, -EIO); + error_thin_bio_list(tc, &tc->retry_on_resume_list, error); rcu_read_unlock(); } +static void error_retry_list(struct pool *pool) +{ + return error_retry_list_with_code(pool, -EIO); +} + /* * This section of code contains the logic for processing a thin device's IO. * Much of the code depends on pool object resources (lists, workqueues, etc) @@ -2281,18 +2287,23 @@ static void do_waker(struct work_struct *ws) queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); } +static void notify_of_pool_mode_change_to_oods(struct pool *pool); + /* * We're holding onto IO to allow userland time to react. After the * timeout either the pool will have been resized (and thus back in - * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO. + * PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE w/ error_if_no_space. */ static void do_no_space_timeout(struct work_struct *ws) { struct pool *pool = container_of(to_delayed_work(ws), struct pool, no_space_timeout); - if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) - set_pool_mode(pool, PM_READ_ONLY); + if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) { + pool->pf.error_if_no_space = true; + notify_of_pool_mode_change_to_oods(pool); + error_retry_list_with_code(pool, -ENOSPC); + } } /*----------------------------------------------------------------*/ @@ -2370,6 +2381,14 @@ static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode) dm_device_name(pool->pool_md), new_mode); } +static void notify_of_pool_mode_change_to_oods(struct pool *pool) +{ + if (!pool->pf.error_if_no_space) + notify_of_pool_mode_change(pool, "out-of-data-space (queue IO)"); + else + notify_of_pool_mode_change(pool, "out-of-data-space (error IO)"); +} + static bool passdown_enabled(struct pool_c *pt) { return pt->adjusted_pf.discard_passdown; @@ -2454,7 +2473,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) * frequently seeing this mode. */ if (old_mode != new_mode) - notify_of_pool_mode_change(pool, "out-of-data-space"); + notify_of_pool_mode_change_to_oods(pool); pool->process_bio = process_bio_read_only; pool->process_discard = process_discard_bio; pool->process_cell = process_cell_read_only; @@ -2777,6 +2796,7 @@ static void __pool_destroy(struct pool *pool) { __pool_table_remove(pool); + vfree(pool->cell_sort_array); if (dm_pool_metadata_close(pool->pmd) < 0) DMWARN("%s: dm_pool_metadata_close() failed.", __func__); @@ -2889,6 +2909,13 @@ static struct pool *pool_create(struct mapped_device *pool_md, goto bad_mapping_pool; } + pool->cell_sort_array = vmalloc(sizeof(*pool->cell_sort_array) * CELL_SORT_ARRAY_SIZE); + if (!pool->cell_sort_array) { + *error = "Error allocating cell sort array"; + err_p = ERR_PTR(-ENOMEM); + goto bad_sort_array; + } + pool->ref_count = 1; pool->last_commit_jiffies = jiffies; pool->pool_md = pool_md; @@ -2897,6 +2924,8 @@ static struct pool *pool_create(struct mapped_device *pool_md, return pool; +bad_sort_array: + mempool_destroy(pool->mapping_pool); bad_mapping_pool: dm_deferred_set_destroy(pool->all_io_ds); bad_all_io_ds: @@ -3714,6 +3743,7 @@ static void emit_flags(struct pool_features *pf, char *result, * Status line is: * <transaction id> <used metadata sectors>/<total metadata sectors> * <used data sectors>/<total data sectors> <held metadata root> + * <pool mode> <discard config> <no space config> <needs_check> */ static void pool_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) @@ -3815,6 +3845,11 @@ static void pool_status(struct dm_target *ti, status_type_t type, else DMEMIT("queue_if_no_space "); + if (dm_pool_metadata_needs_check(pool->pmd)) + DMEMIT("needs_check "); + else + DMEMIT("- "); + break; case STATUSTYPE_TABLE: @@ -3918,7 +3953,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 15, 0}, + .version = {1, 16, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -4305,7 +4340,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type thin_target = { .name = "thin", - .version = {1, 15, 0}, + .version = {1, 16, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f331d888e7f5..0d7ab20c58df 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1067,13 +1067,10 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig) */ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) { - int nr_requests_pending; - atomic_dec(&md->pending[rw]); /* nudge anyone waiting on suspend queue */ - nr_requests_pending = md_in_flight(md); - if (!nr_requests_pending) + if (!md_in_flight(md)) wake_up(&md->wait); /* @@ -1085,8 +1082,7 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) if (run_queue) { if (md->queue->mq_ops) blk_mq_run_hw_queues(md->queue, true); - else if (!nr_requests_pending || - (nr_requests_pending >= md->queue->nr_congestion_on)) + else blk_run_queue_async(md->queue); } @@ -1733,7 +1729,8 @@ static int dm_merge_bvec(struct request_queue *q, struct mapped_device *md = q->queuedata; struct dm_table *map = dm_get_live_table_fast(md); struct dm_target *ti; - sector_t max_sectors, max_size = 0; + sector_t max_sectors; + int max_size = 0; if (unlikely(!map)) goto out; @@ -1746,18 +1743,10 @@ static int dm_merge_bvec(struct request_queue *q, * Find maximum amount of I/O that won't need splitting */ max_sectors = min(max_io_len(bvm->bi_sector, ti), - (sector_t) queue_max_sectors(q)); + (sector_t) BIO_MAX_SECTORS); max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; - - /* - * FIXME: this stop-gap fix _must_ be cleaned up (by passing a sector_t - * to the targets' merge function since it holds sectors not bytes). - * Just doing this as an interim fix for stable@ because the more - * comprehensive cleanup of switching to sector_t will impact every - * DM target that implements a ->merge hook. - */ - if (max_size > INT_MAX) - max_size = INT_MAX; + if (max_size < 0) + max_size = 0; /* * merge_bvec_fn() returns number of bytes @@ -1765,13 +1754,13 @@ static int dm_merge_bvec(struct request_queue *q, * max is precomputed maximal io size */ if (max_size && ti->type->merge) - max_size = ti->type->merge(ti, bvm, biovec, (int) max_size); + max_size = ti->type->merge(ti, bvm, biovec, max_size); /* * If the target doesn't support merge method and some of the devices - * provided their merge_bvec method (we know this by looking for the - * max_hw_sectors that dm_set_device_limits may set), then we can't - * allow bios with multiple vector entries. So always set max_size - * to 0, and the code below allows just one page. + * provided their merge_bvec method (we know this by looking at + * queue_max_hw_sectors), then we can't allow bios with multiple vector + * entries. So always set max_size to 0, and the code below allows + * just one page. */ else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) max_size = 0; @@ -2281,8 +2270,6 @@ static void dm_init_old_md_queue(struct mapped_device *md) static void cleanup_mapped_device(struct mapped_device *md) { - cleanup_srcu_struct(&md->io_barrier); - if (md->wq) destroy_workqueue(md->wq); if (md->kworker_task) @@ -2294,6 +2281,8 @@ static void cleanup_mapped_device(struct mapped_device *md) if (md->bs) bioset_free(md->bs); + cleanup_srcu_struct(&md->io_barrier); + if (md->disk) { spin_lock(&_minor_lock); md->disk->private_data = NULL; diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index fcfc4b9b2672..0072190515e0 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -44,6 +44,7 @@ struct resync_info { /* md_cluster_info flags */ #define MD_CLUSTER_WAITING_FOR_NEWDISK 1 +#define MD_CLUSTER_SUSPEND_READ_BALANCING 2 struct md_cluster_info { @@ -275,6 +276,9 @@ clear_bit: static void recover_prep(void *arg) { + struct mddev *mddev = arg; + struct md_cluster_info *cinfo = mddev->cluster_info; + set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state); } static void recover_slot(void *arg, struct dlm_slot *slot) @@ -307,6 +311,7 @@ static void recover_done(void *arg, struct dlm_slot *slots, cinfo->slot_number = our_slot; complete(&cinfo->completion); + clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state); } static const struct dlm_lockspace_ops md_ls_ops = { @@ -816,12 +821,17 @@ static void resync_finish(struct mddev *mddev) resync_send(mddev, RESYNCING, 0, 0); } -static int area_resyncing(struct mddev *mddev, sector_t lo, sector_t hi) +static int area_resyncing(struct mddev *mddev, int direction, + sector_t lo, sector_t hi) { struct md_cluster_info *cinfo = mddev->cluster_info; int ret = 0; struct suspend_info *s; + if ((direction == READ) && + test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state)) + return 1; + spin_lock_irq(&cinfo->suspend_lock); if (list_empty(&cinfo->suspend_list)) goto out; diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index 6817ee00e053..00defe2badbc 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -18,7 +18,7 @@ struct md_cluster_operations { int (*metadata_update_start)(struct mddev *mddev); int (*metadata_update_finish)(struct mddev *mddev); int (*metadata_update_cancel)(struct mddev *mddev); - int (*area_resyncing)(struct mddev *mddev, sector_t lo, sector_t hi); + int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi); int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev); int (*add_new_disk_finish)(struct mddev *mddev); int (*new_disk_ack)(struct mddev *mddev, bool ack); diff --git a/drivers/md/md.c b/drivers/md/md.c index d429c30cd514..e25f00f0138a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5382,6 +5382,8 @@ static void __md_stop(struct mddev *mddev) { struct md_personality *pers = mddev->pers; mddev_detach(mddev); + /* Ensure ->event_work is done */ + flush_workqueue(md_misc_wq); spin_lock(&mddev->lock); mddev->ready = 0; mddev->pers = NULL; @@ -5757,7 +5759,7 @@ static int get_bitmap_file(struct mddev *mddev, void __user * arg) char *ptr; int err; - file = kmalloc(sizeof(*file), GFP_NOIO); + file = kzalloc(sizeof(*file), GFP_NOIO); if (!file) return -ENOMEM; @@ -7437,7 +7439,7 @@ int md_setup_cluster(struct mddev *mddev, int nodes) err = request_module("md-cluster"); if (err) { pr_err("md-cluster module not found.\n"); - return err; + return -ENOENT; } spin_lock(&pers_lock); diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index bf2b80d5c470..8731b6ea026b 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -138,4 +138,10 @@ int lower_bound(struct btree_node *n, uint64_t key); extern struct dm_block_validator btree_node_validator; +/* + * Value type for upper levels of multi-level btrees. + */ +extern void init_le64_type(struct dm_transaction_manager *tm, + struct dm_btree_value_type *vt); + #endif /* DM_BTREE_INTERNAL_H */ diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index e04cfd2d60ef..4222f774cf36 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -309,8 +309,8 @@ static void redistribute3(struct dm_btree_info *info, struct btree_node *parent, if (s < 0 && nr_center < -s) { /* not enough in central node */ - shift(left, center, nr_center); - s = nr_center - target; + shift(left, center, -nr_center); + s += nr_center; shift(left, right, s); nr_right += s; } else @@ -323,7 +323,7 @@ static void redistribute3(struct dm_btree_info *info, struct btree_node *parent, if (s > 0 && nr_center < s) { /* not enough in central node */ shift(center, right, nr_center); - s = target - nr_center; + s -= nr_center; shift(left, right, s); nr_left -= s; } else @@ -544,14 +544,6 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, return r; } -static struct dm_btree_value_type le64_type = { - .context = NULL, - .size = sizeof(__le64), - .inc = NULL, - .dec = NULL, - .equal = NULL -}; - int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, uint64_t *keys, dm_block_t *new_root) { @@ -559,12 +551,14 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, int index = 0, r = 0; struct shadow_spine spine; struct btree_node *n; + struct dm_btree_value_type le64_vt; + init_le64_type(info->tm, &le64_vt); init_shadow_spine(&spine, info); for (level = 0; level < info->levels; level++) { r = remove_raw(&spine, info, (level == last_level ? - &info->value_type : &le64_type), + &info->value_type : &le64_vt), root, keys[level], (unsigned *)&index); if (r < 0) break; @@ -654,11 +648,13 @@ static int remove_one(struct dm_btree_info *info, dm_block_t root, int index = 0, r = 0; struct shadow_spine spine; struct btree_node *n; + struct dm_btree_value_type le64_vt; uint64_t k; + init_le64_type(info->tm, &le64_vt); init_shadow_spine(&spine, info); for (level = 0; level < last_level; level++) { - r = remove_raw(&spine, info, &le64_type, + r = remove_raw(&spine, info, &le64_vt, root, keys[level], (unsigned *) &index); if (r < 0) goto out; @@ -689,6 +685,7 @@ static int remove_one(struct dm_btree_info *info, dm_block_t root, value_ptr(n, index)); delete_at(n, index); + keys[last_level] = k + 1ull; } else r = -ENODATA; diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c index 1b5e13ec7f96..0dee514ba4c5 100644 --- a/drivers/md/persistent-data/dm-btree-spine.c +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -249,3 +249,40 @@ int shadow_root(struct shadow_spine *s) { return s->root; } + +static void le64_inc(void *context, const void *value_le) +{ + struct dm_transaction_manager *tm = context; + __le64 v_le; + + memcpy(&v_le, value_le, sizeof(v_le)); + dm_tm_inc(tm, le64_to_cpu(v_le)); +} + +static void le64_dec(void *context, const void *value_le) +{ + struct dm_transaction_manager *tm = context; + __le64 v_le; + + memcpy(&v_le, value_le, sizeof(v_le)); + dm_tm_dec(tm, le64_to_cpu(v_le)); +} + +static int le64_equal(void *context, const void *value1_le, const void *value2_le) +{ + __le64 v1_le, v2_le; + + memcpy(&v1_le, value1_le, sizeof(v1_le)); + memcpy(&v2_le, value2_le, sizeof(v2_le)); + return v1_le == v2_le; +} + +void init_le64_type(struct dm_transaction_manager *tm, + struct dm_btree_value_type *vt) +{ + vt->context = tm; + vt->size = sizeof(__le64); + vt->inc = le64_inc; + vt->dec = le64_dec; + vt->equal = le64_equal; +} diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 200ac12a1d40..c7726cebc495 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -255,7 +255,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root) int r; struct del_stack *s; - s = kmalloc(sizeof(*s), GFP_KERNEL); + s = kmalloc(sizeof(*s), GFP_NOIO); if (!s) return -ENOMEM; s->info = info; @@ -667,12 +667,7 @@ static int insert(struct dm_btree_info *info, dm_block_t root, struct btree_node *n; struct dm_btree_value_type le64_type; - le64_type.context = NULL; - le64_type.size = sizeof(__le64); - le64_type.inc = NULL; - le64_type.dec = NULL; - le64_type.equal = NULL; - + init_le64_type(info->tm, &le64_type); init_shadow_spine(&spine, info); for (level = 0; level < (info->levels - 1); level++) { diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f80f1af61ce7..967a4ed73929 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -336,7 +336,7 @@ static void raid1_end_read_request(struct bio *bio, int error) spin_lock_irqsave(&conf->device_lock, flags); if (r1_bio->mddev->degraded == conf->raid_disks || (r1_bio->mddev->degraded == conf->raid_disks-1 && - !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags))) + test_bit(In_sync, &conf->mirrors[mirror].rdev->flags))) uptodate = 1; spin_unlock_irqrestore(&conf->device_lock, flags); } @@ -541,7 +541,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect if ((conf->mddev->recovery_cp < this_sector + sectors) || (mddev_is_clustered(conf->mddev) && - md_cluster_ops->area_resyncing(conf->mddev, this_sector, + md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, this_sector + sectors))) choose_first = 1; else @@ -1111,7 +1111,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) ((bio_end_sector(bio) > mddev->suspend_lo && bio->bi_iter.bi_sector < mddev->suspend_hi) || (mddev_is_clustered(mddev) && - md_cluster_ops->area_resyncing(mddev, bio->bi_iter.bi_sector, bio_end_sector(bio))))) { + md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, bio_end_sector(bio))))) { /* As the suspend_* range is controlled by * userspace, we want an interruptible * wait. @@ -1124,7 +1125,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) if (bio_end_sector(bio) <= mddev->suspend_lo || bio->bi_iter.bi_sector >= mddev->suspend_hi || (mddev_is_clustered(mddev) && - !md_cluster_ops->area_resyncing(mddev, + !md_cluster_ops->area_resyncing(mddev, WRITE, bio->bi_iter.bi_sector, bio_end_sector(bio)))) break; schedule(); @@ -1475,6 +1476,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; struct r1conf *conf = mddev->private; + unsigned long flags; /* * If it is not operational, then we have already marked it as dead @@ -1494,14 +1496,13 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) return; } set_bit(Blocked, &rdev->flags); + spin_lock_irqsave(&conf->device_lock, flags); if (test_and_clear_bit(In_sync, &rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded++; set_bit(Faulty, &rdev->flags); - spin_unlock_irqrestore(&conf->device_lock, flags); } else set_bit(Faulty, &rdev->flags); + spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery is running, make sure it aborts. */ @@ -1567,7 +1568,10 @@ static int raid1_spare_active(struct mddev *mddev) * Find all failed disks within the RAID1 configuration * and mark them readable. * Called under mddev lock, so rcu protection not needed. + * device_lock used to avoid races with raid1_end_read_request + * which expects 'In_sync' flags and ->degraded to be consistent. */ + spin_lock_irqsave(&conf->device_lock, flags); for (i = 0; i < conf->raid_disks; i++) { struct md_rdev *rdev = conf->mirrors[i].rdev; struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev; @@ -1598,7 +1602,6 @@ static int raid1_spare_active(struct mddev *mddev) sysfs_notify_dirent_safe(rdev->sysfs_state); } } - spin_lock_irqsave(&conf->device_lock, flags); mddev->degraded -= count; spin_unlock_irqrestore(&conf->device_lock, flags); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 940f2f365461..38c58e19cfce 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3556,6 +3556,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) /* far_copies must be 1 */ conf->prev.stride = conf->dev_sectors; } + conf->reshape_safe = conf->reshape_progress; spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); @@ -3760,7 +3761,6 @@ static int run(struct mddev *mddev) } conf->offset_diff = min_offset_diff; - conf->reshape_safe = conf->reshape_progress; clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); @@ -4103,6 +4103,7 @@ static int raid10_start_reshape(struct mddev *mddev) conf->reshape_progress = size; } else conf->reshape_progress = 0; + conf->reshape_safe = conf->reshape_progress; spin_unlock_irq(&conf->device_lock); if (mddev->delta_disks && mddev->bitmap) { @@ -4170,6 +4171,7 @@ abort: rdev->new_data_offset = rdev->data_offset; smp_wmb(); conf->reshape_progress = MaxSector; + conf->reshape_safe = MaxSector; mddev->reshape_position = MaxSector; spin_unlock_irq(&conf->device_lock); return ret; @@ -4524,6 +4526,7 @@ static void end_reshape(struct r10conf *conf) md_finish_reshape(conf->mddev); smp_wmb(); conf->reshape_progress = MaxSector; + conf->reshape_safe = MaxSector; spin_unlock_irq(&conf->device_lock); /* read-ahead size must cover two whole stripes, which is diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 59e44e99eef3..f757023fc458 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2162,6 +2162,9 @@ static int resize_stripes(struct r5conf *conf, int newsize) if (!sc) return -ENOMEM; + /* Need to ensure auto-resizing doesn't interfere */ + mutex_lock(&conf->cache_size_mutex); + for (i = conf->max_nr_stripes; i; i--) { nsh = alloc_stripe(sc, GFP_KERNEL); if (!nsh) @@ -2178,6 +2181,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) kmem_cache_free(sc, nsh); } kmem_cache_destroy(sc); + mutex_unlock(&conf->cache_size_mutex); return -ENOMEM; } /* Step 2 - Must use GFP_NOIO now. @@ -2224,6 +2228,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) } else err = -ENOMEM; + mutex_unlock(&conf->cache_size_mutex); /* Step 4, return new stripes to service */ while(!list_empty(&newstripes)) { nsh = list_entry(newstripes.next, struct stripe_head, lru); @@ -2251,7 +2256,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) static int drop_one_stripe(struct r5conf *conf) { struct stripe_head *sh; - int hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS; + int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK; spin_lock_irq(conf->hash_locks + hash); sh = get_free_stripe(conf, hash); @@ -4061,8 +4066,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) &first_bad, &bad_sectors)) set_bit(R5_ReadRepl, &dev->flags); else { - if (rdev) + if (rdev && !test_bit(Faulty, &rdev->flags)) set_bit(R5_NeedReplace, &dev->flags); + else + clear_bit(R5_NeedReplace, &dev->flags); rdev = rcu_dereference(conf->disks[i].rdev); clear_bit(R5_ReadRepl, &dev->flags); } @@ -5857,12 +5864,14 @@ static void raid5d(struct md_thread *thread) pr_debug("%d stripes handled\n", handled); spin_unlock_irq(&conf->device_lock); - if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) { + if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) && + mutex_trylock(&conf->cache_size_mutex)) { grow_one_stripe(conf, __GFP_NOWARN); /* Set flag even if allocation failed. This helps * slow down allocation requests when mem is short */ set_bit(R5_DID_ALLOC, &conf->cache_state); + mutex_unlock(&conf->cache_size_mutex); } async_tx_issue_pending_all(); @@ -5894,18 +5903,22 @@ raid5_set_cache_size(struct mddev *mddev, int size) return -EINVAL; conf->min_nr_stripes = size; + mutex_lock(&conf->cache_size_mutex); while (size < conf->max_nr_stripes && drop_one_stripe(conf)) ; + mutex_unlock(&conf->cache_size_mutex); err = md_allow_write(mddev); if (err) return err; + mutex_lock(&conf->cache_size_mutex); while (size > conf->max_nr_stripes) if (!grow_one_stripe(conf, GFP_KERNEL)) break; + mutex_unlock(&conf->cache_size_mutex); return 0; } @@ -6371,11 +6384,19 @@ static unsigned long raid5_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); - int ret = 0; - while (ret < sc->nr_to_scan) { - if (drop_one_stripe(conf) == 0) - return SHRINK_STOP; - ret++; + unsigned long ret = SHRINK_STOP; + + if (mutex_trylock(&conf->cache_size_mutex)) { + ret= 0; + while (ret < sc->nr_to_scan && + conf->max_nr_stripes > conf->min_nr_stripes) { + if (drop_one_stripe(conf) == 0) { + ret = SHRINK_STOP; + break; + } + ret++; + } + mutex_unlock(&conf->cache_size_mutex); } return ret; } @@ -6444,6 +6465,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) goto abort; spin_lock_init(&conf->device_lock); seqcount_init(&conf->gen_lock); + mutex_init(&conf->cache_size_mutex); init_waitqueue_head(&conf->wait_for_quiescent); for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) { init_waitqueue_head(&conf->wait_for_stripe[i]); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 02c3bf8fbfe7..d05144278690 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -482,7 +482,8 @@ struct r5conf { */ int active_name; char cache_name[2][32]; - struct kmem_cache *slab_cache; /* for allocating stripes */ + struct kmem_cache *slab_cache; /* for allocating stripes */ + struct mutex cache_size_mutex; /* Protect changes to cache size */ int seq_flush, seq_write; int quiesce; |