diff options
Diffstat (limited to 'drivers/md')
51 files changed, 1878 insertions, 1029 deletions
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 646fe85261c1..18526d44688d 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -11,6 +11,7 @@ #include "bset.h" #include <linux/console.h> +#include <linux/sched/clock.h> #include <linux/random.h> #include <linux/prefetch.h> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index a43eedd5804d..450d0e848ae4 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -32,6 +32,9 @@ #include <linux/prefetch.h> #include <linux/random.h> #include <linux/rcupdate.h> +#include <linux/sched/clock.h> +#include <linux/rculist.h> + #include <trace/events/bcache.h> /* diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 9b2fe2d3e3a9..1ec84ca81146 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -3,6 +3,7 @@ #include <linux/llist.h> #include <linux/sched.h> +#include <linux/sched/task_stack.h> #include <linux/workqueue.h> /* diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 76d20875503c..709c9cc34369 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -666,7 +666,7 @@ static inline struct search *search_alloc(struct bio *bio, s->iop.write_prio = 0; s->iop.error = 0; s->iop.flags = 0; - s->iop.flush_journal = (bio->bi_opf & (REQ_PREFLUSH|REQ_FUA)) != 0; + s->iop.flush_journal = op_is_flush(bio->bi_opf); s->iop.wq = bcache_wq; return s; @@ -1009,7 +1009,7 @@ static int cached_dev_congested(void *data, int bits) struct request_queue *q = bdev_get_queue(dc->bdev); int ret = 0; - if (bdi_congested(&q->backing_dev_info, bits)) + if (bdi_congested(q->backing_dev_info, bits)) return 1; if (cached_dev_get(dc)) { @@ -1018,7 +1018,7 @@ static int cached_dev_congested(void *data, int bits) for_each_cache(ca, d->c, i) { q = bdev_get_queue(ca->bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= bdi_congested(q->backing_dev_info, bits); } cached_dev_put(dc); @@ -1032,7 +1032,7 @@ void bch_cached_dev_request_init(struct cached_dev *dc) struct gendisk *g = dc->disk.disk; g->queue->make_request_fn = cached_dev_make_request; - g->queue->backing_dev_info.congested_fn = cached_dev_congested; + g->queue->backing_dev_info->congested_fn = cached_dev_congested; dc->disk.cache_miss = cached_dev_cache_miss; dc->disk.ioctl = cached_dev_ioctl; } @@ -1125,7 +1125,7 @@ static int flash_dev_congested(void *data, int bits) for_each_cache(ca, d->c, i) { q = bdev_get_queue(ca->bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= bdi_congested(q->backing_dev_info, bits); } return ret; @@ -1136,7 +1136,7 @@ void bch_flash_dev_request_init(struct bcache_device *d) struct gendisk *g = d->disk; g->queue->make_request_fn = flash_dev_make_request; - g->queue->backing_dev_info.congested_fn = flash_dev_congested; + g->queue->backing_dev_info->congested_fn = flash_dev_congested; d->cache_miss = flash_dev_cache_miss; d->ioctl = flash_dev_ioctl; } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 3a19cbc8b230..85e3f21c2514 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -807,7 +807,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, blk_queue_make_request(q, NULL); d->disk->queue = q; q->queuedata = d; - q->backing_dev_info.congested_data = d; + q->backing_dev_info->congested_data = d; q->limits.max_hw_sectors = UINT_MAX; q->limits.max_sectors = UINT_MAX; q->limits.max_segment_size = UINT_MAX; @@ -1132,9 +1132,9 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) set_capacity(dc->disk.disk, dc->bdev->bd_part->nr_sects - dc->sb.data_offset); - dc->disk.disk->queue->backing_dev_info.ra_pages = - max(dc->disk.disk->queue->backing_dev_info.ra_pages, - q->backing_dev_info.ra_pages); + dc->disk.disk->queue->backing_dev_info->ra_pages = + max(dc->disk.disk->queue->backing_dev_info->ra_pages, + q->backing_dev_info->ra_pages); bch_cached_dev_request_init(dc); bch_cached_dev_writeback_init(dc); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index b3ff57d61dde..f90f13616980 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -13,6 +13,7 @@ #include <linux/blkdev.h> #include <linux/sort.h> +#include <linux/sched/clock.h> static const char * const cache_replacement_policies[] = { "lru", diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index dde6172f3f10..8c3a938f4bf0 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/seq_file.h> #include <linux/types.h> +#include <linux/sched/clock.h> #include "util.h" diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index cf2cbc211d83..5d13930f0f22 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -4,8 +4,8 @@ #include <linux/blkdev.h> #include <linux/errno.h> -#include <linux/blkdev.h> #include <linux/kernel.h> +#include <linux/sched/clock.h> #include <linux/llist.h> #include <linux/ratelimit.h> #include <linux/vmalloc.h> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 69e1ae59cab8..6ac2e48b9235 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -13,6 +13,7 @@ #include <linux/delay.h> #include <linux/kthread.h> +#include <linux/sched/clock.h> #include <trace/events/bcache.h> /* Rate limiting */ diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 84d2f0e4c754..df4859f6ac6a 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -11,6 +11,7 @@ #include <linux/device-mapper.h> #include <linux/dm-io.h> #include <linux/slab.h> +#include <linux/sched/mm.h> #include <linux/jiffies.h> #include <linux/vmalloc.h> #include <linux/shrinker.h> @@ -794,7 +795,7 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c) DECLARE_WAITQUEUE(wait, current); add_wait_queue(&c->free_buffer_wait, &wait); - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); dm_bufio_unlock(c); io_schedule(); diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 624fe4319b24..e4c2c1a1e993 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -25,7 +25,7 @@ * defines a range of metadata versions that this module can handle. */ #define MIN_CACHE_VERSION 1 -#define MAX_CACHE_VERSION 1 +#define MAX_CACHE_VERSION 2 #define CACHE_METADATA_CACHE_SIZE 64 @@ -55,6 +55,7 @@ enum mapping_bits { /* * The data on the cache is different from that on the origin. + * This flag is only used by metadata format 1. */ M_DIRTY = 2 }; @@ -93,12 +94,18 @@ struct cache_disk_superblock { __le32 write_misses; __le32 policy_version[CACHE_POLICY_VERSION_SIZE]; + + /* + * Metadata format 2 fields. + */ + __le64 dirty_root; } __packed; struct dm_cache_metadata { atomic_t ref_count; struct list_head list; + unsigned version; struct block_device *bdev; struct dm_block_manager *bm; struct dm_space_map *metadata_sm; @@ -142,11 +149,18 @@ struct dm_cache_metadata { bool fail_io:1; /* + * Metadata format 2 fields. + */ + dm_block_t dirty_root; + struct dm_disk_bitset dirty_info; + + /* * These structures are used when loading metadata. They're too * big to put on the stack. */ struct dm_array_cursor mapping_cursor; struct dm_array_cursor hint_cursor; + struct dm_bitset_cursor dirty_cursor; }; /*------------------------------------------------------------------- @@ -170,6 +184,7 @@ static void sb_prepare_for_write(struct dm_block_validator *v, static int check_metadata_version(struct cache_disk_superblock *disk_super) { uint32_t metadata_version = le32_to_cpu(disk_super->version); + if (metadata_version < MIN_CACHE_VERSION || metadata_version > MAX_CACHE_VERSION) { DMERR("Cache metadata version %u found, but only versions between %u and %u supported.", metadata_version, MIN_CACHE_VERSION, MAX_CACHE_VERSION); @@ -310,6 +325,11 @@ static void __copy_sm_root(struct dm_cache_metadata *cmd, sizeof(cmd->metadata_space_map_root)); } +static bool separate_dirty_bits(struct dm_cache_metadata *cmd) +{ + return cmd->version >= 2; +} + static int __write_initial_superblock(struct dm_cache_metadata *cmd) { int r; @@ -341,7 +361,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) disk_super->flags = 0; memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC); - disk_super->version = cpu_to_le32(MAX_CACHE_VERSION); + disk_super->version = cpu_to_le32(cmd->version); memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version)); disk_super->policy_hint_size = 0; @@ -362,6 +382,9 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) disk_super->write_hits = cpu_to_le32(0); disk_super->write_misses = cpu_to_le32(0); + if (separate_dirty_bits(cmd)) + disk_super->dirty_root = cpu_to_le64(cmd->dirty_root); + return dm_tm_commit(cmd->tm, sblock); } @@ -382,6 +405,13 @@ static int __format_metadata(struct dm_cache_metadata *cmd) if (r < 0) goto bad; + if (separate_dirty_bits(cmd)) { + dm_disk_bitset_init(cmd->tm, &cmd->dirty_info); + r = dm_bitset_empty(&cmd->dirty_info, &cmd->dirty_root); + if (r < 0) + goto bad; + } + dm_disk_bitset_init(cmd->tm, &cmd->discard_info); r = dm_bitset_empty(&cmd->discard_info, &cmd->discard_root); if (r < 0) @@ -407,9 +437,10 @@ bad: static int __check_incompat_features(struct cache_disk_superblock *disk_super, struct dm_cache_metadata *cmd) { - uint32_t features; + uint32_t incompat_flags, features; - features = le32_to_cpu(disk_super->incompat_flags) & ~DM_CACHE_FEATURE_INCOMPAT_SUPP; + incompat_flags = le32_to_cpu(disk_super->incompat_flags); + features = incompat_flags & ~DM_CACHE_FEATURE_INCOMPAT_SUPP; if (features) { DMERR("could not access metadata due to unsupported optional features (%lx).", (unsigned long)features); @@ -470,6 +501,7 @@ static int __open_metadata(struct dm_cache_metadata *cmd) } __setup_mapping_info(cmd); + dm_disk_bitset_init(cmd->tm, &cmd->dirty_info); dm_disk_bitset_init(cmd->tm, &cmd->discard_info); sb_flags = le32_to_cpu(disk_super->flags); cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags); @@ -548,6 +580,7 @@ static unsigned long clear_clean_shutdown(unsigned long flags) static void read_superblock_fields(struct dm_cache_metadata *cmd, struct cache_disk_superblock *disk_super) { + cmd->version = le32_to_cpu(disk_super->version); cmd->flags = le32_to_cpu(disk_super->flags); cmd->root = le64_to_cpu(disk_super->mapping_root); cmd->hint_root = le64_to_cpu(disk_super->hint_root); @@ -567,6 +600,9 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd, cmd->stats.write_hits = le32_to_cpu(disk_super->write_hits); cmd->stats.write_misses = le32_to_cpu(disk_super->write_misses); + if (separate_dirty_bits(cmd)) + cmd->dirty_root = le64_to_cpu(disk_super->dirty_root); + cmd->changed = false; } @@ -625,6 +661,13 @@ static int __commit_transaction(struct dm_cache_metadata *cmd, */ BUILD_BUG_ON(sizeof(struct cache_disk_superblock) > 512); + if (separate_dirty_bits(cmd)) { + r = dm_bitset_flush(&cmd->dirty_info, cmd->dirty_root, + &cmd->dirty_root); + if (r) + return r; + } + r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root, &cmd->discard_root); if (r) @@ -649,6 +692,8 @@ static int __commit_transaction(struct dm_cache_metadata *cmd, update_flags(disk_super, mutator); disk_super->mapping_root = cpu_to_le64(cmd->root); + if (separate_dirty_bits(cmd)) + disk_super->dirty_root = cpu_to_le64(cmd->dirty_root); disk_super->hint_root = cpu_to_le64(cmd->hint_root); disk_super->discard_root = cpu_to_le64(cmd->discard_root); disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); @@ -698,7 +743,8 @@ static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned *flags) static struct dm_cache_metadata *metadata_open(struct block_device *bdev, sector_t data_block_size, bool may_format_device, - size_t policy_hint_size) + size_t policy_hint_size, + unsigned metadata_version) { int r; struct dm_cache_metadata *cmd; @@ -709,6 +755,7 @@ static struct dm_cache_metadata *metadata_open(struct block_device *bdev, return ERR_PTR(-ENOMEM); } + cmd->version = metadata_version; atomic_set(&cmd->ref_count, 1); init_rwsem(&cmd->root_lock); cmd->bdev = bdev; @@ -757,7 +804,8 @@ static struct dm_cache_metadata *lookup(struct block_device *bdev) static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev, sector_t data_block_size, bool may_format_device, - size_t policy_hint_size) + size_t policy_hint_size, + unsigned metadata_version) { struct dm_cache_metadata *cmd, *cmd2; @@ -768,7 +816,8 @@ static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev, if (cmd) return cmd; - cmd = metadata_open(bdev, data_block_size, may_format_device, policy_hint_size); + cmd = metadata_open(bdev, data_block_size, may_format_device, + policy_hint_size, metadata_version); if (!IS_ERR(cmd)) { mutex_lock(&table_lock); cmd2 = lookup(bdev); @@ -800,10 +849,11 @@ static bool same_params(struct dm_cache_metadata *cmd, sector_t data_block_size) struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, sector_t data_block_size, bool may_format_device, - size_t policy_hint_size) + size_t policy_hint_size, + unsigned metadata_version) { - struct dm_cache_metadata *cmd = lookup_or_open(bdev, data_block_size, - may_format_device, policy_hint_size); + struct dm_cache_metadata *cmd = lookup_or_open(bdev, data_block_size, may_format_device, + policy_hint_size, metadata_version); if (!IS_ERR(cmd) && !same_params(cmd, data_block_size)) { dm_cache_metadata_close(cmd); @@ -829,8 +879,8 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd) /* * Checks that the given cache block is either unmapped or clean. */ -static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b, - bool *result) +static int block_clean_combined_dirty(struct dm_cache_metadata *cmd, dm_cblock_t b, + bool *result) { int r; __le64 value; @@ -838,10 +888,8 @@ static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b, unsigned flags; r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(b), &value); - if (r) { - DMERR("block_unmapped_or_clean failed"); + if (r) return r; - } unpack_value(value, &ob, &flags); *result = !((flags & M_VALID) && (flags & M_DIRTY)); @@ -849,17 +897,19 @@ static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b, return 0; } -static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd, - dm_cblock_t begin, dm_cblock_t end, - bool *result) +static int blocks_are_clean_combined_dirty(struct dm_cache_metadata *cmd, + dm_cblock_t begin, dm_cblock_t end, + bool *result) { int r; *result = true; while (begin != end) { - r = block_unmapped_or_clean(cmd, begin, result); - if (r) + r = block_clean_combined_dirty(cmd, begin, result); + if (r) { + DMERR("block_clean_combined_dirty failed"); return r; + } if (!*result) { DMERR("cache block %llu is dirty", @@ -873,6 +923,67 @@ static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd, return 0; } +static int blocks_are_clean_separate_dirty(struct dm_cache_metadata *cmd, + dm_cblock_t begin, dm_cblock_t end, + bool *result) +{ + int r; + bool dirty_flag; + *result = true; + + r = dm_bitset_cursor_begin(&cmd->dirty_info, cmd->dirty_root, + from_cblock(begin), &cmd->dirty_cursor); + if (r) { + DMERR("%s: dm_bitset_cursor_begin for dirty failed", __func__); + return r; + } + + r = dm_bitset_cursor_skip(&cmd->dirty_cursor, from_cblock(begin)); + if (r) { + DMERR("%s: dm_bitset_cursor_skip for dirty failed", __func__); + dm_bitset_cursor_end(&cmd->dirty_cursor); + return r; + } + + while (begin != end) { + /* + * We assume that unmapped blocks have their dirty bit + * cleared. + */ + dirty_flag = dm_bitset_cursor_get_value(&cmd->dirty_cursor); + if (dirty_flag) { + DMERR("%s: cache block %llu is dirty", __func__, + (unsigned long long) from_cblock(begin)); + dm_bitset_cursor_end(&cmd->dirty_cursor); + *result = false; + return 0; + } + + r = dm_bitset_cursor_next(&cmd->dirty_cursor); + if (r) { + DMERR("%s: dm_bitset_cursor_next for dirty failed", __func__); + dm_bitset_cursor_end(&cmd->dirty_cursor); + return r; + } + + begin = to_cblock(from_cblock(begin) + 1); + } + + dm_bitset_cursor_end(&cmd->dirty_cursor); + + return 0; +} + +static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd, + dm_cblock_t begin, dm_cblock_t end, + bool *result) +{ + if (separate_dirty_bits(cmd)) + return blocks_are_clean_separate_dirty(cmd, begin, end, result); + else + return blocks_are_clean_combined_dirty(cmd, begin, end, result); +} + static bool cmd_write_lock(struct dm_cache_metadata *cmd) { down_write(&cmd->root_lock); @@ -950,8 +1061,18 @@ int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size) r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks), from_cblock(new_cache_size), &null_mapping, &cmd->root); - if (!r) - cmd->cache_blocks = new_cache_size; + if (r) + goto out; + + if (separate_dirty_bits(cmd)) { + r = dm_bitset_resize(&cmd->dirty_info, cmd->dirty_root, + from_cblock(cmd->cache_blocks), from_cblock(new_cache_size), + false, &cmd->dirty_root); + if (r) + goto out; + } + + cmd->cache_blocks = new_cache_size; cmd->changed = true; out: @@ -995,14 +1116,6 @@ static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b) from_dblock(b), &cmd->discard_root); } -static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b, - bool *is_discarded) -{ - return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root, - from_dblock(b), &cmd->discard_root, - is_discarded); -} - static int __discard(struct dm_cache_metadata *cmd, dm_dblock_t dblock, bool discard) { @@ -1032,22 +1145,38 @@ static int __load_discards(struct dm_cache_metadata *cmd, load_discard_fn fn, void *context) { int r = 0; - dm_block_t b; - bool discard; + uint32_t b; + struct dm_bitset_cursor c; - for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) { - dm_dblock_t dblock = to_dblock(b); + if (from_dblock(cmd->discard_nr_blocks) == 0) + /* nothing to do */ + return 0; - if (cmd->clean_when_opened) { - r = __is_discarded(cmd, dblock, &discard); - if (r) - return r; - } else - discard = false; + if (cmd->clean_when_opened) { + r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root, &cmd->discard_root); + if (r) + return r; - r = fn(context, cmd->discard_block_size, dblock, discard); + r = dm_bitset_cursor_begin(&cmd->discard_info, cmd->discard_root, + from_dblock(cmd->discard_nr_blocks), &c); if (r) - break; + return r; + + for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) { + r = fn(context, cmd->discard_block_size, to_dblock(b), + dm_bitset_cursor_get_value(&c)); + if (r) + break; + } + + dm_bitset_cursor_end(&c); + + } else { + for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) { + r = fn(context, cmd->discard_block_size, to_dblock(b), false); + if (r) + return r; + } } return r; @@ -1177,11 +1306,11 @@ static bool hints_array_available(struct dm_cache_metadata *cmd, hints_array_initialized(cmd); } -static int __load_mapping(struct dm_cache_metadata *cmd, - uint64_t cb, bool hints_valid, - struct dm_array_cursor *mapping_cursor, - struct dm_array_cursor *hint_cursor, - load_mapping_fn fn, void *context) +static int __load_mapping_v1(struct dm_cache_metadata *cmd, + uint64_t cb, bool hints_valid, + struct dm_array_cursor *mapping_cursor, + struct dm_array_cursor *hint_cursor, + load_mapping_fn fn, void *context) { int r = 0; @@ -1206,8 +1335,51 @@ static int __load_mapping(struct dm_cache_metadata *cmd, r = fn(context, oblock, to_cblock(cb), flags & M_DIRTY, le32_to_cpu(hint), hints_valid); - if (r) - DMERR("policy couldn't load cblock"); + if (r) { + DMERR("policy couldn't load cache block %llu", + (unsigned long long) from_cblock(to_cblock(cb))); + } + } + + return r; +} + +static int __load_mapping_v2(struct dm_cache_metadata *cmd, + uint64_t cb, bool hints_valid, + struct dm_array_cursor *mapping_cursor, + struct dm_array_cursor *hint_cursor, + struct dm_bitset_cursor *dirty_cursor, + load_mapping_fn fn, void *context) +{ + int r = 0; + + __le64 mapping; + __le32 hint = 0; + + __le64 *mapping_value_le; + __le32 *hint_value_le; + + dm_oblock_t oblock; + unsigned flags; + bool dirty; + + dm_array_cursor_get_value(mapping_cursor, (void **) &mapping_value_le); + memcpy(&mapping, mapping_value_le, sizeof(mapping)); + unpack_value(mapping, &oblock, &flags); + + if (flags & M_VALID) { + if (hints_valid) { + dm_array_cursor_get_value(hint_cursor, (void **) &hint_value_le); + memcpy(&hint, hint_value_le, sizeof(hint)); + } + + dirty = dm_bitset_cursor_get_value(dirty_cursor); + r = fn(context, oblock, to_cblock(cb), dirty, + le32_to_cpu(hint), hints_valid); + if (r) { + DMERR("policy couldn't load cache block %llu", + (unsigned long long) from_cblock(to_cblock(cb))); + } } return r; @@ -1238,10 +1410,28 @@ static int __load_mappings(struct dm_cache_metadata *cmd, } } + if (separate_dirty_bits(cmd)) { + r = dm_bitset_cursor_begin(&cmd->dirty_info, cmd->dirty_root, + from_cblock(cmd->cache_blocks), + &cmd->dirty_cursor); + if (r) { + dm_array_cursor_end(&cmd->hint_cursor); + dm_array_cursor_end(&cmd->mapping_cursor); + return r; + } + } + for (cb = 0; ; cb++) { - r = __load_mapping(cmd, cb, hints_valid, - &cmd->mapping_cursor, &cmd->hint_cursor, - fn, context); + if (separate_dirty_bits(cmd)) + r = __load_mapping_v2(cmd, cb, hints_valid, + &cmd->mapping_cursor, + &cmd->hint_cursor, + &cmd->dirty_cursor, + fn, context); + else + r = __load_mapping_v1(cmd, cb, hints_valid, + &cmd->mapping_cursor, &cmd->hint_cursor, + fn, context); if (r) goto out; @@ -1264,12 +1454,23 @@ static int __load_mappings(struct dm_cache_metadata *cmd, goto out; } } + + if (separate_dirty_bits(cmd)) { + r = dm_bitset_cursor_next(&cmd->dirty_cursor); + if (r) { + DMERR("dm_bitset_cursor_next for dirty failed"); + goto out; + } + } } out: dm_array_cursor_end(&cmd->mapping_cursor); if (hints_valid) dm_array_cursor_end(&cmd->hint_cursor); + if (separate_dirty_bits(cmd)) + dm_bitset_cursor_end(&cmd->dirty_cursor); + return r; } @@ -1352,13 +1553,55 @@ static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty } -int dm_cache_set_dirty(struct dm_cache_metadata *cmd, - dm_cblock_t cblock, bool dirty) +static int __set_dirty_bits_v1(struct dm_cache_metadata *cmd, unsigned nr_bits, unsigned long *bits) +{ + int r; + unsigned i; + for (i = 0; i < nr_bits; i++) { + r = __dirty(cmd, to_cblock(i), test_bit(i, bits)); + if (r) + return r; + } + + return 0; +} + +static int is_dirty_callback(uint32_t index, bool *value, void *context) +{ + unsigned long *bits = context; + *value = test_bit(index, bits); + return 0; +} + +static int __set_dirty_bits_v2(struct dm_cache_metadata *cmd, unsigned nr_bits, unsigned long *bits) +{ + int r = 0; + + /* nr_bits is really just a sanity check */ + if (nr_bits != from_cblock(cmd->cache_blocks)) { + DMERR("dirty bitset is wrong size"); + return -EINVAL; + } + + r = dm_bitset_del(&cmd->dirty_info, cmd->dirty_root); + if (r) + return r; + + cmd->changed = true; + return dm_bitset_new(&cmd->dirty_info, &cmd->dirty_root, nr_bits, is_dirty_callback, bits); +} + +int dm_cache_set_dirty_bits(struct dm_cache_metadata *cmd, + unsigned nr_bits, + unsigned long *bits) { int r; WRITE_LOCK(cmd); - r = __dirty(cmd, cblock, dirty); + if (separate_dirty_bits(cmd)) + r = __set_dirty_bits_v2(cmd, nr_bits, bits); + else + r = __set_dirty_bits_v1(cmd, nr_bits, bits); WRITE_UNLOCK(cmd); return r; diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 8528744195e5..4f07c08cf107 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -45,18 +45,20 @@ * As these various flags are defined they should be added to the * following masks. */ + #define DM_CACHE_FEATURE_COMPAT_SUPP 0UL #define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL #define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL /* - * Reopens or creates a new, empty metadata volume. - * Returns an ERR_PTR on failure. + * Reopens or creates a new, empty metadata volume. Returns an ERR_PTR on + * failure. If reopening then features must match. */ struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, sector_t data_block_size, bool may_format_device, - size_t policy_hint_size); + size_t policy_hint_size, + unsigned metadata_version); void dm_cache_metadata_close(struct dm_cache_metadata *cmd); @@ -91,7 +93,8 @@ int dm_cache_load_mappings(struct dm_cache_metadata *cmd, load_mapping_fn fn, void *context); -int dm_cache_set_dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty); +int dm_cache_set_dirty_bits(struct dm_cache_metadata *cmd, + unsigned nr_bits, unsigned long *bits); struct dm_cache_statistics { uint32_t read_hits; diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index e04c61e0839e..9c689b34e6e7 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -179,6 +179,7 @@ enum cache_io_mode { struct cache_features { enum cache_metadata_mode mode; enum cache_io_mode io_mode; + unsigned metadata_version; }; struct cache_stats { @@ -248,7 +249,7 @@ struct cache { /* * Fields for converting from sectors to blocks. */ - uint32_t sectors_per_block; + sector_t sectors_per_block; int sectors_per_block_shift; spinlock_t lock; @@ -787,8 +788,7 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); spin_lock_irqsave(&cache->lock, flags); - if (cache->need_tick_bio && - !(bio->bi_opf & (REQ_FUA | REQ_PREFLUSH)) && + if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && bio_op(bio) != REQ_OP_DISCARD) { pb->tick = true; cache->need_tick_bio = false; @@ -828,11 +828,6 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) return to_oblock(block_nr); } -static int bio_triggers_commit(struct cache *cache, struct bio *bio) -{ - return bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); -} - /* * You must increment the deferred set whilst the prison cell is held. To * encourage this, we ask for 'cell' to be passed in. @@ -884,7 +879,7 @@ static void issue(struct cache *cache, struct bio *bio) { unsigned long flags; - if (!bio_triggers_commit(cache, bio)) { + if (!op_is_flush(bio->bi_opf)) { accounted_request(cache, bio); return; } @@ -1069,8 +1064,7 @@ static void dec_io_migrations(struct cache *cache) static bool discard_or_flush(struct bio *bio) { - return bio_op(bio) == REQ_OP_DISCARD || - bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); + return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); } static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) @@ -2291,7 +2285,7 @@ static void do_waker(struct work_struct *ws) static int is_congested(struct dm_dev *dev, int bdi_bits) { struct request_queue *q = bdev_get_queue(dev->bdev); - return bdi_congested(&q->backing_dev_info, bdi_bits); + return bdi_congested(q->backing_dev_info, bdi_bits); } static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) @@ -2541,13 +2535,14 @@ static void init_features(struct cache_features *cf) { cf->mode = CM_WRITE; cf->io_mode = CM_IO_WRITEBACK; + cf->metadata_version = 1; } static int parse_features(struct cache_args *ca, struct dm_arg_set *as, char **error) { static struct dm_arg _args[] = { - {0, 1, "Invalid number of cache feature arguments"}, + {0, 2, "Invalid number of cache feature arguments"}, }; int r; @@ -2573,6 +2568,9 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as, else if (!strcasecmp(arg, "passthrough")) cf->io_mode = CM_IO_PASSTHROUGH; + else if (!strcasecmp(arg, "metadata2")) + cf->metadata_version = 2; + else { *error = "Unrecognised cache feature requested"; return -EINVAL; @@ -2827,7 +2825,8 @@ static int cache_create(struct cache_args *ca, struct cache **result) cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, ca->block_size, may_format, - dm_cache_policy_get_hint_size(cache->policy)); + dm_cache_policy_get_hint_size(cache->policy), + ca->features.metadata_version); if (IS_ERR(cmd)) { *error = "Error creating metadata object"; r = PTR_ERR(cmd); @@ -3172,21 +3171,16 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) static int write_dirty_bitset(struct cache *cache) { - unsigned i, r; + int r; if (get_cache_mode(cache) >= CM_READ_ONLY) return -EINVAL; - for (i = 0; i < from_cblock(cache->cache_size); i++) { - r = dm_cache_set_dirty(cache->cmd, to_cblock(i), - is_dirty(cache, to_cblock(i))); - if (r) { - metadata_operation_failed(cache, "dm_cache_set_dirty", r); - return r; - } - } + r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset); + if (r) + metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r); - return 0; + return r; } static int write_discard_bitset(struct cache *cache) @@ -3547,11 +3541,11 @@ static void cache_status(struct dm_target *ti, status_type_t type, residency = policy_residency(cache->policy); - DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ", + DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ", (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), (unsigned long long)nr_blocks_metadata, - cache->sectors_per_block, + (unsigned long long)cache->sectors_per_block, (unsigned long long) from_cblock(residency), (unsigned long long) from_cblock(cache->cache_size), (unsigned) atomic_read(&cache->stats.read_hit), @@ -3562,14 +3556,19 @@ static void cache_status(struct dm_target *ti, status_type_t type, (unsigned) atomic_read(&cache->stats.promotion), (unsigned long) atomic_read(&cache->nr_dirty)); + if (cache->features.metadata_version == 2) + DMEMIT("2 metadata2 "); + else + DMEMIT("1 "); + if (writethrough_mode(&cache->features)) - DMEMIT("1 writethrough "); + DMEMIT("writethrough "); else if (passthrough_mode(&cache->features)) - DMEMIT("1 passthrough "); + DMEMIT("passthrough "); else if (writeback_mode(&cache->features)) - DMEMIT("1 writeback "); + DMEMIT("writeback "); else { DMERR("%s: internal error: unknown io mode: %d", @@ -3817,7 +3816,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 9, 0}, + .version = {1, 10, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 40ceba1fe8be..136fda3ff9e5 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -92,7 +92,6 @@ struct mapped_device { * io objects are allocated from here. */ mempool_t *io_pool; - mempool_t *rq_pool; struct bio_set *bs; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 8a9f742d8ed7..389a3637ffcc 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1210,14 +1210,14 @@ continue_locked: spin_unlock_irq(&cc->write_thread_wait.lock); if (unlikely(kthread_should_stop())) { - set_task_state(current, TASK_RUNNING); + set_current_state(TASK_RUNNING); remove_wait_queue(&cc->write_thread_wait, &wait); break; } schedule(); - set_task_state(current, TASK_RUNNING); + set_current_state(TASK_RUNNING); spin_lock_irq(&cc->write_thread_wait.lock); __remove_wait_queue(&cc->write_thread_wait, &wait); goto continue_locked; @@ -1536,7 +1536,7 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string down_read(&key->sem); - ukp = user_key_payload(key); + ukp = user_key_payload_locked(key); if (!ukp) { up_read(&key->sem); key_put(key); diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index bf2b2676cb8a..9fab33b113c4 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1379,7 +1379,7 @@ static void stop_worker(struct era *era) static int dev_is_congested(struct dm_dev *dev, int bdi_bits) { struct request_queue *q = bdev_get_queue(dev->bdev); - return bdi_congested(&q->backing_dev_info, bdi_bits); + return bdi_congested(q->backing_dev_info, bdi_bits); } static int era_is_congested(struct dm_target_callbacks *cb, int bdi_bits) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index a5a9b17f0f7f..4da6fc6b1ffd 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/miscdevice.h> +#include <linux/sched/mm.h> #include <linux/init.h> #include <linux/wait.h> #include <linux/slab.h> diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 3570bcb7a4a4..7f223dbed49f 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -92,12 +92,6 @@ struct multipath { unsigned queue_mode; - /* - * We must use a mempool of dm_mpath_io structs so that we - * can resubmit bios on error. - */ - mempool_t *mpio_pool; - struct mutex work_mutex; struct work_struct trigger_event; @@ -115,8 +109,6 @@ struct dm_mpath_io { typedef int (*action_fn) (struct pgpath *pgpath); -static struct kmem_cache *_mpio_cache; - static struct workqueue_struct *kmultipathd, *kmpath_handlerd; static void trigger_event(struct work_struct *work); static void activate_path(struct work_struct *work); @@ -209,7 +201,6 @@ static struct multipath *alloc_multipath(struct dm_target *ti) init_waitqueue_head(&m->pg_init_wait); mutex_init(&m->work_mutex); - m->mpio_pool = NULL; m->queue_mode = DM_TYPE_NONE; m->ti = ti; @@ -229,16 +220,7 @@ static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) m->queue_mode = DM_TYPE_MQ_REQUEST_BASED; else m->queue_mode = DM_TYPE_REQUEST_BASED; - } - - if (m->queue_mode == DM_TYPE_REQUEST_BASED) { - unsigned min_ios = dm_get_reserved_rq_based_ios(); - - m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache); - if (!m->mpio_pool) - return -ENOMEM; - } - else if (m->queue_mode == DM_TYPE_BIO_BASED) { + } else if (m->queue_mode == DM_TYPE_BIO_BASED) { INIT_WORK(&m->process_queued_bios, process_queued_bios); /* * bio-based doesn't support any direct scsi_dh management; @@ -263,7 +245,6 @@ static void free_multipath(struct multipath *m) kfree(m->hw_handler_name); kfree(m->hw_handler_params); - mempool_destroy(m->mpio_pool); kfree(m); } @@ -272,38 +253,6 @@ static struct dm_mpath_io *get_mpio(union map_info *info) return info->ptr; } -static struct dm_mpath_io *set_mpio(struct multipath *m, union map_info *info) -{ - struct dm_mpath_io *mpio; - - if (!m->mpio_pool) { - /* Use blk-mq pdu memory requested via per_io_data_size */ - mpio = get_mpio(info); - memset(mpio, 0, sizeof(*mpio)); - return mpio; - } - - mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); - if (!mpio) - return NULL; - - memset(mpio, 0, sizeof(*mpio)); - info->ptr = mpio; - - return mpio; -} - -static void clear_request_fn_mpio(struct multipath *m, union map_info *info) -{ - /* Only needed for non blk-mq (.request_fn) multipath */ - if (m->mpio_pool) { - struct dm_mpath_io *mpio = info->ptr; - - info->ptr = NULL; - mempool_free(mpio, m->mpio_pool); - } -} - static size_t multipath_per_bio_data_size(void) { return sizeof(struct dm_mpath_io) + sizeof(struct dm_bio_details); @@ -530,16 +479,17 @@ static bool must_push_back_bio(struct multipath *m) /* * Map cloned requests (request-based multipath) */ -static int __multipath_map(struct dm_target *ti, struct request *clone, - union map_info *map_context, - struct request *rq, struct request **__clone) +static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, + union map_info *map_context, + struct request **__clone) { struct multipath *m = ti->private; int r = DM_MAPIO_REQUEUE; - size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq); + size_t nr_bytes = blk_rq_bytes(rq); struct pgpath *pgpath; struct block_device *bdev; - struct dm_mpath_io *mpio; + struct dm_mpath_io *mpio = get_mpio(map_context); + struct request *clone; /* Do we need to select a new pgpath? */ pgpath = lockless_dereference(m->current_pgpath); @@ -556,42 +506,23 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, return r; } - mpio = set_mpio(m, map_context); - if (!mpio) - /* ENOMEM, requeue */ - return r; - + memset(mpio, 0, sizeof(*mpio)); mpio->pgpath = pgpath; mpio->nr_bytes = nr_bytes; bdev = pgpath->path.dev->bdev; - if (clone) { - /* - * Old request-based interface: allocated clone is passed in. - * Used by: .request_fn stacked on .request_fn path(s). - */ - clone->q = bdev_get_queue(bdev); - clone->rq_disk = bdev->bd_disk; - clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; - } else { - /* - * blk-mq request-based interface; used by both: - * .request_fn stacked on blk-mq path(s) and - * blk-mq stacked on blk-mq path(s). - */ - clone = blk_mq_alloc_request(bdev_get_queue(bdev), - rq_data_dir(rq), BLK_MQ_REQ_NOWAIT); - if (IS_ERR(clone)) { - /* EBUSY, ENODEV or EWOULDBLOCK: requeue */ - clear_request_fn_mpio(m, map_context); - return r; - } - clone->bio = clone->biotail = NULL; - clone->rq_disk = bdev->bd_disk; - clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; - *__clone = clone; + clone = blk_get_request(bdev_get_queue(bdev), + rq->cmd_flags | REQ_NOMERGE, + GFP_ATOMIC); + if (IS_ERR(clone)) { + /* EBUSY, ENODEV or EWOULDBLOCK: requeue */ + return r; } + clone->bio = clone->biotail = NULL; + clone->rq_disk = bdev->bd_disk; + clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; + *__clone = clone; if (pgpath->pg->ps.type->start_io) pgpath->pg->ps.type->start_io(&pgpath->pg->ps, @@ -600,22 +531,9 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, return DM_MAPIO_REMAPPED; } -static int multipath_map(struct dm_target *ti, struct request *clone, - union map_info *map_context) -{ - return __multipath_map(ti, clone, map_context, NULL, NULL); -} - -static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, - union map_info *map_context, - struct request **clone) -{ - return __multipath_map(ti, NULL, map_context, rq, clone); -} - static void multipath_release_clone(struct request *clone) { - blk_mq_free_request(clone); + blk_put_request(clone); } /* @@ -1187,7 +1105,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->num_write_same_bios = 1; if (m->queue_mode == DM_TYPE_BIO_BASED) ti->per_io_data_size = multipath_per_bio_data_size(); - else if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED) + else ti->per_io_data_size = sizeof(struct dm_mpath_io); return 0; @@ -1610,7 +1528,6 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, if (ps->type->end_io) ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); } - clear_request_fn_mpio(m, map_context); return r; } @@ -2060,7 +1977,6 @@ static struct target_type multipath_target = { .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, - .map_rq = multipath_map, .clone_and_map_rq = multipath_clone_and_map, .release_clone_rq = multipath_release_clone, .rq_end_io = multipath_end_io, @@ -2080,11 +1996,6 @@ static int __init dm_multipath_init(void) { int r; - /* allocate a slab for the dm_mpath_ios */ - _mpio_cache = KMEM_CACHE(dm_mpath_io, 0); - if (!_mpio_cache) - return -ENOMEM; - r = dm_register_target(&multipath_target); if (r < 0) { DMERR("request-based register failed %d", r); @@ -2120,8 +2031,6 @@ bad_alloc_kmpath_handlerd: bad_alloc_kmultipathd: dm_unregister_target(&multipath_target); bad_register_target: - kmem_cache_destroy(_mpio_cache); - return r; } @@ -2131,7 +2040,6 @@ static void __exit dm_multipath_exit(void) destroy_workqueue(kmultipathd); dm_unregister_target(&multipath_target); - kmem_cache_destroy(_mpio_cache); } module_init(dm_multipath_init); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index b8f978e551d7..f8564d63982f 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -24,6 +24,11 @@ */ #define MIN_FREE_RESHAPE_SPACE to_sector(4*4096) +/* + * Minimum journal space 4 MiB in sectors. + */ +#define MIN_RAID456_JOURNAL_SPACE (4*2048) + static bool devices_handle_discard_safely = false; /* @@ -73,6 +78,9 @@ struct raid_dev { #define __CTR_FLAG_DATA_OFFSET 13 /* 2 */ /* Only with reshapable raid4/5/6/10! */ #define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */ +/* New for v1.10.0 */ +#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6! */ + /* * Flags for rs->ctr_flags field. */ @@ -91,6 +99,9 @@ struct raid_dev { #define CTR_FLAG_DELTA_DISKS (1 << __CTR_FLAG_DELTA_DISKS) #define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET) #define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS) +#define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV) + +#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET) /* * Definitions of various constructor flags to @@ -163,7 +174,8 @@ struct raid_dev { CTR_FLAG_STRIPE_CACHE | \ CTR_FLAG_REGION_SIZE | \ CTR_FLAG_DELTA_DISKS | \ - CTR_FLAG_DATA_OFFSET) + CTR_FLAG_DATA_OFFSET | \ + CTR_FLAG_JOURNAL_DEV) #define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \ CTR_FLAG_REBUILD | \ @@ -173,7 +185,8 @@ struct raid_dev { CTR_FLAG_STRIPE_CACHE | \ CTR_FLAG_REGION_SIZE | \ CTR_FLAG_DELTA_DISKS | \ - CTR_FLAG_DATA_OFFSET) + CTR_FLAG_DATA_OFFSET | \ + CTR_FLAG_JOURNAL_DEV) /* ...valid options definitions per raid level */ /* @@ -222,6 +235,12 @@ struct raid_set { struct raid_type *raid_type; struct dm_target_callbacks callbacks; + /* Optional raid4/5/6 journal device */ + struct journal_dev { + struct dm_dev *dev; + struct md_rdev rdev; + } journal_dev; + struct raid_dev dev[0]; }; @@ -306,6 +325,7 @@ static struct arg_name_flag { { CTR_FLAG_DATA_OFFSET, "data_offset"}, { CTR_FLAG_DELTA_DISKS, "delta_disks"}, { CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"}, + { CTR_FLAG_JOURNAL_DEV, "journal_dev" }, }; /* Return argument name string for given @flag */ @@ -370,7 +390,7 @@ static bool rs_is_reshapable(struct raid_set *rs) /* Return true, if raid set in @rs is recovering */ static bool rs_is_recovering(struct raid_set *rs) { - return rs->md.recovery_cp < rs->dev[0].rdev.sectors; + return rs->md.recovery_cp < rs->md.dev_sectors; } /* Return true, if raid set in @rs is reshaping */ @@ -627,7 +647,8 @@ static void rs_set_capacity(struct raid_set *rs) * is unintended in case of out-of-place reshaping */ rdev_for_each(rdev, mddev) - rdev->sectors = mddev->dev_sectors; + if (!test_bit(Journal, &rdev->flags)) + rdev->sectors = mddev->dev_sectors; set_capacity(gendisk, mddev->array_sectors); revalidate_disk(gendisk); @@ -713,6 +734,11 @@ static void raid_set_free(struct raid_set *rs) { int i; + if (rs->journal_dev.dev) { + md_rdev_clear(&rs->journal_dev.rdev); + dm_put_device(rs->ti, rs->journal_dev.dev); + } + for (i = 0; i < rs->raid_disks; i++) { if (rs->dev[i].meta_dev) dm_put_device(rs->ti, rs->dev[i].meta_dev); @@ -760,10 +786,11 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as) rs->dev[i].data_dev = NULL; /* - * There are no offsets, since there is a separate device - * for data and metadata. + * There are no offsets initially. + * Out of place reshape will set them accordingly. */ rs->dev[i].rdev.data_offset = 0; + rs->dev[i].rdev.new_data_offset = 0; rs->dev[i].rdev.mddev = &rs->md; arg = dm_shift_arg(as); @@ -821,6 +848,9 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as) rebuild++; } + if (rs->journal_dev.dev) + list_add_tail(&rs->journal_dev.rdev.same_set, &rs->md.disks); + if (metadata_available) { rs->md.external = 0; rs->md.persistent = 1; @@ -1026,6 +1056,8 @@ too_many: * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) * [stripe_cache <sectors>] Stripe cache size for higher RAIDs * [region_size <sectors>] Defines granularity of bitmap + * [journal_dev <dev>] raid4/5/6 journaling deviice + * (i.e. write hole closing log) * * RAID10-only options: * [raid10_copies <# copies>] Number of copies. (Default: 2) @@ -1133,7 +1165,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, /* * Parameters that take a string value are checked here. */ - + /* "raid10_format {near|offset|far} */ if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT))) { if (test_and_set_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) { rs->ti->error = "Only one 'raid10_format' argument pair allowed"; @@ -1151,6 +1183,41 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, continue; } + /* "journal_dev dev" */ + if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) { + int r; + struct md_rdev *jdev; + + if (test_and_set_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) { + rs->ti->error = "Only one raid4/5/6 set journaling device allowed"; + return -EINVAL; + } + if (!rt_is_raid456(rt)) { + rs->ti->error = "'journal_dev' is an invalid parameter for this RAID type"; + return -EINVAL; + } + r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table), + &rs->journal_dev.dev); + if (r) { + rs->ti->error = "raid4/5/6 journal device lookup failure"; + return r; + } + jdev = &rs->journal_dev.rdev; + md_rdev_init(jdev); + jdev->mddev = &rs->md; + jdev->bdev = rs->journal_dev.dev->bdev; + jdev->sectors = to_sector(i_size_read(jdev->bdev->bd_inode)); + if (jdev->sectors < MIN_RAID456_JOURNAL_SPACE) { + rs->ti->error = "No space for raid4/5/6 journal"; + return -ENOSPC; + } + set_bit(Journal, &jdev->flags); + continue; + } + + /* + * Parameters with number values from here on. + */ if (kstrtoint(arg, 10, &value) < 0) { rs->ti->error = "Bad numerical argument given in raid params"; return -EINVAL; @@ -1425,6 +1492,25 @@ static unsigned int rs_data_stripes(struct raid_set *rs) return rs->raid_disks - rs->raid_type->parity_devs; } +/* + * Retrieve rdev->sectors from any valid raid device of @rs + * to allow userpace to pass in arbitray "- -" device tupples. + */ +static sector_t __rdev_sectors(struct raid_set *rs) +{ + int i; + + for (i = 0; i < rs->md.raid_disks; i++) { + struct md_rdev *rdev = &rs->dev[i].rdev; + + if (!test_bit(Journal, &rdev->flags) && + rdev->bdev && rdev->sectors) + return rdev->sectors; + } + + BUG(); /* Constructor ensures we got some. */ +} + /* Calculate the sectors per device and per array used for @rs */ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) { @@ -1468,7 +1554,8 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) array_sectors = (data_stripes + delta_disks) * dev_sectors; rdev_for_each(rdev, mddev) - rdev->sectors = dev_sectors; + if (!test_bit(Journal, &rdev->flags)) + rdev->sectors = dev_sectors; mddev->array_sectors = array_sectors; mddev->dev_sectors = dev_sectors; @@ -1510,9 +1597,9 @@ static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) else if (dev_sectors == MaxSector) /* Prevent recovery */ __rs_setup_recovery(rs, MaxSector); - else if (rs->dev[0].rdev.sectors < dev_sectors) + else if (__rdev_sectors(rs) < dev_sectors) /* Grown raid set */ - __rs_setup_recovery(rs, rs->dev[0].rdev.sectors); + __rs_setup_recovery(rs, __rdev_sectors(rs)); else __rs_setup_recovery(rs, MaxSector); } @@ -1851,18 +1938,21 @@ static int rs_check_reshape(struct raid_set *rs) return -EPERM; } -static int read_disk_sb(struct md_rdev *rdev, int size) +static int read_disk_sb(struct md_rdev *rdev, int size, bool force_reload) { BUG_ON(!rdev->sb_page); - if (rdev->sb_loaded) + if (rdev->sb_loaded && !force_reload) return 0; + rdev->sb_loaded = 0; + if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) { DMERR("Failed to read superblock of device at position %d", rdev->raid_disk); md_error(rdev->mddev, rdev); - return -EINVAL; + set_bit(Faulty, &rdev->flags); + return -EIO; } rdev->sb_loaded = 1; @@ -1990,7 +2080,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) return -EINVAL; } - r = read_disk_sb(rdev, rdev->sb_size); + r = read_disk_sb(rdev, rdev->sb_size, false); if (r) return r; @@ -2146,6 +2236,9 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) */ d = 0; rdev_for_each(r, mddev) { + if (test_bit(Journal, &rdev->flags)) + continue; + if (test_bit(FirstUse, &r->flags)) new_devs++; @@ -2201,7 +2294,8 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) */ sb_retrieve_failed_devices(sb, failed_devices); rdev_for_each(r, mddev) { - if (!r->sb_page) + if (test_bit(Journal, &rdev->flags) || + !r->sb_page) continue; sb2 = page_address(r->sb_page); sb2->failed_devices = 0; @@ -2253,7 +2347,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev) struct mddev *mddev = &rs->md; struct dm_raid_superblock *sb; - if (rs_is_raid0(rs) || !rdev->sb_page) + if (rs_is_raid0(rs) || !rdev->sb_page || rdev->raid_disk < 0) return 0; sb = page_address(rdev->sb_page); @@ -2278,7 +2372,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev) /* Enable bitmap creation for RAID levels != 0 */ mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096); - rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; + mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; if (!test_and_clear_bit(FirstUse, &rdev->flags)) { /* Retrieve device size stored in superblock to be prepared for shrink */ @@ -2316,21 +2410,22 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev) static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) { int r; - struct raid_dev *dev; - struct md_rdev *rdev, *tmp, *freshest; + struct md_rdev *rdev, *freshest; struct mddev *mddev = &rs->md; freshest = NULL; - rdev_for_each_safe(rdev, tmp, mddev) { + rdev_for_each(rdev, mddev) { + if (test_bit(Journal, &rdev->flags)) + continue; + /* * Skipping super_load due to CTR_FLAG_SYNC will cause * the array to undergo initialization again as * though it were new. This is the intended effect * of the "sync" directive. * - * When reshaping capability is added, we must ensure - * that the "sync" directive is disallowed during the - * reshape. + * With reshaping capability added, we must ensure that + * that the "sync" directive is disallowed during the reshape. */ if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) continue; @@ -2347,6 +2442,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) case 0: break; default: + /* This is a failure to read the superblock from the metadata device. */ /* * We have to keep any raid0 data/metadata device pairs or * the MD raid0 personality will fail to start the array. @@ -2354,33 +2450,16 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) if (rs_is_raid0(rs)) continue; - dev = container_of(rdev, struct raid_dev, rdev); - if (dev->meta_dev) - dm_put_device(ti, dev->meta_dev); - - dev->meta_dev = NULL; - rdev->meta_bdev = NULL; - - if (rdev->sb_page) - put_page(rdev->sb_page); - - rdev->sb_page = NULL; - - rdev->sb_loaded = 0; - /* - * We might be able to salvage the data device - * even though the meta device has failed. For - * now, we behave as though '- -' had been - * set for this device in the table. + * We keep the dm_devs to be able to emit the device tuple + * properly on the table line in raid_status() (rather than + * mistakenly acting as if '- -' got passed into the constructor). + * + * The rdev has to stay on the same_set list to allow for + * the attempt to restore faulty devices on second resume. */ - if (dev->data_dev) - dm_put_device(ti, dev->data_dev); - - dev->data_dev = NULL; - rdev->bdev = NULL; - - list_del(&rdev->same_set); + rdev->raid_disk = rdev->saved_raid_disk = -1; + break; } } @@ -2401,7 +2480,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) return -EINVAL; rdev_for_each(rdev, mddev) - if ((rdev != freshest) && super_validate(rs, rdev)) + if (!test_bit(Journal, &rdev->flags) && + rdev != freshest && + super_validate(rs, rdev)) return -EINVAL; return 0; } @@ -2488,10 +2569,12 @@ static int rs_adjust_data_offsets(struct raid_set *rs) return -ENOSPC; } out: - /* Adjust data offsets on all rdevs */ + /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */ rdev_for_each(rdev, &rs->md) { - rdev->data_offset = data_offset; - rdev->new_data_offset = new_data_offset; + if (!test_bit(Journal, &rdev->flags)) { + rdev->data_offset = data_offset; + rdev->new_data_offset = new_data_offset; + } } return 0; @@ -2504,8 +2587,10 @@ static void __reorder_raid_disk_indexes(struct raid_set *rs) struct md_rdev *rdev; rdev_for_each(rdev, &rs->md) { - rdev->raid_disk = i++; - rdev->saved_raid_disk = rdev->new_raid_disk = -1; + if (!test_bit(Journal, &rdev->flags)) { + rdev->raid_disk = i++; + rdev->saved_raid_disk = rdev->new_raid_disk = -1; + } } } @@ -2845,7 +2930,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) goto bad; - calculated_dev_sectors = rs->dev[0].rdev.sectors; + calculated_dev_sectors = rs->md.dev_sectors; /* * Backup any new raid set level, layout, ... @@ -2858,7 +2943,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) goto bad; - resize = calculated_dev_sectors != rs->dev[0].rdev.sectors; + resize = calculated_dev_sectors != __rdev_sectors(rs); INIT_WORK(&rs->md.event_work, do_table_event); ti->private = rs; @@ -2902,6 +2987,13 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } + /* We can't takeover a journaled raid4/5/6 */ + if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) { + ti->error = "Can't takeover a journaled raid4/5/6 set"; + r = -EPERM; + goto bad; + } + /* * If a takeover is needed, userspace sets any additional * devices to rebuild and we can check for a valid request here. @@ -2924,6 +3016,18 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) rs_set_new(rs); } else if (rs_reshape_requested(rs)) { /* + * No need to check for 'ongoing' takeover here, because takeover + * is an instant operation as oposed to an ongoing reshape. + */ + + /* We can't reshape a journaled raid4/5/6 */ + if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) { + ti->error = "Can't reshape a journaled raid4/5/6 set"; + r = -EPERM; + goto bad; + } + + /* * We can only prepare for a reshape here, because the * raid set needs to run to provide the repective reshape * check functions via its MD personality instance. @@ -3071,18 +3175,23 @@ static const char *decipher_sync_action(struct mddev *mddev) } /* - * Return status string @rdev + * Return status string for @rdev * * Status characters: * - * 'D' = Dead/Failed device + * 'D' = Dead/Failed raid set component or raid4/5/6 journal device * 'a' = Alive but not in-sync - * 'A' = Alive and in-sync + * 'A' = Alive and in-sync raid set component or alive raid4/5/6 journal device + * '-' = Non-existing device (i.e. uspace passed '- -' into the ctr) */ static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync) { - if (test_bit(Faulty, &rdev->flags)) + if (!rdev->bdev) + return "-"; + else if (test_bit(Faulty, &rdev->flags)) return "D"; + else if (test_bit(Journal, &rdev->flags)) + return "A"; else if (!array_in_sync || !test_bit(In_sync, &rdev->flags)) return "a"; else @@ -3151,7 +3260,8 @@ static sector_t rs_get_progress(struct raid_set *rs, * being initialized. */ rdev_for_each(rdev, mddev) - if (!test_bit(In_sync, &rdev->flags)) + if (!test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags)) *array_in_sync = true; #if 0 r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */ @@ -3183,7 +3293,6 @@ static void raid_status(struct dm_target *ti, status_type_t type, sector_t progress, resync_max_sectors, resync_mismatches; const char *sync_action; struct raid_type *rt; - struct md_rdev *rdev; switch (type) { case STATUSTYPE_INFO: @@ -3204,9 +3313,9 @@ static void raid_status(struct dm_target *ti, status_type_t type, atomic64_read(&mddev->resync_mismatches) : 0; sync_action = decipher_sync_action(&rs->md); - /* HM FIXME: do we want another state char for raid0? It shows 'D' or 'A' now */ - rdev_for_each(rdev, mddev) - DMEMIT(__raid_dev_status(rdev, array_in_sync)); + /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */ + for (i = 0; i < rs->raid_disks; i++) + DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync)); /* * In-sync/Reshape ratio: @@ -3252,6 +3361,12 @@ static void raid_status(struct dm_target *ti, status_type_t type, * so retrieving it from the first raid disk is sufficient. */ DMEMIT(" %llu", (unsigned long long) rs->dev[0].rdev.data_offset); + + /* + * v1.10.0+: + */ + DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? + __raid_dev_status(&rs->journal_dev.rdev, 0) : "-"); break; case STATUSTYPE_TABLE: @@ -3265,7 +3380,8 @@ static void raid_status(struct dm_target *ti, status_type_t type, raid_param_cnt += rebuild_disks * 2 + write_mostly_params + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) + - hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2; + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 + + (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0); /* Emit table line */ DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors); if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) @@ -3312,6 +3428,9 @@ static void raid_status(struct dm_target *ti, status_type_t type, if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE), mddev->sync_speed_min); + if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) + DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV), + __get_dev_name(rs->journal_dev.dev)); DMEMIT(" %d", rs->raid_disks); for (i = 0; i < rs->raid_disks; i++) DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev), @@ -3345,12 +3464,15 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv) else if (!strcasecmp(argv[0], "recover")) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); else { - if (!strcasecmp(argv[0], "check")) + if (!strcasecmp(argv[0], "check")) { set_bit(MD_RECOVERY_CHECK, &mddev->recovery); - else if (!!strcasecmp(argv[0], "repair")) + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + } else if (!strcasecmp(argv[0], "repair")) { + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + } else return -EINVAL; - set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); } if (mddev->ro == 2) { /* A write to sync_action is enough to justify @@ -3427,11 +3549,14 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs) memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices)); - for (i = 0; i < rs->md.raid_disks; i++) { + for (i = 0; i < mddev->raid_disks; i++) { r = &rs->dev[i].rdev; - if (test_bit(Faulty, &r->flags) && r->sb_page && - sync_page_io(r, 0, r->sb_size, r->sb_page, - REQ_OP_READ, 0, true)) { + /* HM FIXME: enhance journal device recovery processing */ + if (test_bit(Journal, &r->flags)) + continue; + + if (test_bit(Faulty, &r->flags) && + r->meta_bdev && !read_disk_sb(r, r->sb_size, true)) { DMINFO("Faulty %s device #%d has readable super block." " Attempting to revive it.", rs->raid_type->name, i); @@ -3445,22 +3570,26 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs) * '>= 0' - meaning we must call this function * ourselves. */ - if ((r->raid_disk >= 0) && - (mddev->pers->hot_remove_disk(mddev, r) != 0)) - /* Failed to revive this device, try next */ - continue; - - r->raid_disk = i; - r->saved_raid_disk = i; flags = r->flags; + clear_bit(In_sync, &r->flags); /* Mandatory for hot remove. */ + if (r->raid_disk >= 0) { + if (mddev->pers->hot_remove_disk(mddev, r)) { + /* Failed to revive this device, try next */ + r->flags = flags; + continue; + } + } else + r->raid_disk = r->saved_raid_disk = i; + clear_bit(Faulty, &r->flags); clear_bit(WriteErrorSeen, &r->flags); - clear_bit(In_sync, &r->flags); + if (mddev->pers->hot_add_disk(mddev, r)) { - r->raid_disk = -1; - r->saved_raid_disk = -1; + /* Failed to revive this device, try next */ + r->raid_disk = r->saved_raid_disk = -1; r->flags = flags; } else { + clear_bit(In_sync, &r->flags); r->recovery_offset = 0; set_bit(i, (void *) cleared_failed_devices); cleared = true; @@ -3473,6 +3602,9 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs) uint64_t failed_devices[DISKS_ARRAY_ELEMS]; rdev_for_each(r, &rs->md) { + if (test_bit(Journal, &r->flags)) + continue; + sb = page_address(r->sb_page); sb_retrieve_failed_devices(sb, failed_devices); @@ -3643,7 +3775,15 @@ static void raid_resume(struct dm_target *ti) mddev->ro = 0; mddev->in_sync = 0; - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + /* + * Keep the RAID set frozen if reshape/rebuild flags are set. + * The RAID set is unfrozen once the next table load/resume, + * which clears the reshape/rebuild flags, occurs. + * This ensures that the constructor for the inactive table + * retrieves an up-to-date reshape_position. + */ + if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (mddev->suspended) mddev_resume(mddev); @@ -3651,7 +3791,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 9, 1}, + .version = {1, 10, 1}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index 6c25213ab38c..bdbb7e6e8212 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c @@ -17,8 +17,8 @@ #include <linux/module.h> #define DM_MSG_PREFIX "multipath round-robin" -#define RR_MIN_IO 1000 -#define RR_VERSION "1.1.0" +#define RR_MIN_IO 1 +#define RR_VERSION "1.2.0" /*----------------------------------------------------------------- * Path-handling code, paths are held in lists @@ -47,44 +47,19 @@ struct selector { struct list_head valid_paths; struct list_head invalid_paths; spinlock_t lock; - struct dm_path * __percpu *current_path; - struct percpu_counter repeat_count; }; -static void set_percpu_current_path(struct selector *s, struct dm_path *path) -{ - int cpu; - - for_each_possible_cpu(cpu) - *per_cpu_ptr(s->current_path, cpu) = path; -} - static struct selector *alloc_selector(void) { struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return NULL; - - INIT_LIST_HEAD(&s->valid_paths); - INIT_LIST_HEAD(&s->invalid_paths); - spin_lock_init(&s->lock); - - s->current_path = alloc_percpu(struct dm_path *); - if (!s->current_path) - goto out_current_path; - set_percpu_current_path(s, NULL); - - if (percpu_counter_init(&s->repeat_count, 0, GFP_KERNEL)) - goto out_repeat_count; + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->invalid_paths); + spin_lock_init(&s->lock); + } return s; - -out_repeat_count: - free_percpu(s->current_path); -out_current_path: - kfree(s); - return NULL;; } static int rr_create(struct path_selector *ps, unsigned argc, char **argv) @@ -105,8 +80,6 @@ static void rr_destroy(struct path_selector *ps) free_paths(&s->valid_paths); free_paths(&s->invalid_paths); - free_percpu(s->current_path); - percpu_counter_destroy(&s->repeat_count); kfree(s); ps->context = NULL; } @@ -157,6 +130,11 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path, return -EINVAL; } + if (repeat_count > 1) { + DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead"); + repeat_count = 1; + } + /* allocate the path */ pi = kmalloc(sizeof(*pi), GFP_KERNEL); if (!pi) { @@ -183,9 +161,6 @@ static void rr_fail_path(struct path_selector *ps, struct dm_path *p) struct path_info *pi = p->pscontext; spin_lock_irqsave(&s->lock, flags); - if (p == *this_cpu_ptr(s->current_path)) - set_percpu_current_path(s, NULL); - list_move(&pi->list, &s->invalid_paths); spin_unlock_irqrestore(&s->lock, flags); } @@ -208,29 +183,15 @@ static struct dm_path *rr_select_path(struct path_selector *ps, size_t nr_bytes) unsigned long flags; struct selector *s = ps->context; struct path_info *pi = NULL; - struct dm_path *current_path = NULL; - - local_irq_save(flags); - current_path = *this_cpu_ptr(s->current_path); - if (current_path) { - percpu_counter_dec(&s->repeat_count); - if (percpu_counter_read_positive(&s->repeat_count) > 0) { - local_irq_restore(flags); - return current_path; - } - } - spin_lock(&s->lock); + spin_lock_irqsave(&s->lock, flags); if (!list_empty(&s->valid_paths)) { pi = list_entry(s->valid_paths.next, struct path_info, list); list_move_tail(&pi->list, &s->valid_paths); - percpu_counter_set(&s->repeat_count, pi->repeat_count); - set_percpu_current_path(s, pi->path); - current_path = pi->path; } spin_unlock_irqrestore(&s->lock, flags); - return current_path; + return pi ? pi->path : NULL; } static struct path_selector_type rr_ps = { diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 6e702fc69a83..28955b94d2b2 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -109,28 +109,6 @@ void dm_stop_queue(struct request_queue *q) dm_mq_stop_queue(q); } -static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md, - gfp_t gfp_mask) -{ - return mempool_alloc(md->io_pool, gfp_mask); -} - -static void free_old_rq_tio(struct dm_rq_target_io *tio) -{ - mempool_free(tio, tio->md->io_pool); -} - -static struct request *alloc_old_clone_request(struct mapped_device *md, - gfp_t gfp_mask) -{ - return mempool_alloc(md->rq_pool, gfp_mask); -} - -static void free_old_clone_request(struct mapped_device *md, struct request *rq) -{ - mempool_free(rq, md->rq_pool); -} - /* * Partial completion handling for request-based dm */ @@ -185,7 +163,7 @@ static void end_clone_bio(struct bio *clone) static struct dm_rq_target_io *tio_from_request(struct request *rq) { - return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special); + return blk_mq_rq_to_pdu(rq); } static void rq_end_stats(struct mapped_device *md, struct request *orig) @@ -233,31 +211,6 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) dm_put(md); } -static void free_rq_clone(struct request *clone) -{ - struct dm_rq_target_io *tio = clone->end_io_data; - struct mapped_device *md = tio->md; - - blk_rq_unprep_clone(clone); - - /* - * It is possible for a clone_old_rq() allocated clone to - * get passed in -- it may not yet have a request_queue. - * This is known to occur if the error target replaces - * a multipath target that has a request_fn queue stacked - * on blk-mq queue(s). - */ - if (clone->q && clone->q->mq_ops) - /* stacked on blk-mq queue(s) */ - tio->ti->type->release_clone_rq(clone); - else if (!md->queue->mq_ops) - /* request_fn queue stacked on request_fn queue(s) */ - free_old_clone_request(md, clone); - - if (!md->queue->mq_ops) - free_old_rq_tio(tio); -} - /* * Complete the clone and the original request. * Must be called without clone's queue lock held, @@ -270,20 +223,9 @@ static void dm_end_request(struct request *clone, int error) struct mapped_device *md = tio->md; struct request *rq = tio->orig; - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { - rq->errors = clone->errors; - rq->resid_len = clone->resid_len; - - if (rq->sense) - /* - * We are using the sense buffer of the original - * request. - * So setting the length of the sense data is enough. - */ - rq->sense_len = clone->sense_len; - } + blk_rq_unprep_clone(clone); + tio->ti->type->release_clone_rq(clone); - free_rq_clone(clone); rq_end_stats(md, rq); if (!rq->q->mq_ops) blk_end_request_all(rq, error); @@ -292,22 +234,6 @@ static void dm_end_request(struct request *clone, int error) rq_completed(md, rw, true); } -static void dm_unprep_request(struct request *rq) -{ - struct dm_rq_target_io *tio = tio_from_request(rq); - struct request *clone = tio->clone; - - if (!rq->q->mq_ops) { - rq->special = NULL; - rq->rq_flags &= ~RQF_DONTPREP; - } - - if (clone) - free_rq_clone(clone); - else if (!tio->md->queue->mq_ops) - free_old_rq_tio(tio); -} - /* * Requeue the original request of a clone. */ @@ -346,7 +272,10 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_ int rw = rq_data_dir(rq); rq_end_stats(md, rq); - dm_unprep_request(rq); + if (tio->clone) { + blk_rq_unprep_clone(tio->clone); + tio->ti->type->release_clone_rq(tio->clone); + } if (!rq->q->mq_ops) dm_old_requeue_request(rq); @@ -399,16 +328,15 @@ static void dm_softirq_done(struct request *rq) int rw; if (!clone) { - rq_end_stats(tio->md, rq); + struct mapped_device *md = tio->md; + + rq_end_stats(md, rq); rw = rq_data_dir(rq); - if (!rq->q->mq_ops) { + if (!rq->q->mq_ops) blk_end_request_all(rq, tio->error); - rq_completed(tio->md, rw, false); - free_old_rq_tio(tio); - } else { + else blk_mq_end_request(rq, tio->error); - rq_completed(tio->md, rw, false); - } + rq_completed(md, rw, false); return; } @@ -452,16 +380,6 @@ static void end_clone_request(struct request *clone, int error) { struct dm_rq_target_io *tio = clone->end_io_data; - if (!clone->q->mq_ops) { - /* - * For just cleaning up the information of the queue in which - * the clone was dispatched. - * The clone is *NOT* freed actually here because it is alloced - * from dm own mempool (RQF_ALLOCED isn't set). - */ - __blk_put_request(clone->q, clone); - } - /* * Actual request completion is done in a softirq context which doesn't * hold the clone's queue lock. Otherwise, deadlock could occur because: @@ -511,9 +429,6 @@ static int setup_clone(struct request *clone, struct request *rq, if (r) return r; - clone->cmd = rq->cmd; - clone->cmd_len = rq->cmd_len; - clone->sense = rq->sense; clone->end_io = end_clone_request; clone->end_io_data = tio; @@ -522,28 +437,6 @@ static int setup_clone(struct request *clone, struct request *rq, return 0; } -static struct request *clone_old_rq(struct request *rq, struct mapped_device *md, - struct dm_rq_target_io *tio, gfp_t gfp_mask) -{ - /* - * Create clone for use with .request_fn request_queue - */ - struct request *clone; - - clone = alloc_old_clone_request(md, gfp_mask); - if (!clone) - return NULL; - - blk_rq_init(NULL, clone); - if (setup_clone(clone, rq, tio, gfp_mask)) { - /* -ENOMEM */ - free_old_clone_request(md, clone); - return NULL; - } - - return clone; -} - static void map_tio_request(struct kthread_work *work); static void init_tio(struct dm_rq_target_io *tio, struct request *rq, @@ -565,60 +458,6 @@ static void init_tio(struct dm_rq_target_io *tio, struct request *rq, kthread_init_work(&tio->work, map_tio_request); } -static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq, - struct mapped_device *md, - gfp_t gfp_mask) -{ - struct dm_rq_target_io *tio; - int srcu_idx; - struct dm_table *table; - - tio = alloc_old_rq_tio(md, gfp_mask); - if (!tio) - return NULL; - - init_tio(tio, rq, md); - - table = dm_get_live_table(md, &srcu_idx); - /* - * Must clone a request if this .request_fn DM device - * is stacked on .request_fn device(s). - */ - if (!dm_table_all_blk_mq_devices(table)) { - if (!clone_old_rq(rq, md, tio, gfp_mask)) { - dm_put_live_table(md, srcu_idx); - free_old_rq_tio(tio); - return NULL; - } - } - dm_put_live_table(md, srcu_idx); - - return tio; -} - -/* - * Called with the queue lock held. - */ -static int dm_old_prep_fn(struct request_queue *q, struct request *rq) -{ - struct mapped_device *md = q->queuedata; - struct dm_rq_target_io *tio; - - if (unlikely(rq->special)) { - DMWARN("Already has something in rq->special."); - return BLKPREP_KILL; - } - - tio = dm_old_prep_tio(rq, md, GFP_ATOMIC); - if (!tio) - return BLKPREP_DEFER; - - rq->special = tio; - rq->rq_flags |= RQF_DONTPREP; - - return BLKPREP_OK; -} - /* * Returns: * DM_MAPIO_* : the request has been processed as indicated @@ -633,31 +472,18 @@ static int map_request(struct dm_rq_target_io *tio) struct request *rq = tio->orig; struct request *clone = NULL; - if (tio->clone) { - clone = tio->clone; - r = ti->type->map_rq(ti, clone, &tio->info); - if (r == DM_MAPIO_DELAY_REQUEUE) - return DM_MAPIO_REQUEUE; /* .request_fn requeue is always immediate */ - } else { - r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); - if (r < 0) { - /* The target wants to complete the I/O */ - dm_kill_unmapped_request(rq, r); - return r; - } - if (r == DM_MAPIO_REMAPPED && - setup_clone(clone, rq, tio, GFP_ATOMIC)) { - /* -ENOMEM */ - ti->type->release_clone_rq(clone); - return DM_MAPIO_REQUEUE; - } - } - + r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); switch (r) { case DM_MAPIO_SUBMITTED: /* The target has taken the I/O to submit by itself later */ break; case DM_MAPIO_REMAPPED: + if (setup_clone(clone, rq, tio, GFP_ATOMIC)) { + /* -ENOMEM */ + ti->type->release_clone_rq(clone); + return DM_MAPIO_REQUEUE; + } + /* The target has remapped the I/O so dispatch it */ trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), blk_rq_pos(rq)); @@ -716,6 +542,29 @@ static void dm_start_request(struct mapped_device *md, struct request *orig) dm_get(md); } +static int __dm_rq_init_rq(struct mapped_device *md, struct request *rq) +{ + struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); + + /* + * Must initialize md member of tio, otherwise it won't + * be available in dm_mq_queue_rq. + */ + tio->md = md; + + if (md->init_tio_pdu) { + /* target-specific per-io data is immediately after the tio */ + tio->info.ptr = tio + 1; + } + + return 0; +} + +static int dm_rq_init_rq(struct request_queue *q, struct request *rq, gfp_t gfp) +{ + return __dm_rq_init_rq(q->rq_alloc_data, rq); +} + static void map_tio_request(struct kthread_work *work) { struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work); @@ -814,6 +663,7 @@ static void dm_old_request_fn(struct request_queue *q) dm_start_request(md, rq); tio = tio_from_request(rq); + init_tio(tio, rq, md); /* Establish tio->ti before queuing work (map_tio_request) */ tio->ti = ti; kthread_queue_work(&md->kworker, &tio->work); @@ -824,10 +674,23 @@ static void dm_old_request_fn(struct request_queue *q) /* * Fully initialize a .request_fn request-based queue. */ -int dm_old_init_request_queue(struct mapped_device *md) +int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t) { + struct dm_target *immutable_tgt; + /* Fully initialize the queue */ - if (!blk_init_allocated_queue(md->queue, dm_old_request_fn, NULL)) + md->queue->cmd_size = sizeof(struct dm_rq_target_io); + md->queue->rq_alloc_data = md; + md->queue->request_fn = dm_old_request_fn; + md->queue->init_rq_fn = dm_rq_init_rq; + + immutable_tgt = dm_table_get_immutable_target(t); + if (immutable_tgt && immutable_tgt->per_io_data_size) { + /* any target-specific per-io data is immediately after the tio */ + md->queue->cmd_size += immutable_tgt->per_io_data_size; + md->init_tio_pdu = true; + } + if (blk_init_allocated_queue(md->queue) < 0) return -EINVAL; /* disable dm_old_request_fn's merge heuristic by default */ @@ -835,7 +698,6 @@ int dm_old_init_request_queue(struct mapped_device *md) dm_init_normal_md_queue(md); blk_queue_softirq_done(md->queue, dm_softirq_done); - blk_queue_prep_rq(md->queue, dm_old_prep_fn); /* Initialize the request-based DM worker thread */ kthread_init_worker(&md->kworker); @@ -856,21 +718,7 @@ static int dm_mq_init_request(void *data, struct request *rq, unsigned int hctx_idx, unsigned int request_idx, unsigned int numa_node) { - struct mapped_device *md = data; - struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); - - /* - * Must initialize md member of tio, otherwise it won't - * be available in dm_mq_queue_rq. - */ - tio->md = md; - - if (md->init_tio_pdu) { - /* target-specific per-io data is immediately after the tio */ - tio->info.ptr = tio + 1; - } - - return 0; + return __dm_rq_init_rq(data, rq); } static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, diff --git a/drivers/md/dm-rq.h b/drivers/md/dm-rq.h index 4da06cae7bad..f0020d21b95f 100644 --- a/drivers/md/dm-rq.h +++ b/drivers/md/dm-rq.h @@ -48,7 +48,7 @@ struct dm_rq_clone_bio_info { bool dm_use_blk_mq_default(void); bool dm_use_blk_mq(struct mapped_device *md); -int dm_old_init_request_queue(struct mapped_device *md); +int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t); int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t); void dm_mq_cleanup_mapped_device(struct mapped_device *md); diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 38b05f23b96c..0250e7e521ab 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -175,6 +175,7 @@ static void dm_stat_free(struct rcu_head *head) int cpu; struct dm_stat *s = container_of(head, struct dm_stat, rcu_head); + kfree(s->histogram_boundaries); kfree(s->program_id); kfree(s->aux_data); for_each_possible_cpu(cpu) { diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 0a427de23ed2..3ad16d9c9d5a 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1750,7 +1750,7 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) char b[BDEVNAME_SIZE]; if (likely(q)) - r |= bdi_congested(&q->backing_dev_info, bdi_bits); + r |= bdi_congested(q->backing_dev_info, bdi_bits); else DMWARN_LIMIT("%s: any_congested: nonexistent device %s", dm_device_name(t->md), diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 710ae28fd618..43d3445b121d 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -131,12 +131,6 @@ static int io_err_map(struct dm_target *tt, struct bio *bio) return -EIO; } -static int io_err_map_rq(struct dm_target *ti, struct request *clone, - union map_info *map_context) -{ - return -EIO; -} - static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq, union map_info *map_context, struct request **clone) @@ -161,7 +155,6 @@ static struct target_type error_target = { .ctr = io_err_ctr, .dtr = io_err_dtr, .map = io_err_map, - .map_rq = io_err_map_rq, .clone_and_map_rq = io_err_clone_and_map_rq, .release_clone_rq = io_err_release_clone_rq, .direct_access = io_err_direct_access, diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index d1c05c12a9db..2b266a2b5035 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -699,7 +699,7 @@ static void remap_to_origin(struct thin_c *tc, struct bio *bio) static int bio_triggers_commit(struct thin_c *tc, struct bio *bio) { - return (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA)) && + return op_is_flush(bio->bi_opf) && dm_thin_changed_this_transaction(tc->td); } @@ -870,8 +870,7 @@ static void __inc_remap_and_issue_cell(void *context, struct bio *bio; while ((bio = bio_list_pop(&cell->bios))) { - if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) || - bio_op(bio) == REQ_OP_DISCARD) + if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) bio_list_add(&info->defer_bios, bio); else { inc_all_io_entry(info->tc->pool, bio); @@ -1716,9 +1715,8 @@ static void __remap_and_issue_shared_cell(void *context, struct bio *bio; while ((bio = bio_list_pop(&cell->bios))) { - if ((bio_data_dir(bio) == WRITE) || - (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) || - bio_op(bio) == REQ_OP_DISCARD)) + if (bio_data_dir(bio) == WRITE || op_is_flush(bio->bi_opf) || + bio_op(bio) == REQ_OP_DISCARD) bio_list_add(&info->defer_bios, bio); else { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));; @@ -2635,8 +2633,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } - if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA) || - bio_op(bio) == REQ_OP_DISCARD) { + if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) { thin_defer_bio_with_throttle(tc, bio); return DM_MAPIO_SUBMITTED; } @@ -2714,7 +2711,7 @@ static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) return 1; q = bdev_get_queue(pt->data_dev->bdev); - return bdi_congested(&q->backing_dev_info, bdi_bits); + return bdi_congested(q->backing_dev_info, bdi_bits); } static void requeue_bios(struct pool *pool) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 3086da5664f3..dfb75979e455 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -12,6 +12,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> +#include <linux/sched/signal.h> #include <linux/blkpg.h> #include <linux/bio.h> #include <linux/mempool.h> @@ -91,7 +92,6 @@ static int dm_numa_node = DM_NUMA_NODE; */ struct dm_md_mempools { mempool_t *io_pool; - mempool_t *rq_pool; struct bio_set *bs; }; @@ -466,13 +466,16 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, if (r > 0) { /* - * Target determined this ioctl is being issued against - * a logical partition of the parent bdev; so extra - * validation is needed. + * Target determined this ioctl is being issued against a + * subset of the parent bdev; require extra privileges. */ - r = scsi_verify_blk_ioctl(NULL, cmd); - if (r) + if (!capable(CAP_SYS_RAWIO)) { + DMWARN_LIMIT( + "%s: sending ioctl %x to DM device without required privilege.", + current->comm, cmd); + r = -ENOIOCTLCMD; goto out; + } } r = __blkdev_driver_ioctl(bdev, mode, cmd, arg); @@ -972,10 +975,64 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) } EXPORT_SYMBOL_GPL(dm_accept_partial_bio); +/* + * Flush current->bio_list when the target map method blocks. + * This fixes deadlocks in snapshot and possibly in other targets. + */ +struct dm_offload { + struct blk_plug plug; + struct blk_plug_cb cb; +}; + +static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule) +{ + struct dm_offload *o = container_of(cb, struct dm_offload, cb); + struct bio_list list; + struct bio *bio; + int i; + + INIT_LIST_HEAD(&o->cb.list); + + if (unlikely(!current->bio_list)) + return; + + for (i = 0; i < 2; i++) { + list = current->bio_list[i]; + bio_list_init(¤t->bio_list[i]); + + while ((bio = bio_list_pop(&list))) { + struct bio_set *bs = bio->bi_pool; + if (unlikely(!bs) || bs == fs_bio_set) { + bio_list_add(¤t->bio_list[i], bio); + continue; + } + + spin_lock(&bs->rescue_lock); + bio_list_add(&bs->rescue_list, bio); + queue_work(bs->rescue_workqueue, &bs->rescue_work); + spin_unlock(&bs->rescue_lock); + } + } +} + +static void dm_offload_start(struct dm_offload *o) +{ + blk_start_plug(&o->plug); + o->cb.callback = flush_current_bio_list; + list_add(&o->cb.list, ¤t->plug->cb_list); +} + +static void dm_offload_end(struct dm_offload *o) +{ + list_del(&o->cb.list); + blk_finish_plug(&o->plug); +} + static void __map_bio(struct dm_target_io *tio) { int r; sector_t sector; + struct dm_offload o; struct bio *clone = &tio->clone; struct dm_target *ti = tio->ti; @@ -988,7 +1045,11 @@ static void __map_bio(struct dm_target_io *tio) */ atomic_inc(&tio->io->io_count); sector = clone->bi_iter.bi_sector; + + dm_offload_start(&o); r = ti->type->map(ti, clone); + dm_offload_end(&o); + if (r == DM_MAPIO_REMAPPED) { /* the bio has been remapped so dispatch it */ @@ -1314,7 +1375,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits) * With request-based DM we only need to check the * top-level queue for congestion. */ - r = md->queue->backing_dev_info.wb.state & bdi_bits; + r = md->queue->backing_dev_info->wb.state & bdi_bits; } else { map = dm_get_live_table_fast(md); if (map) @@ -1397,7 +1458,7 @@ void dm_init_md_queue(struct mapped_device *md) * - must do so here (in alloc_dev callchain) before queue is used */ md->queue->queuedata = md; - md->queue->backing_dev_info.congested_data = md; + md->queue->backing_dev_info->congested_data = md; } void dm_init_normal_md_queue(struct mapped_device *md) @@ -1408,7 +1469,7 @@ void dm_init_normal_md_queue(struct mapped_device *md) /* * Initialize aspects of queue that aren't relevant for blk-mq */ - md->queue->backing_dev_info.congested_fn = dm_any_congested; + md->queue->backing_dev_info->congested_fn = dm_any_congested; blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); } @@ -1419,7 +1480,6 @@ static void cleanup_mapped_device(struct mapped_device *md) if (md->kworker_task) kthread_stop(md->kworker_task); mempool_destroy(md->io_pool); - mempool_destroy(md->rq_pool); if (md->bs) bioset_free(md->bs); @@ -1595,12 +1655,10 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t) goto out; } - BUG_ON(!p || md->io_pool || md->rq_pool || md->bs); + BUG_ON(!p || md->io_pool || md->bs); md->io_pool = p->io_pool; p->io_pool = NULL; - md->rq_pool = p->rq_pool; - p->rq_pool = NULL; md->bs = p->bs; p->bs = NULL; @@ -1777,7 +1835,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) switch (type) { case DM_TYPE_REQUEST_BASED: - r = dm_old_init_request_queue(md); + r = dm_old_init_request_queue(md, t); if (r) { DMERR("Cannot initialize queue for request-based mapped device"); return r; @@ -2493,7 +2551,6 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t unsigned integrity, unsigned per_io_data_size) { struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); - struct kmem_cache *cachep = NULL; unsigned int pool_size = 0; unsigned int front_pad; @@ -2503,20 +2560,16 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t switch (type) { case DM_TYPE_BIO_BASED: case DM_TYPE_DAX_BIO_BASED: - cachep = _io_cache; pool_size = dm_get_reserved_bio_based_ios(); front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); + + pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache); + if (!pools->io_pool) + goto out; break; case DM_TYPE_REQUEST_BASED: - cachep = _rq_tio_cache; - pool_size = dm_get_reserved_rq_based_ios(); - pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache); - if (!pools->rq_pool) - goto out; - /* fall through to setup remaining rq-based pools */ case DM_TYPE_MQ_REQUEST_BASED: - if (!pool_size) - pool_size = dm_get_reserved_rq_based_ios(); + pool_size = dm_get_reserved_rq_based_ios(); front_pad = offsetof(struct dm_rq_clone_bio_info, clone); /* per_io_data_size is used for blk-mq pdu at queue allocation */ break; @@ -2524,12 +2577,6 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t BUG(); } - if (cachep) { - pools->io_pool = mempool_create_slab_pool(pool_size, cachep); - if (!pools->io_pool) - goto out; - } - pools->bs = bioset_create_nobvec(pool_size, front_pad); if (!pools->bs) goto out; @@ -2551,7 +2598,6 @@ void dm_free_md_mempools(struct dm_md_mempools *pools) return; mempool_destroy(pools->io_pool); - mempool_destroy(pools->rq_pool); if (pools->bs) bioset_free(pools->bs); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index f0aad08b9654..f298b01f7ab3 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -95,8 +95,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); /* * To check whether the target type is request-based or not (bio-based). */ -#define dm_target_request_based(t) (((t)->type->map_rq != NULL) || \ - ((t)->type->clone_and_map_rq != NULL)) +#define dm_target_request_based(t) ((t)->type->clone_and_map_rq != NULL) /* * To check whether the target type is a hybrid (capable of being diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 685aa2d77e25..b0536cfd8e17 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -214,7 +214,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio) } } if (failit) { - struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev); + struct bio *b = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); b->bi_bdev = conf->rdev->bdev; b->bi_private = bio; diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 5975c9915684..3e38e0207a3e 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -53,18 +53,26 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) return conf->disks + lo; } +/* + * In linear_congested() conf->raid_disks is used as a copy of + * mddev->raid_disks to iterate conf->disks[], because conf->raid_disks + * and conf->disks[] are created in linear_conf(), they are always + * consitent with each other, but mddev->raid_disks does not. + */ static int linear_congested(struct mddev *mddev, int bits) { struct linear_conf *conf; int i, ret = 0; - conf = mddev->private; + rcu_read_lock(); + conf = rcu_dereference(mddev->private); - for (i = 0; i < mddev->raid_disks && !ret ; i++) { + for (i = 0; i < conf->raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= bdi_congested(q->backing_dev_info, bits); } + rcu_read_unlock(); return ret; } @@ -144,6 +152,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) conf->disks[i-1].end_sector + conf->disks[i].rdev->sectors; + /* + * conf->raid_disks is copy of mddev->raid_disks. The reason to + * keep a copy of mddev->raid_disks in struct linear_conf is, + * mddev->raid_disks may not be consistent with pointers number of + * conf->disks[] when it is updated in linear_add() and used to + * iterate old conf->disks[] earray in linear_congested(). + * Here conf->raid_disks is always consitent with number of + * pointers in conf->disks[] array, and mddev->private is updated + * with rcu_assign_pointer() in linear_addr(), such race can be + * avoided. + */ + conf->raid_disks = raid_disks; + return conf; out: @@ -196,15 +217,24 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) if (!newconf) return -ENOMEM; + /* newconf->raid_disks already keeps a copy of * the increased + * value of mddev->raid_disks, WARN_ONCE() is just used to make + * sure of this. It is possible that oldconf is still referenced + * in linear_congested(), therefore kfree_rcu() is used to free + * oldconf until no one uses it anymore. + */ mddev_suspend(mddev); - oldconf = mddev->private; + oldconf = rcu_dereference_protected(mddev->private, + lockdep_is_held(&mddev->reconfig_mutex)); mddev->raid_disks++; - mddev->private = newconf; + WARN_ONCE(mddev->raid_disks != newconf->raid_disks, + "copied raid_disks doesn't match mddev->raid_disks"); + rcu_assign_pointer(mddev->private, newconf); md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); mddev_resume(mddev); revalidate_disk(mddev->gendisk); - kfree(oldconf); + kfree_rcu(oldconf, rcu); return 0; } @@ -262,6 +292,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) trace_block_bio_remap(bdev_get_queue(split->bi_bdev), split, disk_devt(mddev->gendisk), bio_sector); + mddev_check_writesame(mddev, split); generic_make_request(split); } } while (split != bio); diff --git a/drivers/md/linear.h b/drivers/md/linear.h index b685ddd7d7f7..8d392e6098b3 100644 --- a/drivers/md/linear.h +++ b/drivers/md/linear.h @@ -10,6 +10,7 @@ struct linear_conf { struct rcu_head rcu; sector_t array_sectors; + int raid_disks; /* a copy of mddev->raid_disks */ struct dev_info disks[0]; }; #endif diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 2b13117fb918..321ecac23027 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -777,7 +777,6 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) bm_lockres->flags |= DLM_LKF_NOQUEUE; ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); if (ret == -EAGAIN) { - memset(bm_lockres->lksb.sb_lvbptr, '\0', LVB_SIZE); s = read_resync_info(mddev, bm_lockres); if (s) { pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n", @@ -974,6 +973,7 @@ static int leave(struct mddev *mddev) lockres_free(cinfo->bitmap_lockres); unlock_all_bitmaps(mddev); dlm_release_lockspace(cinfo->lockspace, 2); + kfree(cinfo); return 0; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 01175dac0db6..f6ae1d67bcd0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -44,6 +44,7 @@ */ +#include <linux/sched/signal.h> #include <linux/kthread.h> #include <linux/blkdev.h> #include <linux/badblocks.h> @@ -190,16 +191,6 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, } EXPORT_SYMBOL_GPL(bio_alloc_mddev); -struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, - struct mddev *mddev) -{ - if (!mddev || !mddev->bio_set) - return bio_clone(bio, gfp_mask); - - return bio_clone_bioset(bio, gfp_mask, mddev->bio_set); -} -EXPORT_SYMBOL_GPL(bio_clone_mddev); - /* * We have a system wide 'event count' that is incremented * on any 'interesting' event, and readers of /proc/mdstat @@ -449,14 +440,6 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) } EXPORT_SYMBOL(md_flush_request); -void md_unplug(struct blk_plug_cb *cb, bool from_schedule) -{ - struct mddev *mddev = cb->data; - md_wakeup_thread(mddev->thread); - kfree(cb); -} -EXPORT_SYMBOL(md_unplug); - static inline struct mddev *mddev_get(struct mddev *mddev) { atomic_inc(&mddev->active); @@ -1896,7 +1879,7 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) } sb = page_address(rdev->sb_page); sb->data_size = cpu_to_le64(num_sectors); - sb->super_offset = rdev->sb_start; + sb->super_offset = cpu_to_le64(rdev->sb_start); sb->sb_csum = calc_sb_1_csum(sb); do { md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, @@ -2304,7 +2287,7 @@ static bool does_sb_need_changing(struct mddev *mddev) /* Check if any mddev parameters have changed */ if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || - (mddev->layout != le64_to_cpu(sb->layout)) || + (mddev->layout != le32_to_cpu(sb->layout)) || (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) return true; @@ -5228,8 +5211,11 @@ int md_run(struct mddev *mddev) sysfs_notify_dirent_safe(rdev->sysfs_state); } - if (mddev->bio_set == NULL) + if (mddev->bio_set == NULL) { mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); + if (!mddev->bio_set) + return -ENOMEM; + } spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); @@ -5346,8 +5332,8 @@ int md_run(struct mddev *mddev) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mddev->queue); else queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, mddev->queue); - mddev->queue->backing_dev_info.congested_data = mddev; - mddev->queue->backing_dev_info.congested_fn = md_congested; + mddev->queue->backing_dev_info->congested_data = mddev; + mddev->queue->backing_dev_info->congested_fn = md_congested; } if (pers->sync_request) { if (mddev->kobj.sd && @@ -5704,7 +5690,7 @@ static int do_md_stop(struct mddev *mddev, int mode, __md_stop_writes(mddev); __md_stop(mddev); - mddev->queue->backing_dev_info.congested_fn = NULL; + mddev->queue->backing_dev_info->congested_fn = NULL; /* tell userspace to handle 'inactive' */ sysfs_notify_dirent_safe(mddev->sysfs_state); @@ -6464,11 +6450,10 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->layout = info->layout; mddev->chunk_sectors = info->chunk_size >> 9; - mddev->max_disks = MD_SB_DISKS; - if (mddev->persistent) { - mddev->flags = 0; - mddev->sb_flags = 0; + mddev->max_disks = MD_SB_DISKS; + mddev->flags = 0; + mddev->sb_flags = 0; } set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); @@ -6539,8 +6524,12 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) return -ENOSPC; } rv = mddev->pers->resize(mddev, num_sectors); - if (!rv) - revalidate_disk(mddev->gendisk); + if (!rv) { + if (mddev->queue) { + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } + } return rv; } @@ -8980,7 +8969,14 @@ static __exit void md_exit(void) for_each_mddev(mddev, tmp) { export_array(mddev); + mddev->ctime = 0; mddev->hold_active = 0; + /* + * for_each_mddev() will call mddev_put() at the end of each + * iteration. As the mddev is now fully clear, this will + * schedule the mddev for destruction by a workqueue, and the + * destroy_workqueue() below will wait for that to complete. + */ } destroy_workqueue(md_misc_wq); destroy_workqueue(md_wq); diff --git a/drivers/md/md.h b/drivers/md/md.h index 2a514036a83d..dde8ecb760c8 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -673,21 +673,13 @@ extern void md_rdev_clear(struct md_rdev *rdev); extern void mddev_suspend(struct mddev *mddev); extern void mddev_resume(struct mddev *mddev); -extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, - struct mddev *mddev); extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev); -extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); -static inline int mddev_check_plugged(struct mddev *mddev) -{ - return !!blk_check_plugged(md_unplug, mddev, - sizeof(struct blk_plug_cb)); -} static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) { @@ -710,4 +702,11 @@ static inline void mddev_clear_unsupported_flags(struct mddev *mddev, { mddev->flags &= ~unsupported_flags; } + +static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio) +{ + if (bio_op(bio) == REQ_OP_WRITE_SAME && + !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors) + mddev->queue->limits.max_write_same_sectors = 0; +} #endif /* _MD_MD_H */ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index aa8c4e5c1ee2..79a12b59250b 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -138,6 +138,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT; mp_bh->bio.bi_end_io = multipath_end_request; mp_bh->bio.bi_private = mp_bh; + mddev_check_writesame(mddev, &mp_bh->bio); generic_make_request(&mp_bh->bio); return; } @@ -169,7 +170,7 @@ static int multipath_congested(struct mddev *mddev, int bits) if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= bdi_congested(q->backing_dev_info, bits); /* Just like multipath_map, we just check the * first available device */ diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 7938cd21fa4c..185dc60360b5 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -976,6 +976,27 @@ int dm_array_cursor_next(struct dm_array_cursor *c) } EXPORT_SYMBOL_GPL(dm_array_cursor_next); +int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count) +{ + int r; + + do { + uint32_t remaining = le32_to_cpu(c->ab->nr_entries) - c->index; + + if (count < remaining) { + c->index += count; + return 0; + } + + count -= remaining; + r = dm_array_cursor_next(c); + + } while (!r); + + return r; +} +EXPORT_SYMBOL_GPL(dm_array_cursor_skip); + void dm_array_cursor_get_value(struct dm_array_cursor *c, void **value_le) { *value_le = element_at(c->info, c->ab, c->index); diff --git a/drivers/md/persistent-data/dm-array.h b/drivers/md/persistent-data/dm-array.h index 27ee49a55473..d7d2d579c662 100644 --- a/drivers/md/persistent-data/dm-array.h +++ b/drivers/md/persistent-data/dm-array.h @@ -207,6 +207,7 @@ void dm_array_cursor_end(struct dm_array_cursor *c); uint32_t dm_array_cursor_index(struct dm_array_cursor *c); int dm_array_cursor_next(struct dm_array_cursor *c); +int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count); /* * value_le is only valid while the cursor points at the current value. diff --git a/drivers/md/persistent-data/dm-bitset.c b/drivers/md/persistent-data/dm-bitset.c index 36f7cc2c7109..b7208d82e748 100644 --- a/drivers/md/persistent-data/dm-bitset.c +++ b/drivers/md/persistent-data/dm-bitset.c @@ -39,6 +39,48 @@ int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *root) } EXPORT_SYMBOL_GPL(dm_bitset_empty); +struct packer_context { + bit_value_fn fn; + unsigned nr_bits; + void *context; +}; + +static int pack_bits(uint32_t index, void *value, void *context) +{ + int r; + struct packer_context *p = context; + unsigned bit, nr = min(64u, p->nr_bits - (index * 64)); + uint64_t word = 0; + bool bv; + + for (bit = 0; bit < nr; bit++) { + r = p->fn(index * 64 + bit, &bv, p->context); + if (r) + return r; + + if (bv) + set_bit(bit, (unsigned long *) &word); + else + clear_bit(bit, (unsigned long *) &word); + } + + *((__le64 *) value) = cpu_to_le64(word); + + return 0; +} + +int dm_bitset_new(struct dm_disk_bitset *info, dm_block_t *root, + uint32_t size, bit_value_fn fn, void *context) +{ + struct packer_context p; + p.fn = fn; + p.nr_bits = size; + p.context = context; + + return dm_array_new(&info->array_info, root, dm_div_up(size, 64), pack_bits, &p); +} +EXPORT_SYMBOL_GPL(dm_bitset_new); + int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t root, uint32_t old_nr_entries, uint32_t new_nr_entries, bool default_value, dm_block_t *new_root) @@ -168,4 +210,108 @@ int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root, } EXPORT_SYMBOL_GPL(dm_bitset_test_bit); +static int cursor_next_array_entry(struct dm_bitset_cursor *c) +{ + int r; + __le64 *value; + + r = dm_array_cursor_next(&c->cursor); + if (r) + return r; + + dm_array_cursor_get_value(&c->cursor, (void **) &value); + c->array_index++; + c->bit_index = 0; + c->current_bits = le64_to_cpu(*value); + return 0; +} + +int dm_bitset_cursor_begin(struct dm_disk_bitset *info, + dm_block_t root, uint32_t nr_entries, + struct dm_bitset_cursor *c) +{ + int r; + __le64 *value; + + if (!nr_entries) + return -ENODATA; + + c->info = info; + c->entries_remaining = nr_entries; + + r = dm_array_cursor_begin(&info->array_info, root, &c->cursor); + if (r) + return r; + + dm_array_cursor_get_value(&c->cursor, (void **) &value); + c->array_index = 0; + c->bit_index = 0; + c->current_bits = le64_to_cpu(*value); + + return r; +} +EXPORT_SYMBOL_GPL(dm_bitset_cursor_begin); + +void dm_bitset_cursor_end(struct dm_bitset_cursor *c) +{ + return dm_array_cursor_end(&c->cursor); +} +EXPORT_SYMBOL_GPL(dm_bitset_cursor_end); + +int dm_bitset_cursor_next(struct dm_bitset_cursor *c) +{ + int r = 0; + + if (!c->entries_remaining) + return -ENODATA; + + c->entries_remaining--; + if (++c->bit_index > 63) + r = cursor_next_array_entry(c); + + return r; +} +EXPORT_SYMBOL_GPL(dm_bitset_cursor_next); + +int dm_bitset_cursor_skip(struct dm_bitset_cursor *c, uint32_t count) +{ + int r; + __le64 *value; + uint32_t nr_array_skip; + uint32_t remaining_in_word = 64 - c->bit_index; + + if (c->entries_remaining < count) + return -ENODATA; + + if (count < remaining_in_word) { + c->bit_index += count; + c->entries_remaining -= count; + return 0; + + } else { + c->entries_remaining -= remaining_in_word; + count -= remaining_in_word; + } + + nr_array_skip = (count / 64) + 1; + r = dm_array_cursor_skip(&c->cursor, nr_array_skip); + if (r) + return r; + + dm_array_cursor_get_value(&c->cursor, (void **) &value); + c->entries_remaining -= count; + c->array_index += nr_array_skip; + c->bit_index = count & 63; + c->current_bits = le64_to_cpu(*value); + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_cursor_skip); + +bool dm_bitset_cursor_get_value(struct dm_bitset_cursor *c) +{ + return test_bit(c->bit_index, (unsigned long *) &c->current_bits); +} +EXPORT_SYMBOL_GPL(dm_bitset_cursor_get_value); + /*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-bitset.h b/drivers/md/persistent-data/dm-bitset.h index c2287d672ef5..df888da04ee1 100644 --- a/drivers/md/persistent-data/dm-bitset.h +++ b/drivers/md/persistent-data/dm-bitset.h @@ -93,6 +93,22 @@ void dm_disk_bitset_init(struct dm_transaction_manager *tm, int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *new_root); /* + * Creates a new bitset populated with values provided by a callback + * function. This is more efficient than creating an empty bitset, + * resizing, and then setting values since that process incurs a lot of + * copying. + * + * info - describes the array + * root - the root block of the array on disk + * size - the number of entries in the array + * fn - the callback + * context - passed to the callback + */ +typedef int (*bit_value_fn)(uint32_t index, bool *value, void *context); +int dm_bitset_new(struct dm_disk_bitset *info, dm_block_t *root, + uint32_t size, bit_value_fn fn, void *context); + +/* * Resize the bitset. * * info - describes the bitset @@ -161,6 +177,29 @@ int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root, int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root, dm_block_t *new_root); +struct dm_bitset_cursor { + struct dm_disk_bitset *info; + struct dm_array_cursor cursor; + + uint32_t entries_remaining; + uint32_t array_index; + uint32_t bit_index; + uint64_t current_bits; +}; + +/* + * Make sure you've flush any dm_disk_bitset and updated the root before + * using this. + */ +int dm_bitset_cursor_begin(struct dm_disk_bitset *info, + dm_block_t root, uint32_t nr_entries, + struct dm_bitset_cursor *c); +void dm_bitset_cursor_end(struct dm_bitset_cursor *c); + +int dm_bitset_cursor_next(struct dm_bitset_cursor *c); +int dm_bitset_cursor_skip(struct dm_bitset_cursor *c, uint32_t count); +bool dm_bitset_cursor_get_value(struct dm_bitset_cursor *c); + /*----------------------------------------------------------------*/ #endif /* _LINUX_DM_BITSET_H */ diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index a6dde7cab458..8589e0a14068 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -13,6 +13,7 @@ #include <linux/rwsem.h> #include <linux/device-mapper.h> #include <linux/stacktrace.h> +#include <linux/sched/task.h> #define DM_MSG_PREFIX "block manager" @@ -120,7 +121,7 @@ static int __check_holder(struct block_lock *lock) static void __wait(struct waiter *w) { for (;;) { - set_task_state(current, TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); if (!w->task) break; @@ -128,7 +129,7 @@ static void __wait(struct waiter *w) schedule(); } - set_task_state(current, TASK_RUNNING); + set_current_state(TASK_RUNNING); } static void __wake_waiter(struct waiter *w) @@ -462,7 +463,7 @@ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, int r; p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result); - if (IS_ERR(p)) + if (unlikely(IS_ERR(p))) return PTR_ERR(p); aux = dm_bufio_get_aux_data(to_buffer(*result)); @@ -498,7 +499,7 @@ int dm_bm_write_lock(struct dm_block_manager *bm, return -EPERM; p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result); - if (IS_ERR(p)) + if (unlikely(IS_ERR(p))) return PTR_ERR(p); aux = dm_bufio_get_aux_data(to_buffer(*result)); @@ -531,7 +532,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm, int r; p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result); - if (IS_ERR(p)) + if (unlikely(IS_ERR(p))) return PTR_ERR(p); if (unlikely(!p)) return -EWOULDBLOCK; @@ -567,7 +568,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, return -EPERM; p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result); - if (IS_ERR(p)) + if (unlikely(IS_ERR(p))) return PTR_ERR(p); memset(p, 0, dm_bm_block_size(bm)); diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 20a40329d84a..02e2ee0d8a00 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -272,7 +272,12 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root) int r; struct del_stack *s; - s = kmalloc(sizeof(*s), GFP_NOIO); + /* + * dm_btree_del() is called via an ioctl, as such should be + * considered an FS op. We can't recurse back into the FS, so we + * allocate GFP_NOFS. + */ + s = kmalloc(sizeof(*s), GFP_NOFS); if (!s) return -ENOMEM; s->info = info; @@ -1139,6 +1144,17 @@ int dm_btree_cursor_next(struct dm_btree_cursor *c) } EXPORT_SYMBOL_GPL(dm_btree_cursor_next); +int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count) +{ + int r = 0; + + while (count-- && !r) + r = dm_btree_cursor_next(c); + + return r; +} +EXPORT_SYMBOL_GPL(dm_btree_cursor_skip); + int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le) { if (c->depth) { diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h index db9bd26adf31..3dc5bb1a4748 100644 --- a/drivers/md/persistent-data/dm-btree.h +++ b/drivers/md/persistent-data/dm-btree.h @@ -209,6 +209,7 @@ int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root, bool prefetch_leaves, struct dm_btree_cursor *c); void dm_btree_cursor_end(struct dm_btree_cursor *c); int dm_btree_cursor_next(struct dm_btree_cursor *c); +int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count); int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le); #endif /* _LINUX_DM_BTREE_H */ diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index 4c28608a0c94..829b4ce057d8 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -626,13 +626,19 @@ int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm, void *root_le, size_t len) { int r; - struct disk_sm_root *smr = root_le; + struct disk_sm_root smr; if (len < sizeof(struct disk_sm_root)) { DMERR("sm_metadata root too small"); return -ENOMEM; } + /* + * We don't know the alignment of the root_le buffer, so need to + * copy into a new structure. + */ + memcpy(&smr, root_le, sizeof(smr)); + r = sm_ll_init(ll, tm); if (r < 0) return r; @@ -644,10 +650,10 @@ int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm, ll->max_entries = metadata_ll_max_entries; ll->commit = metadata_ll_commit; - ll->nr_blocks = le64_to_cpu(smr->nr_blocks); - ll->nr_allocated = le64_to_cpu(smr->nr_allocated); - ll->bitmap_root = le64_to_cpu(smr->bitmap_root); - ll->ref_count_root = le64_to_cpu(smr->ref_count_root); + ll->nr_blocks = le64_to_cpu(smr.nr_blocks); + ll->nr_allocated = le64_to_cpu(smr.nr_allocated); + ll->bitmap_root = le64_to_cpu(smr.bitmap_root); + ll->ref_count_root = le64_to_cpu(smr.ref_count_root); return ll->open_index(ll); } diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 20557e2c60c6..4aed69d9dd17 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -544,7 +544,7 @@ static int sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks); -static struct dm_space_map ops = { +static const struct dm_space_map ops = { .destroy = sm_metadata_destroy, .extend = sm_metadata_extend, .get_nr_blocks = sm_metadata_get_nr_blocks, @@ -671,7 +671,7 @@ static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where, return -EINVAL; } -static struct dm_space_map bootstrap_ops = { +static const struct dm_space_map bootstrap_ops = { .destroy = sm_bootstrap_destroy, .extend = sm_bootstrap_extend, .get_nr_blocks = sm_bootstrap_get_nr_blocks, diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 848365d474f3..93347ca7c7a6 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -41,7 +41,7 @@ static int raid0_congested(struct mddev *mddev, int bits) for (i = 0; i < raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(devlist[i]->bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= bdi_congested(q->backing_dev_info, bits); } return ret; } @@ -420,8 +420,8 @@ static int raid0_run(struct mddev *mddev) */ int stripe = mddev->raid_disks * (mddev->chunk_sectors << 9) / PAGE_SIZE; - if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) - mddev->queue->backing_dev_info.ra_pages = 2* stripe; + if (mddev->queue->backing_dev_info->ra_pages < 2* stripe) + mddev->queue->backing_dev_info->ra_pages = 2* stripe; } dump_zones(mddev); @@ -503,6 +503,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) trace_block_bio_remap(bdev_get_queue(split->bi_bdev), split, disk_devt(mddev->gendisk), bio_sector); + mddev_check_writesame(mddev, split); generic_make_request(split); } } while (split != bio); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7b0f647bcccb..a34f58772022 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -37,7 +37,10 @@ #include <linux/module.h> #include <linux/seq_file.h> #include <linux/ratelimit.h> +#include <linux/sched/signal.h> + #include <trace/events/block.h> + #include "md.h" #include "raid1.h" #include "bitmap.h" @@ -71,9 +74,8 @@ */ static int max_queued_requests = 1024; -static void allow_barrier(struct r1conf *conf, sector_t start_next_window, - sector_t bi_sector); -static void lower_barrier(struct r1conf *conf); +static void allow_barrier(struct r1conf *conf, sector_t sector_nr); +static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #define raid1_log(md, fmt, args...) \ do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) @@ -100,7 +102,6 @@ static void r1bio_pool_free(void *r1_bio, void *data) #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) -#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) { @@ -205,6 +206,7 @@ static void free_r1bio(struct r1bio *r1_bio) static void put_buf(struct r1bio *r1_bio) { struct r1conf *conf = r1_bio->mddev->private; + sector_t sect = r1_bio->sector; int i; for (i = 0; i < conf->raid_disks * 2; i++) { @@ -215,7 +217,7 @@ static void put_buf(struct r1bio *r1_bio) mempool_free(r1_bio, conf->r1buf_pool); - lower_barrier(conf); + lower_barrier(conf, sect); } static void reschedule_retry(struct r1bio *r1_bio) @@ -223,10 +225,12 @@ static void reschedule_retry(struct r1bio *r1_bio) unsigned long flags; struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; + int idx; + idx = sector_to_idx(r1_bio->sector); spin_lock_irqsave(&conf->device_lock, flags); list_add(&r1_bio->retry_list, &conf->retry_list); - conf->nr_queued ++; + atomic_inc(&conf->nr_queued[idx]); spin_unlock_irqrestore(&conf->device_lock, flags); wake_up(&conf->wait_barrier); @@ -243,7 +247,6 @@ static void call_bio_endio(struct r1bio *r1_bio) struct bio *bio = r1_bio->master_bio; int done; struct r1conf *conf = r1_bio->mddev->private; - sector_t start_next_window = r1_bio->start_next_window; sector_t bi_sector = bio->bi_iter.bi_sector; if (bio->bi_phys_segments) { @@ -269,7 +272,7 @@ static void call_bio_endio(struct r1bio *r1_bio) * Wake up any possible resync thread that waits for the device * to go idle. */ - allow_barrier(conf, start_next_window, bi_sector); + allow_barrier(conf, bi_sector); } } @@ -517,6 +520,25 @@ static void raid1_end_write_request(struct bio *bio) bio_put(to_put); } +static sector_t align_to_barrier_unit_end(sector_t start_sector, + sector_t sectors) +{ + sector_t len; + + WARN_ON(sectors == 0); + /* + * len is the number of sectors from start_sector to end of the + * barrier unit which start_sector belongs to. + */ + len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) - + start_sector; + + if (len > sectors) + len = sectors; + + return len; +} + /* * This routine returns the disk from which the requested read should * be done. There is a per-array 'next expected sequential IO' sector @@ -744,9 +766,9 @@ static int raid1_congested(struct mddev *mddev, int bits) * non-congested targets, it can be removed */ if ((bits & (1 << WB_async_congested)) || 1) - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= bdi_congested(q->backing_dev_info, bits); else - ret &= bdi_congested(&q->backing_dev_info, bits); + ret &= bdi_congested(q->backing_dev_info, bits); } } rcu_read_unlock(); @@ -813,168 +835,228 @@ static void flush_pending_writes(struct r1conf *conf) */ static void raise_barrier(struct r1conf *conf, sector_t sector_nr) { + int idx = sector_to_idx(sector_nr); + spin_lock_irq(&conf->resync_lock); /* Wait until no block IO is waiting */ - wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, + wait_event_lock_irq(conf->wait_barrier, + !atomic_read(&conf->nr_waiting[idx]), conf->resync_lock); /* block any new IO from starting */ - conf->barrier++; - conf->next_resync = sector_nr; + atomic_inc(&conf->barrier[idx]); + /* + * In raise_barrier() we firstly increase conf->barrier[idx] then + * check conf->nr_pending[idx]. In _wait_barrier() we firstly + * increase conf->nr_pending[idx] then check conf->barrier[idx]. + * A memory barrier here to make sure conf->nr_pending[idx] won't + * be fetched before conf->barrier[idx] is increased. Otherwise + * there will be a race between raise_barrier() and _wait_barrier(). + */ + smp_mb__after_atomic(); /* For these conditions we must wait: * A: while the array is in frozen state - * B: while barrier >= RESYNC_DEPTH, meaning resync reach - * the max count which allowed. - * C: next_resync + RESYNC_SECTORS > start_next_window, meaning - * next resync will reach to the window which normal bios are - * handling. - * D: while there are any active requests in the current window. + * B: while conf->nr_pending[idx] is not 0, meaning regular I/O + * existing in corresponding I/O barrier bucket. + * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches + * max resync count which allowed on current I/O barrier bucket. */ wait_event_lock_irq(conf->wait_barrier, !conf->array_frozen && - conf->barrier < RESYNC_DEPTH && - conf->current_window_requests == 0 && - (conf->start_next_window >= - conf->next_resync + RESYNC_SECTORS), + !atomic_read(&conf->nr_pending[idx]) && + atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH, conf->resync_lock); - conf->nr_pending++; + atomic_inc(&conf->nr_pending[idx]); spin_unlock_irq(&conf->resync_lock); } -static void lower_barrier(struct r1conf *conf) +static void lower_barrier(struct r1conf *conf, sector_t sector_nr) { - unsigned long flags; - BUG_ON(conf->barrier <= 0); - spin_lock_irqsave(&conf->resync_lock, flags); - conf->barrier--; - conf->nr_pending--; - spin_unlock_irqrestore(&conf->resync_lock, flags); + int idx = sector_to_idx(sector_nr); + + BUG_ON(atomic_read(&conf->barrier[idx]) <= 0); + + atomic_dec(&conf->barrier[idx]); + atomic_dec(&conf->nr_pending[idx]); wake_up(&conf->wait_barrier); } -static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) +static void _wait_barrier(struct r1conf *conf, int idx) { - bool wait = false; + /* + * We need to increase conf->nr_pending[idx] very early here, + * then raise_barrier() can be blocked when it waits for + * conf->nr_pending[idx] to be 0. Then we can avoid holding + * conf->resync_lock when there is no barrier raised in same + * barrier unit bucket. Also if the array is frozen, I/O + * should be blocked until array is unfrozen. + */ + atomic_inc(&conf->nr_pending[idx]); + /* + * In _wait_barrier() we firstly increase conf->nr_pending[idx], then + * check conf->barrier[idx]. In raise_barrier() we firstly increase + * conf->barrier[idx], then check conf->nr_pending[idx]. A memory + * barrier is necessary here to make sure conf->barrier[idx] won't be + * fetched before conf->nr_pending[idx] is increased. Otherwise there + * will be a race between _wait_barrier() and raise_barrier(). + */ + smp_mb__after_atomic(); - if (conf->array_frozen || !bio) - wait = true; - else if (conf->barrier && bio_data_dir(bio) == WRITE) { - if ((conf->mddev->curr_resync_completed - >= bio_end_sector(bio)) || - (conf->start_next_window + NEXT_NORMALIO_DISTANCE - <= bio->bi_iter.bi_sector)) - wait = false; - else - wait = true; - } + /* + * Don't worry about checking two atomic_t variables at same time + * here. If during we check conf->barrier[idx], the array is + * frozen (conf->array_frozen is 1), and chonf->barrier[idx] is + * 0, it is safe to return and make the I/O continue. Because the + * array is frozen, all I/O returned here will eventually complete + * or be queued, no race will happen. See code comment in + * frozen_array(). + */ + if (!READ_ONCE(conf->array_frozen) && + !atomic_read(&conf->barrier[idx])) + return; - return wait; + /* + * After holding conf->resync_lock, conf->nr_pending[idx] + * should be decreased before waiting for barrier to drop. + * Otherwise, we may encounter a race condition because + * raise_barrer() might be waiting for conf->nr_pending[idx] + * to be 0 at same time. + */ + spin_lock_irq(&conf->resync_lock); + atomic_inc(&conf->nr_waiting[idx]); + atomic_dec(&conf->nr_pending[idx]); + /* + * In case freeze_array() is waiting for + * get_unqueued_pending() == extra + */ + wake_up(&conf->wait_barrier); + /* Wait for the barrier in same barrier unit bucket to drop. */ + wait_event_lock_irq(conf->wait_barrier, + !conf->array_frozen && + !atomic_read(&conf->barrier[idx]), + conf->resync_lock); + atomic_inc(&conf->nr_pending[idx]); + atomic_dec(&conf->nr_waiting[idx]); + spin_unlock_irq(&conf->resync_lock); } -static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) +static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr) { - sector_t sector = 0; + int idx = sector_to_idx(sector_nr); - spin_lock_irq(&conf->resync_lock); - if (need_to_wait_for_sync(conf, bio)) { - conf->nr_waiting++; - /* Wait for the barrier to drop. - * However if there are already pending - * requests (preventing the barrier from - * rising completely), and the - * per-process bio queue isn't empty, - * then don't wait, as we need to empty - * that queue to allow conf->start_next_window - * to increase. - */ - raid1_log(conf->mddev, "wait barrier"); - wait_event_lock_irq(conf->wait_barrier, - !conf->array_frozen && - (!conf->barrier || - ((conf->start_next_window < - conf->next_resync + RESYNC_SECTORS) && - current->bio_list && - !bio_list_empty(current->bio_list))), - conf->resync_lock); - conf->nr_waiting--; - } - - if (bio && bio_data_dir(bio) == WRITE) { - if (bio->bi_iter.bi_sector >= conf->next_resync) { - if (conf->start_next_window == MaxSector) - conf->start_next_window = - conf->next_resync + - NEXT_NORMALIO_DISTANCE; - - if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) - <= bio->bi_iter.bi_sector) - conf->next_window_requests++; - else - conf->current_window_requests++; - sector = conf->start_next_window; - } - } + /* + * Very similar to _wait_barrier(). The difference is, for read + * I/O we don't need wait for sync I/O, but if the whole array + * is frozen, the read I/O still has to wait until the array is + * unfrozen. Since there is no ordering requirement with + * conf->barrier[idx] here, memory barrier is unnecessary as well. + */ + atomic_inc(&conf->nr_pending[idx]); - conf->nr_pending++; + if (!READ_ONCE(conf->array_frozen)) + return; + + spin_lock_irq(&conf->resync_lock); + atomic_inc(&conf->nr_waiting[idx]); + atomic_dec(&conf->nr_pending[idx]); + /* + * In case freeze_array() is waiting for + * get_unqueued_pending() == extra + */ + wake_up(&conf->wait_barrier); + /* Wait for array to be unfrozen */ + wait_event_lock_irq(conf->wait_barrier, + !conf->array_frozen, + conf->resync_lock); + atomic_inc(&conf->nr_pending[idx]); + atomic_dec(&conf->nr_waiting[idx]); spin_unlock_irq(&conf->resync_lock); - return sector; } -static void allow_barrier(struct r1conf *conf, sector_t start_next_window, - sector_t bi_sector) +static void wait_barrier(struct r1conf *conf, sector_t sector_nr) { - unsigned long flags; + int idx = sector_to_idx(sector_nr); - spin_lock_irqsave(&conf->resync_lock, flags); - conf->nr_pending--; - if (start_next_window) { - if (start_next_window == conf->start_next_window) { - if (conf->start_next_window + NEXT_NORMALIO_DISTANCE - <= bi_sector) - conf->next_window_requests--; - else - conf->current_window_requests--; - } else - conf->current_window_requests--; - - if (!conf->current_window_requests) { - if (conf->next_window_requests) { - conf->current_window_requests = - conf->next_window_requests; - conf->next_window_requests = 0; - conf->start_next_window += - NEXT_NORMALIO_DISTANCE; - } else - conf->start_next_window = MaxSector; - } - } - spin_unlock_irqrestore(&conf->resync_lock, flags); + _wait_barrier(conf, idx); +} + +static void wait_all_barriers(struct r1conf *conf) +{ + int idx; + + for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) + _wait_barrier(conf, idx); +} + +static void _allow_barrier(struct r1conf *conf, int idx) +{ + atomic_dec(&conf->nr_pending[idx]); wake_up(&conf->wait_barrier); } +static void allow_barrier(struct r1conf *conf, sector_t sector_nr) +{ + int idx = sector_to_idx(sector_nr); + + _allow_barrier(conf, idx); +} + +static void allow_all_barriers(struct r1conf *conf) +{ + int idx; + + for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) + _allow_barrier(conf, idx); +} + +/* conf->resync_lock should be held */ +static int get_unqueued_pending(struct r1conf *conf) +{ + int idx, ret; + + for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++) + ret += atomic_read(&conf->nr_pending[idx]) - + atomic_read(&conf->nr_queued[idx]); + + return ret; +} + static void freeze_array(struct r1conf *conf, int extra) { - /* stop syncio and normal IO and wait for everything to - * go quite. - * We wait until nr_pending match nr_queued+extra - * This is called in the context of one normal IO request - * that has failed. Thus any sync request that might be pending - * will be blocked by nr_pending, and we need to wait for - * pending IO requests to complete or be queued for re-try. - * Thus the number queued (nr_queued) plus this request (extra) - * must match the number of pending IOs (nr_pending) before - * we continue. + /* Stop sync I/O and normal I/O and wait for everything to + * go quiet. + * This is called in two situations: + * 1) management command handlers (reshape, remove disk, quiesce). + * 2) one normal I/O request failed. + + * After array_frozen is set to 1, new sync IO will be blocked at + * raise_barrier(), and new normal I/O will blocked at _wait_barrier() + * or wait_read_barrier(). The flying I/Os will either complete or be + * queued. When everything goes quite, there are only queued I/Os left. + + * Every flying I/O contributes to a conf->nr_pending[idx], idx is the + * barrier bucket index which this I/O request hits. When all sync and + * normal I/O are queued, sum of all conf->nr_pending[] will match sum + * of all conf->nr_queued[]. But normal I/O failure is an exception, + * in handle_read_error(), we may call freeze_array() before trying to + * fix the read error. In this case, the error read I/O is not queued, + * so get_unqueued_pending() == 1. + * + * Therefore before this function returns, we need to wait until + * get_unqueued_pendings(conf) gets equal to extra. For + * normal I/O context, extra is 1, in rested situations extra is 0. */ spin_lock_irq(&conf->resync_lock); conf->array_frozen = 1; raid1_log(conf->mddev, "wait freeze"); - wait_event_lock_irq_cmd(conf->wait_barrier, - conf->nr_pending == conf->nr_queued+extra, - conf->resync_lock, - flush_pending_writes(conf)); + wait_event_lock_irq_cmd( + conf->wait_barrier, + get_unqueued_pending(conf) == extra, + conf->resync_lock, + flush_pending_writes(conf)); spin_unlock_irq(&conf->resync_lock); } static void unfreeze_array(struct r1conf *conf) @@ -982,8 +1064,8 @@ static void unfreeze_array(struct r1conf *conf) /* reverse the effect of the freeze */ spin_lock_irq(&conf->resync_lock); conf->array_frozen = 0; - wake_up(&conf->wait_barrier); spin_unlock_irq(&conf->resync_lock); + wake_up(&conf->wait_barrier); } /* duplicate the data pages for behind I/O @@ -1070,11 +1152,28 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) kfree(plug); } -static void raid1_read_request(struct mddev *mddev, struct bio *bio, - struct r1bio *r1_bio) +static inline struct r1bio * +alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled) +{ + struct r1conf *conf = mddev->private; + struct r1bio *r1_bio; + + r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); + + r1_bio->master_bio = bio; + r1_bio->sectors = bio_sectors(bio) - sectors_handled; + r1_bio->state = 0; + r1_bio->mddev = mddev; + r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; + + return r1_bio; +} + +static void raid1_read_request(struct mddev *mddev, struct bio *bio) { struct r1conf *conf = mddev->private; struct raid1_info *mirror; + struct r1bio *r1_bio; struct bio *read_bio; struct bitmap *bitmap = mddev->bitmap; const int op = bio_op(bio); @@ -1083,8 +1182,29 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, int max_sectors; int rdisk; - wait_barrier(conf, bio); + /* + * Still need barrier for READ in case that whole + * array is frozen. + */ + wait_read_barrier(conf, bio->bi_iter.bi_sector); + + r1_bio = alloc_r1bio(mddev, bio, 0); + /* + * We might need to issue multiple reads to different + * devices if there are bad blocks around, so we keep + * track of the number of reads in bio->bi_phys_segments. + * If this is 0, there is only one r1_bio and no locking + * will be needed when requests complete. If it is + * non-zero, then it is the number of not-completed requests. + */ + bio->bi_phys_segments = 0; + bio_clear_flag(bio, BIO_SEG_VALID); + + /* + * make_request() can abort the operation when read-ahead is being + * used and no empty request is available. + */ read_again: rdisk = read_balance(conf, r1_bio, &max_sectors); @@ -1106,9 +1226,8 @@ read_again: atomic_read(&bitmap->behind_writes) == 0); } r1_bio->read_disk = rdisk; - r1_bio->start_next_window = 0; - read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); + read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors); @@ -1151,36 +1270,25 @@ read_again: */ reschedule_retry(r1_bio); - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - - r1_bio->master_bio = bio; - r1_bio->sectors = bio_sectors(bio) - sectors_handled; - r1_bio->state = 0; - r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; + r1_bio = alloc_r1bio(mddev, bio, sectors_handled); goto read_again; } else generic_make_request(read_bio); } -static void raid1_write_request(struct mddev *mddev, struct bio *bio, - struct r1bio *r1_bio) +static void raid1_write_request(struct mddev *mddev, struct bio *bio) { struct r1conf *conf = mddev->private; + struct r1bio *r1_bio; int i, disks; struct bitmap *bitmap = mddev->bitmap; unsigned long flags; - const int op = bio_op(bio); - const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); - const unsigned long do_flush_fua = (bio->bi_opf & - (REQ_PREFLUSH | REQ_FUA)); struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; struct raid1_plug_cb *plug = NULL; int first_clone; int sectors_handled; int max_sectors; - sector_t start_next_window; /* * Register the new request and wait if the reconstruction @@ -1216,7 +1324,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, } finish_wait(&conf->wait_barrier, &w); } - start_next_window = wait_barrier(conf, bio); + wait_barrier(conf, bio->bi_iter.bi_sector); + + r1_bio = alloc_r1bio(mddev, bio, 0); + + /* We might need to issue multiple writes to different + * devices if there are bad blocks around, so we keep + * track of the number of writes in bio->bi_phys_segments. + * If this is 0, there is only one r1_bio and no locking + * will be needed when requests complete. If it is + * non-zero, then it is the number of not-completed requests. + */ + bio->bi_phys_segments = 0; + bio_clear_flag(bio, BIO_SEG_VALID); if (conf->pending_count >= max_queued_requests) { md_wakeup_thread(mddev->thread); @@ -1237,7 +1357,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, disks = conf->raid_disks * 2; retry_write: - r1_bio->start_next_window = start_next_window; blocked_rdev = NULL; rcu_read_lock(); max_sectors = r1_bio->sectors; @@ -1304,25 +1423,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (unlikely(blocked_rdev)) { /* Wait for this device to become unblocked */ int j; - sector_t old = start_next_window; for (j = 0; j < i; j++) if (r1_bio->bios[j]) rdev_dec_pending(conf->mirrors[j].rdev, mddev); r1_bio->state = 0; - allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector); + allow_barrier(conf, bio->bi_iter.bi_sector); raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); md_wait_for_blocked_rdev(blocked_rdev, mddev); - start_next_window = wait_barrier(conf, bio); - /* - * We must make sure the multi r1bios of bio have - * the same value of bi_phys_segments - */ - if (bio->bi_phys_segments && old && - old != start_next_window) - /* Wait for the former r1bio(s) to complete */ - wait_event(conf->wait_barrier, - bio->bi_phys_segments == 1); + wait_barrier(conf, bio->bi_iter.bi_sector); goto retry_write; } @@ -1345,13 +1454,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, first_clone = 1; for (i = 0; i < disks; i++) { - struct bio *mbio; + struct bio *mbio = NULL; + sector_t offset; if (!r1_bio->bios[i]) continue; - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, - max_sectors); + offset = r1_bio->sector - bio->bi_iter.bi_sector; if (first_clone) { /* do behind I/O ? @@ -1361,8 +1469,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (bitmap && (atomic_read(&bitmap->behind_writes) < mddev->bitmap_info.max_write_behind) && - !waitqueue_active(&bitmap->behind_wait)) + !waitqueue_active(&bitmap->behind_wait)) { + mbio = bio_clone_bioset_partial(bio, GFP_NOIO, + mddev->bio_set, + offset << 9, + max_sectors << 9); alloc_behind_pages(mbio, r1_bio); + } bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors, @@ -1370,6 +1483,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, &r1_bio->state)); first_clone = 0; } + + if (!mbio) { + if (r1_bio->behind_bvecs) + mbio = bio_clone_bioset_partial(bio, GFP_NOIO, + mddev->bio_set, + offset << 9, + max_sectors << 9); + else { + mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); + bio_trim(mbio, offset, max_sectors); + } + } + if (r1_bio->behind_bvecs) { struct bio_vec *bvec; int j; @@ -1389,7 +1515,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, conf->mirrors[i].rdev->data_offset); mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - bio_set_op_attrs(mbio, op, do_flush_fua | do_sync); + mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA)); if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) && !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) && conf->raid_disks - mddev->degraded > 1) @@ -1430,12 +1556,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, /* We need another r1_bio. It has already been counted * in bio->bi_phys_segments */ - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - r1_bio->master_bio = bio; - r1_bio->sectors = bio_sectors(bio) - sectors_handled; - r1_bio->state = 0; - r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; + r1_bio = alloc_r1bio(mddev, bio, sectors_handled); goto retry_write; } @@ -1447,36 +1568,51 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, static void raid1_make_request(struct mddev *mddev, struct bio *bio) { - struct r1conf *conf = mddev->private; - struct r1bio *r1_bio; + struct bio *split; + sector_t sectors; - /* - * make_request() can abort the operation when read-ahead is being - * used and no empty request is available. - * - */ - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { + md_flush_request(mddev, bio); + return; + } - r1_bio->master_bio = bio; - r1_bio->sectors = bio_sectors(bio); - r1_bio->state = 0; - r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_iter.bi_sector; + /* if bio exceeds barrier unit boundary, split it */ + do { + sectors = align_to_barrier_unit_end( + bio->bi_iter.bi_sector, bio_sectors(bio)); + if (sectors < bio_sectors(bio)) { + split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set); + bio_chain(split, bio); + } else { + split = bio; + } - /* - * We might need to issue multiple reads to different devices if there - * are bad blocks around, so we keep track of the number of reads in - * bio->bi_phys_segments. If this is 0, there is only one r1_bio and - * no locking will be needed when requests complete. If it is - * non-zero, then it is the number of not-completed requests. - */ - bio->bi_phys_segments = 0; - bio_clear_flag(bio, BIO_SEG_VALID); + if (bio_data_dir(split) == READ) { + raid1_read_request(mddev, split); - if (bio_data_dir(bio) == READ) - raid1_read_request(mddev, bio, r1_bio); - else - raid1_write_request(mddev, bio, r1_bio); + /* + * If a bio is splitted, the first part of bio will + * pass barrier but the bio is queued in + * current->bio_list (see generic_make_request). If + * there is a raise_barrier() called here, the second + * part of bio can't pass barrier. But since the first + * part bio isn't dispatched to underlaying disks yet, + * the barrier is never released, hence raise_barrier + * will alays wait. We have a deadlock. + * Note, this only happens in read path. For write + * path, the first part of bio is dispatched in a + * schedule() call (because of blk plug) or offloaded + * to raid10d. + * Quitting from the function immediately can change + * the bio order queued in bio_list and avoid the deadlock. + */ + if (split != bio) { + generic_make_request(bio); + break; + } + } else + raid1_write_request(mddev, split); + } while (split != bio); } static void raid1_status(struct seq_file *seq, struct mddev *mddev) @@ -1567,19 +1703,11 @@ static void print_conf(struct r1conf *conf) static void close_sync(struct r1conf *conf) { - wait_barrier(conf, NULL); - allow_barrier(conf, 0, 0); + wait_all_barriers(conf); + allow_all_barriers(conf); mempool_destroy(conf->r1buf_pool); conf->r1buf_pool = NULL; - - spin_lock_irq(&conf->resync_lock); - conf->next_resync = MaxSector - 2 * NEXT_NORMALIO_DISTANCE; - conf->start_next_window = MaxSector; - conf->current_window_requests += - conf->next_window_requests; - conf->next_window_requests = 0; - spin_unlock_irq(&conf->resync_lock); } static int raid1_spare_active(struct mddev *mddev) @@ -2276,7 +2404,8 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) wbio->bi_vcnt = vcnt; } else { - wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); + wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, + mddev->bio_set); } bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); @@ -2326,8 +2455,9 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) { - int m; + int m, idx; bool fail = false; + for (m = 0; m < conf->raid_disks * 2 ; m++) if (r1_bio->bios[m] == IO_MADE_GOOD) { struct md_rdev *rdev = conf->mirrors[m].rdev; @@ -2353,8 +2483,14 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) if (fail) { spin_lock_irq(&conf->device_lock); list_add(&r1_bio->retry_list, &conf->bio_end_io_list); - conf->nr_queued++; + idx = sector_to_idx(r1_bio->sector); + atomic_inc(&conf->nr_queued[idx]); spin_unlock_irq(&conf->device_lock); + /* + * In case freeze_array() is waiting for condition + * get_unqueued_pending() == extra to be true. + */ + wake_up(&conf->wait_barrier); md_wakeup_thread(conf->mddev->thread); } else { if (test_bit(R1BIO_WriteError, &r1_bio->state)) @@ -2414,7 +2550,8 @@ read_more: const unsigned long do_sync = r1_bio->master_bio->bi_opf & REQ_SYNC; r1_bio->read_disk = disk; - bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); + bio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, + mddev->bio_set); bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors); r1_bio->bios[r1_bio->read_disk] = bio; @@ -2448,15 +2585,8 @@ read_more: generic_make_request(bio); bio = NULL; - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - - r1_bio->master_bio = mbio; - r1_bio->sectors = bio_sectors(mbio) - sectors_handled; - r1_bio->state = 0; + r1_bio = alloc_r1bio(mddev, mbio, sectors_handled); set_bit(R1BIO_ReadError, &r1_bio->state); - r1_bio->mddev = mddev; - r1_bio->sector = mbio->bi_iter.bi_sector + - sectors_handled; goto read_more; } else { @@ -2475,6 +2605,7 @@ static void raid1d(struct md_thread *thread) struct r1conf *conf = mddev->private; struct list_head *head = &conf->retry_list; struct blk_plug plug; + int idx; md_check_recovery(mddev); @@ -2482,17 +2613,15 @@ static void raid1d(struct md_thread *thread) !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { LIST_HEAD(tmp); spin_lock_irqsave(&conf->device_lock, flags); - if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { - while (!list_empty(&conf->bio_end_io_list)) { - list_move(conf->bio_end_io_list.prev, &tmp); - conf->nr_queued--; - } - } + if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) + list_splice_init(&conf->bio_end_io_list, &tmp); spin_unlock_irqrestore(&conf->device_lock, flags); while (!list_empty(&tmp)) { r1_bio = list_first_entry(&tmp, struct r1bio, retry_list); list_del(&r1_bio->retry_list); + idx = sector_to_idx(r1_bio->sector); + atomic_dec(&conf->nr_queued[idx]); if (mddev->degraded) set_bit(R1BIO_Degraded, &r1_bio->state); if (test_bit(R1BIO_WriteError, &r1_bio->state)) @@ -2513,7 +2642,8 @@ static void raid1d(struct md_thread *thread) } r1_bio = list_entry(head->prev, struct r1bio, retry_list); list_del(head->prev); - conf->nr_queued--; + idx = sector_to_idx(r1_bio->sector); + atomic_dec(&conf->nr_queued[idx]); spin_unlock_irqrestore(&conf->device_lock, flags); mddev = r1_bio->mddev; @@ -2552,7 +2682,6 @@ static int init_resync(struct r1conf *conf) conf->poolinfo); if (!conf->r1buf_pool) return -ENOMEM; - conf->next_resync = 0; return 0; } @@ -2581,6 +2710,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, int still_degraded = 0; int good_sectors = RESYNC_SECTORS; int min_bad = 0; /* number of sectors that are bad in all devices */ + int idx = sector_to_idx(sector_nr); if (!conf->r1buf_pool) if (init_resync(conf)) @@ -2630,7 +2760,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * If there is non-resync activity waiting for a turn, then let it * though before starting on this new sync request. */ - if (conf->nr_waiting) + if (atomic_read(&conf->nr_waiting[idx])) schedule_timeout_uninterruptible(1); /* we are incrementing sector_nr below. To be safe, we check against @@ -2657,6 +2787,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, r1_bio->sector = sector_nr; r1_bio->state = 0; set_bit(R1BIO_IsSync, &r1_bio->state); + /* make sure good_sectors won't go across barrier unit boundary */ + good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors); for (i = 0; i < conf->raid_disks * 2; i++) { struct md_rdev *rdev; @@ -2887,6 +3019,26 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (!conf) goto abort; + conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR, + sizeof(atomic_t), GFP_KERNEL); + if (!conf->nr_pending) + goto abort; + + conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR, + sizeof(atomic_t), GFP_KERNEL); + if (!conf->nr_waiting) + goto abort; + + conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR, + sizeof(atomic_t), GFP_KERNEL); + if (!conf->nr_queued) + goto abort; + + conf->barrier = kcalloc(BARRIER_BUCKETS_NR, + sizeof(atomic_t), GFP_KERNEL); + if (!conf->barrier) + goto abort; + conf->mirrors = kzalloc(sizeof(struct raid1_info) * mddev->raid_disks * 2, GFP_KERNEL); @@ -2942,9 +3094,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) conf->pending_count = 0; conf->recovery_disabled = mddev->recovery_disabled - 1; - conf->start_next_window = MaxSector; - conf->current_window_requests = conf->next_window_requests = 0; - err = -EIO; for (i = 0; i < conf->raid_disks * 2; i++) { @@ -2987,6 +3136,10 @@ static struct r1conf *setup_conf(struct mddev *mddev) kfree(conf->mirrors); safe_put_page(conf->tmppage); kfree(conf->poolinfo); + kfree(conf->nr_pending); + kfree(conf->nr_waiting); + kfree(conf->nr_queued); + kfree(conf->barrier); kfree(conf); } return ERR_PTR(err); @@ -3088,6 +3241,10 @@ static void raid1_free(struct mddev *mddev, void *priv) kfree(conf->mirrors); safe_put_page(conf->tmppage); kfree(conf->poolinfo); + kfree(conf->nr_pending); + kfree(conf->nr_waiting); + kfree(conf->nr_queued); + kfree(conf->barrier); kfree(conf); } @@ -3110,8 +3267,6 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors) return ret; } md_set_array_sectors(mddev, newsize); - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && mddev->recovery_cp > mddev->dev_sectors) { mddev->recovery_cp = mddev->dev_sectors; diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index c52ef424a24b..dd22a37d0d83 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -1,6 +1,30 @@ #ifndef _RAID1_H #define _RAID1_H +/* + * each barrier unit size is 64MB fow now + * note: it must be larger than RESYNC_DEPTH + */ +#define BARRIER_UNIT_SECTOR_BITS 17 +#define BARRIER_UNIT_SECTOR_SIZE (1<<17) +/* + * In struct r1conf, the following members are related to I/O barrier + * buckets, + * atomic_t *nr_pending; + * atomic_t *nr_waiting; + * atomic_t *nr_queued; + * atomic_t *barrier; + * Each of them points to array of atomic_t variables, each array is + * designed to have BARRIER_BUCKETS_NR elements and occupy a single + * memory page. The data width of atomic_t variables is 4 bytes, equal + * to 1<<(ilog2(sizeof(atomic_t))), BARRIER_BUCKETS_NR_BITS is defined + * as (PAGE_SHIFT - ilog2(sizeof(int))) to make sure an array of + * atomic_t variables with BARRIER_BUCKETS_NR elements just exactly + * occupies a single memory page. + */ +#define BARRIER_BUCKETS_NR_BITS (PAGE_SHIFT - ilog2(sizeof(atomic_t))) +#define BARRIER_BUCKETS_NR (1<<BARRIER_BUCKETS_NR_BITS) + struct raid1_info { struct md_rdev *rdev; sector_t head_position; @@ -35,25 +59,6 @@ struct r1conf { */ int raid_disks; - /* During resync, read_balancing is only allowed on the part - * of the array that has been resynced. 'next_resync' tells us - * where that is. - */ - sector_t next_resync; - - /* When raid1 starts resync, we divide array into four partitions - * |---------|--------------|---------------------|-------------| - * next_resync start_next_window end_window - * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE - * end_window = start_next_window + NEXT_NORMALIO_DISTANCE - * current_window_requests means the count of normalIO between - * start_next_window and end_window. - * next_window_requests means the count of normalIO after end_window. - * */ - sector_t start_next_window; - int current_window_requests; - int next_window_requests; - spinlock_t device_lock; /* list of 'struct r1bio' that need to be processed by raid1d, @@ -79,10 +84,10 @@ struct r1conf { */ wait_queue_head_t wait_barrier; spinlock_t resync_lock; - int nr_pending; - int nr_waiting; - int nr_queued; - int barrier; + atomic_t *nr_pending; + atomic_t *nr_waiting; + atomic_t *nr_queued; + atomic_t *barrier; int array_frozen; /* Set to 1 if a full sync is needed, (fresh device added). @@ -135,7 +140,6 @@ struct r1bio { * in this BehindIO request */ sector_t sector; - sector_t start_next_window; int sectors; unsigned long state; struct mddev *mddev; @@ -185,4 +189,10 @@ enum r1bio_state { R1BIO_WriteError, R1BIO_FailFast, }; + +static inline int sector_to_idx(sector_t sector) +{ + return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS, + BARRIER_BUCKETS_NR_BITS); +} #endif diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 1920756828df..e89a8d78a9ed 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -860,7 +860,7 @@ static int raid10_congested(struct mddev *mddev, int bits) if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= bdi_congested(q->backing_dev_info, bits); } } rcu_read_unlock(); @@ -974,7 +974,8 @@ static void wait_barrier(struct r10conf *conf) !conf->barrier || (atomic_read(&conf->nr_pending) && current->bio_list && - !bio_list_empty(current->bio_list)), + (!bio_list_empty(¤t->bio_list[0]) || + !bio_list_empty(¤t->bio_list[1]))), conf->resync_lock); conf->nr_waiting--; if (!conf->nr_waiting) @@ -1132,7 +1133,7 @@ read_again: } slot = r10_bio->read_slot; - read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); + read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); @@ -1406,7 +1407,7 @@ retry_write: int d = r10_bio->devs[i].devnum; if (r10_bio->devs[i].bio) { struct md_rdev *rdev = conf->mirrors[d].rdev; - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); r10_bio->devs[i].bio = mbio; @@ -1457,7 +1458,7 @@ retry_write: smp_mb(); rdev = conf->mirrors[d].rdev; } - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); r10_bio->devs[i].repl_bio = mbio; @@ -1477,11 +1478,24 @@ retry_write: mbio->bi_bdev = (void*)rdev; atomic_inc(&r10_bio->remaining); + + cb = blk_check_plugged(raid10_unplug, mddev, + sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid10_plug_cb, + cb); + else + plug = NULL; spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) + if (!plug) md_wakeup_thread(mddev->thread); } } @@ -1571,7 +1585,25 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio) split = bio; } + /* + * If a bio is splitted, the first part of bio will pass + * barrier but the bio is queued in current->bio_list (see + * generic_make_request). If there is a raise_barrier() called + * here, the second part of bio can't pass barrier. But since + * the first part bio isn't dispatched to underlaying disks + * yet, the barrier is never released, hence raise_barrier will + * alays wait. We have a deadlock. + * Note, this only happens in read path. For write path, the + * first part of bio is dispatched in a schedule() call + * (because of blk plug) or offloaded to raid10d. + * Quitting from the function immediately can change the bio + * order queued in bio_list and avoid the deadlock. + */ __make_request(mddev, split); + if (split != bio && bio_data_dir(bio) == READ) { + generic_make_request(bio); + break; + } } while (split != bio); /* In case raid10d snuck in to freeze_array */ @@ -2565,7 +2597,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) if (sectors > sect_to_write) sectors = sect_to_write; /* Write at 'sector' for 'sectors' */ - wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + wbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors); wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector); wbio->bi_iter.bi_sector = wsector + @@ -2641,8 +2673,7 @@ read_more: mdname(mddev), bdevname(rdev->bdev, b), (unsigned long long)r10_bio->sector); - bio = bio_clone_mddev(r10_bio->master_bio, - GFP_NOIO, mddev); + bio = bio_clone_fast(r10_bio->master_bio, GFP_NOIO, mddev->bio_set); bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); r10_bio->devs[slot].bio = bio; r10_bio->devs[slot].rdev = rdev; @@ -3841,8 +3872,8 @@ static int raid10_run(struct mddev *mddev) * maybe... */ stripe /= conf->geo.near_copies; - if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) - mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) + mddev->queue->backing_dev_info->ra_pages = 2 * stripe; } if (md_integrity_register(mddev)) @@ -3944,10 +3975,6 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) return ret; } md_set_array_sectors(mddev, size); - if (mddev->queue) { - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); - } if (sectors > mddev->dev_sectors && mddev->recovery_cp > oldsize) { mddev->recovery_cp = oldsize; @@ -4643,8 +4670,8 @@ static void end_reshape(struct r10conf *conf) int stripe = conf->geo.raid_disks * ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE); stripe /= conf->geo.near_copies; - if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) - conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) + conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; } conf->fullsync = 0; } diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 302dea3296ba..3f307be01b10 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -20,6 +20,7 @@ #include <linux/crc32c.h> #include <linux/random.h> #include <linux/kthread.h> +#include <linux/types.h> #include "md.h" #include "raid5.h" #include "bitmap.h" @@ -164,9 +165,60 @@ struct r5l_log { struct work_struct deferred_io_work; /* to disable write back during in degraded mode */ struct work_struct disable_writeback_work; + + /* to for chunk_aligned_read in writeback mode, details below */ + spinlock_t tree_lock; + struct radix_tree_root big_stripe_tree; }; /* + * Enable chunk_aligned_read() with write back cache. + * + * Each chunk may contain more than one stripe (for example, a 256kB + * chunk contains 64 4kB-page, so this chunk contain 64 stripes). For + * chunk_aligned_read, these stripes are grouped into one "big_stripe". + * For each big_stripe, we count how many stripes of this big_stripe + * are in the write back cache. These data are tracked in a radix tree + * (big_stripe_tree). We use radix_tree item pointer as the counter. + * r5c_tree_index() is used to calculate keys for the radix tree. + * + * chunk_aligned_read() calls r5c_big_stripe_cached() to look up + * big_stripe of each chunk in the tree. If this big_stripe is in the + * tree, chunk_aligned_read() aborts. This look up is protected by + * rcu_read_lock(). + * + * It is necessary to remember whether a stripe is counted in + * big_stripe_tree. Instead of adding new flag, we reuses existing flags: + * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these + * two flags are set, the stripe is counted in big_stripe_tree. This + * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to + * r5c_try_caching_write(); and moving clear_bit of + * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to + * r5c_finish_stripe_write_out(). + */ + +/* + * radix tree requests lowest 2 bits of data pointer to be 2b'00. + * So it is necessary to left shift the counter by 2 bits before using it + * as data pointer of the tree. + */ +#define R5C_RADIX_COUNT_SHIFT 2 + +/* + * calculate key for big_stripe_tree + * + * sect: align_bi->bi_iter.bi_sector or sh->sector + */ +static inline sector_t r5c_tree_index(struct r5conf *conf, + sector_t sect) +{ + sector_t offset; + + offset = sector_div(sect, conf->chunk_sectors); + return sect; +} + +/* * an IO range starts from a meta data block and end at the next meta data * block. The io unit's the meta data block tracks data/parity followed it. io * unit is written to log disk with normal write, as we always flush log disk @@ -337,17 +389,30 @@ void r5c_check_cached_full_stripe(struct r5conf *conf) /* * Total log space (in sectors) needed to flush all data in cache * - * Currently, writing-out phase automatically includes all pending writes - * to the same sector. So the reclaim of each stripe takes up to - * (conf->raid_disks + 1) pages of log space. + * To avoid deadlock due to log space, it is necessary to reserve log + * space to flush critical stripes (stripes that occupying log space near + * last_checkpoint). This function helps check how much log space is + * required to flush all cached stripes. * - * To totally avoid deadlock due to log space, the code reserves - * (conf->raid_disks + 1) pages for each stripe in cache, which is not - * necessary in most cases. + * To reduce log space requirements, two mechanisms are used to give cache + * flush higher priorities: + * 1. In handle_stripe_dirtying() and schedule_reconstruction(), + * stripes ALREADY in journal can be flushed w/o pending writes; + * 2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal + * can be delayed (r5l_add_no_space_stripe). * - * To improve this, we will need writing-out phase to be able to NOT include - * pending writes, which will reduce the requirement to - * (conf->max_degraded + 1) pages per stripe in cache. + * In cache flush, the stripe goes through 1 and then 2. For a stripe that + * already passed 1, flushing it requires at most (conf->max_degraded + 1) + * pages of journal space. For stripes that has not passed 1, flushing it + * requires (conf->raid_disks + 1) pages of journal space. There are at + * most (conf->group_cnt + 1) stripe that passed 1. So total journal space + * required to flush all cached stripes (in pages) is: + * + * (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) + + * (group_cnt + 1) * (raid_disks + 1) + * or + * (stripe_in_journal_count) * (max_degraded + 1) + + * (group_cnt + 1) * (raid_disks - max_degraded) */ static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) { @@ -356,8 +421,9 @@ static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) if (!r5c_is_writeback(log)) return 0; - return BLOCK_SECTORS * (conf->raid_disks + 1) * - atomic_read(&log->stripe_in_journal_count); + return BLOCK_SECTORS * + ((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) + + (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1)); } /* @@ -412,16 +478,6 @@ void r5c_make_stripe_write_out(struct stripe_head *sh) if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); - - if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { - BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); - atomic_dec(&conf->r5c_cached_partial_stripes); - } - - if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { - BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); - atomic_dec(&conf->r5c_cached_full_stripes); - } } static void r5c_handle_data_cached(struct stripe_head *sh) @@ -1271,6 +1327,10 @@ static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh) atomic_inc(&conf->active_stripes); r5c_make_stripe_write_out(sh); + if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) + atomic_inc(&conf->r5c_flushing_partial_stripes); + else + atomic_inc(&conf->r5c_flushing_full_stripes); raid5_release_stripe(sh); } @@ -1313,12 +1373,16 @@ static void r5c_do_reclaim(struct r5conf *conf) unsigned long flags; int total_cached; int stripes_to_flush; + int flushing_partial, flushing_full; if (!r5c_is_writeback(log)) return; + flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes); + flushing_full = atomic_read(&conf->r5c_flushing_full_stripes); total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + - atomic_read(&conf->r5c_cached_full_stripes); + atomic_read(&conf->r5c_cached_full_stripes) - + flushing_full - flushing_partial; if (total_cached > conf->min_nr_stripes * 3 / 4 || atomic_read(&conf->empty_inactive_list_nr) > 0) @@ -1328,7 +1392,7 @@ static void r5c_do_reclaim(struct r5conf *conf) */ stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP; else if (total_cached > conf->min_nr_stripes * 1 / 2 || - atomic_read(&conf->r5c_cached_full_stripes) > + atomic_read(&conf->r5c_cached_full_stripes) - flushing_full > R5C_FULL_STRIPE_FLUSH_BATCH) /* * if stripe cache pressure moderate, or if there is many full @@ -1362,9 +1426,9 @@ static void r5c_do_reclaim(struct r5conf *conf) !test_bit(STRIPE_HANDLE, &sh->state) && atomic_read(&sh->count) == 0) { r5c_flush_stripe(conf, sh); + if (count++ >= R5C_RECLAIM_STRIPE_GROUP) + break; } - if (count++ >= R5C_RECLAIM_STRIPE_GROUP) - break; } spin_unlock(&conf->device_lock); spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); @@ -2320,6 +2384,10 @@ int r5c_try_caching_write(struct r5conf *conf, int i; struct r5dev *dev; int to_cache = 0; + void **pslot; + sector_t tree_index; + int ret; + uintptr_t refcount; BUG_ON(!r5c_is_writeback(log)); @@ -2364,6 +2432,44 @@ int r5c_try_caching_write(struct r5conf *conf, } } + /* if the stripe is not counted in big_stripe_tree, add it now */ + if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) && + !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { + tree_index = r5c_tree_index(conf, sh->sector); + spin_lock(&log->tree_lock); + pslot = radix_tree_lookup_slot(&log->big_stripe_tree, + tree_index); + if (pslot) { + refcount = (uintptr_t)radix_tree_deref_slot_protected( + pslot, &log->tree_lock) >> + R5C_RADIX_COUNT_SHIFT; + radix_tree_replace_slot( + &log->big_stripe_tree, pslot, + (void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT)); + } else { + /* + * this radix_tree_insert can fail safely, so no + * need to call radix_tree_preload() + */ + ret = radix_tree_insert( + &log->big_stripe_tree, tree_index, + (void *)(1 << R5C_RADIX_COUNT_SHIFT)); + if (ret) { + spin_unlock(&log->tree_lock); + r5c_make_stripe_write_out(sh); + return -EAGAIN; + } + } + spin_unlock(&log->tree_lock); + + /* + * set STRIPE_R5C_PARTIAL_STRIPE, this shows the stripe is + * counted in the radix tree + */ + set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state); + atomic_inc(&conf->r5c_cached_partial_stripes); + } + for (i = disks; i--; ) { dev = &sh->dev[i]; if (dev->towrite) { @@ -2438,17 +2544,20 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, struct stripe_head_state *s) { + struct r5l_log *log = conf->log; int i; int do_wakeup = 0; + sector_t tree_index; + void **pslot; + uintptr_t refcount; - if (!conf->log || - !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) + if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) return; WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); - if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) + if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) return; for (i = sh->disks; i--; ) { @@ -2470,12 +2579,45 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, if (do_wakeup) wake_up(&conf->wait_for_overlap); - spin_lock_irq(&conf->log->stripe_in_journal_lock); + spin_lock_irq(&log->stripe_in_journal_lock); list_del_init(&sh->r5c); - spin_unlock_irq(&conf->log->stripe_in_journal_lock); + spin_unlock_irq(&log->stripe_in_journal_lock); sh->log_start = MaxSector; - atomic_dec(&conf->log->stripe_in_journal_count); - r5c_update_log_state(conf->log); + + atomic_dec(&log->stripe_in_journal_count); + r5c_update_log_state(log); + + /* stop counting this stripe in big_stripe_tree */ + if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) || + test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { + tree_index = r5c_tree_index(conf, sh->sector); + spin_lock(&log->tree_lock); + pslot = radix_tree_lookup_slot(&log->big_stripe_tree, + tree_index); + BUG_ON(pslot == NULL); + refcount = (uintptr_t)radix_tree_deref_slot_protected( + pslot, &log->tree_lock) >> + R5C_RADIX_COUNT_SHIFT; + if (refcount == 1) + radix_tree_delete(&log->big_stripe_tree, tree_index); + else + radix_tree_replace_slot( + &log->big_stripe_tree, pslot, + (void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT)); + spin_unlock(&log->tree_lock); + } + + if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { + BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); + atomic_dec(&conf->r5c_flushing_partial_stripes); + atomic_dec(&conf->r5c_cached_partial_stripes); + } + + if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { + BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); + atomic_dec(&conf->r5c_flushing_full_stripes); + atomic_dec(&conf->r5c_cached_full_stripes); + } } int @@ -2535,6 +2677,22 @@ r5c_cache_data(struct r5l_log *log, struct stripe_head *sh, return 0; } +/* check whether this big stripe is in write back cache. */ +bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect) +{ + struct r5l_log *log = conf->log; + sector_t tree_index; + void *slot; + + if (!log) + return false; + + WARN_ON_ONCE(!rcu_read_lock_held()); + tree_index = r5c_tree_index(conf, sect); + slot = radix_tree_lookup(&log->big_stripe_tree, tree_index); + return slot != NULL; +} + static int r5l_load_log(struct r5l_log *log) { struct md_rdev *rdev = log->rdev; @@ -2681,6 +2839,9 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) if (!log->meta_pool) goto out_mempool; + spin_lock_init(&log->tree_lock); + INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN); + log->reclaim_thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, "reclaim"); if (!log->reclaim_thread) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3c7e106c12a2..ed5cd705b985 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -55,6 +55,8 @@ #include <linux/ratelimit.h> #include <linux/nodemask.h> #include <linux/flex_array.h> +#include <linux/sched/signal.h> + #include <trace/events/block.h> #include "md.h" @@ -281,13 +283,13 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, atomic_dec(&conf->r5c_cached_partial_stripes); list_add_tail(&sh->lru, &conf->r5c_full_stripe_list); r5c_check_cached_full_stripe(conf); - } else { - /* partial stripe */ - if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE, - &sh->state)) - atomic_inc(&conf->r5c_cached_partial_stripes); + } else + /* + * STRIPE_R5C_PARTIAL_STRIPE is set in + * r5c_try_caching_write(). No need to + * set it again. + */ list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list); - } } } } @@ -353,17 +355,15 @@ static void release_inactive_stripe_list(struct r5conf *conf, static int release_stripe_list(struct r5conf *conf, struct list_head *temp_inactive_list) { - struct stripe_head *sh; + struct stripe_head *sh, *t; int count = 0; struct llist_node *head; head = llist_del_all(&conf->released_stripes); head = llist_reverse_order(head); - while (head) { + llist_for_each_entry_safe(sh, t, head, release_list) { int hash; - sh = llist_entry(head, struct stripe_head, release_list); - head = llist_next(head); /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ smp_mb(); clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); @@ -863,6 +863,43 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) return 1; } +static void flush_deferred_bios(struct r5conf *conf) +{ + struct bio_list tmp; + struct bio *bio; + + if (!conf->batch_bio_dispatch || !conf->group_cnt) + return; + + bio_list_init(&tmp); + spin_lock(&conf->pending_bios_lock); + bio_list_merge(&tmp, &conf->pending_bios); + bio_list_init(&conf->pending_bios); + spin_unlock(&conf->pending_bios_lock); + + while ((bio = bio_list_pop(&tmp))) + generic_make_request(bio); +} + +static void defer_bio_issue(struct r5conf *conf, struct bio *bio) +{ + /* + * change group_cnt will drain all bios, so this is safe + * + * A read generally means a read-modify-write, which usually means a + * randwrite, so we don't delay it + */ + if (!conf->batch_bio_dispatch || !conf->group_cnt || + bio_op(bio) == REQ_OP_READ) { + generic_make_request(bio); + return; + } + spin_lock(&conf->pending_bios_lock); + bio_list_add(&conf->pending_bios, bio); + spin_unlock(&conf->pending_bios_lock); + md_wakeup_thread(conf->mddev->thread); +} + static void raid5_end_read_request(struct bio *bi); static void @@ -1043,7 +1080,7 @@ again: trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), bi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); - generic_make_request(bi); + defer_bio_issue(conf, bi); } if (rrdev) { if (s->syncing || s->expanding || s->expanded @@ -1088,7 +1125,7 @@ again: trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), rbi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); - generic_make_request(rbi); + defer_bio_issue(conf, rbi); } if (!rdev && !rrdev) { if (op_is_write(op)) @@ -1364,7 +1401,8 @@ static int set_syndrome_sources(struct page **srcs, (test_bit(R5_Wantdrain, &dev->flags) || test_bit(R5_InJournal, &dev->flags))) || (srctype == SYNDROME_SRC_WRITTEN && - dev->written)) { + (dev->written || + test_bit(R5_InJournal, &dev->flags)))) { if (test_bit(R5_InJournal, &dev->flags)) srcs[slot] = sh->dev[i].orig_page; else @@ -2914,12 +2952,36 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) * like to flush data in journal to RAID disks first, so complex rmw * is handled in the write patch (handle_stripe_dirtying). * + * 2. when journal space is critical (R5C_LOG_CRITICAL=1) + * + * It is important to be able to flush all stripes in raid5-cache. + * Therefore, we need reserve some space on the journal device for + * these flushes. If flush operation includes pending writes to the + * stripe, we need to reserve (conf->raid_disk + 1) pages per stripe + * for the flush out. If we exclude these pending writes from flush + * operation, we only need (conf->max_degraded + 1) pages per stripe. + * Therefore, excluding pending writes in these cases enables more + * efficient use of the journal device. + * + * Note: To make sure the stripe makes progress, we only delay + * towrite for stripes with data already in journal (injournal > 0). + * When LOG_CRITICAL, stripes with injournal == 0 will be sent to + * no_space_stripes list. + * */ -static inline bool delay_towrite(struct r5dev *dev, - struct stripe_head_state *s) +static inline bool delay_towrite(struct r5conf *conf, + struct r5dev *dev, + struct stripe_head_state *s) { - return !test_bit(R5_OVERWRITE, &dev->flags) && - !test_bit(R5_Insync, &dev->flags) && s->injournal; + /* case 1 above */ + if (!test_bit(R5_OVERWRITE, &dev->flags) && + !test_bit(R5_Insync, &dev->flags) && s->injournal) + return true; + /* case 2 above */ + if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && + s->injournal > 0) + return true; + return false; } static void @@ -2942,7 +3004,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (dev->towrite && !delay_towrite(dev, s)) { + if (dev->towrite && !delay_towrite(conf, dev, s)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantdrain, &dev->flags); if (!expand) @@ -3694,7 +3756,7 @@ static int handle_stripe_dirtying(struct r5conf *conf, } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; - if (((dev->towrite && !delay_towrite(dev, s)) || + if (((dev->towrite && !delay_towrite(conf, dev, s)) || i == sh->pd_idx || i == sh->qd_idx || test_bit(R5_InJournal, &dev->flags)) && !test_bit(R5_LOCKED, &dev->flags) && @@ -3718,8 +3780,8 @@ static int handle_stripe_dirtying(struct r5conf *conf, } } - pr_debug("for sector %llu, rmw=%d rcw=%d\n", - (unsigned long long)sh->sector, rmw, rcw); + pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n", + (unsigned long long)sh->sector, sh->state, rmw, rcw); set_bit(STRIPE_HANDLE, &sh->state); if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { /* prefer read-modify-write, but need to get some data */ @@ -3759,7 +3821,7 @@ static int handle_stripe_dirtying(struct r5conf *conf, for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (((dev->towrite && !delay_towrite(dev, s)) || + if (((dev->towrite && !delay_towrite(conf, dev, s)) || i == sh->pd_idx || i == sh->qd_idx || test_bit(R5_InJournal, &dev->flags)) && !test_bit(R5_LOCKED, &dev->flags) && @@ -4995,9 +5057,9 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) return 0; } /* - * use bio_clone_mddev to make a copy of the bio + * use bio_clone_fast to make a copy of the bio */ - align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); + align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set); if (!align_bi) return 0; /* @@ -5025,6 +5087,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) rdev->recovery_offset >= end_sector))) rdev = NULL; } + + if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) { + rcu_read_unlock(); + bio_put(align_bi); + return 0; + } + if (rdev) { sector_t first_bad; int bad_sectors; @@ -5381,7 +5450,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) * data on failed drives. */ if (rw == READ && mddev->degraded == 0 && - !r5c_is_writeback(conf->log) && mddev->reshape_position == MaxSector) { bi = chunk_aligned_read(mddev, bi); if (!bi) @@ -6126,6 +6194,8 @@ static void raid5d(struct md_thread *thread) mutex_unlock(&conf->cache_size_mutex); } + flush_deferred_bios(conf); + r5l_flush_stripe_to_raid(conf->log); async_tx_issue_pending_all(); @@ -6331,10 +6401,10 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) mddev_suspend(mddev); conf->skip_copy = new; if (new) - mddev->queue->backing_dev_info.capabilities |= + mddev->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; else - mddev->queue->backing_dev_info.capabilities &= + mddev->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES; mddev_resume(mddev); } @@ -6711,6 +6781,18 @@ static struct r5conf *setup_conf(struct mddev *mddev) atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); atomic_set(&conf->active_aligned_reads, 0); + bio_list_init(&conf->pending_bios); + spin_lock_init(&conf->pending_bios_lock); + conf->batch_bio_dispatch = true; + rdev_for_each(rdev, mddev) { + if (test_bit(Journal, &rdev->flags)) + continue; + if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { + conf->batch_bio_dispatch = false; + break; + } + } + conf->bypass_threshold = BYPASS_THRESHOLD; conf->recovery_disabled = mddev->recovery_disabled - 1; @@ -6757,6 +6839,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) INIT_LIST_HEAD(&conf->r5c_full_stripe_list); atomic_set(&conf->r5c_cached_partial_stripes, 0); INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); + atomic_set(&conf->r5c_flushing_full_stripes, 0); + atomic_set(&conf->r5c_flushing_partial_stripes, 0); conf->level = mddev->new_level; conf->chunk_sectors = mddev->new_chunk_sectors; @@ -7153,8 +7237,8 @@ static int raid5_run(struct mddev *mddev) int data_disks = conf->previous_raid_disks - conf->max_degraded; int stripe = data_disks * ((mddev->chunk_sectors << 9) / PAGE_SIZE); - if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) - mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) + mddev->queue->backing_dev_info->ra_pages = 2 * stripe; chunk_size = mddev->chunk_sectors << 9; blk_queue_io_min(mddev->queue, chunk_size); @@ -7522,8 +7606,6 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) return ret; } md_set_array_sectors(mddev, newsize); - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && mddev->recovery_cp > mddev->dev_sectors) { mddev->recovery_cp = mddev->dev_sectors; @@ -7763,8 +7845,8 @@ static void end_reshape(struct r5conf *conf) int data_disks = conf->raid_disks - conf->max_degraded; int stripe = data_disks * ((conf->chunk_sectors << 9) / PAGE_SIZE); - if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) - conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) + conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; } } } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 1440fa26e296..4bb27b97bf6b 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -663,6 +663,8 @@ struct r5conf { struct list_head r5c_full_stripe_list; atomic_t r5c_cached_partial_stripes; struct list_head r5c_partial_stripe_list; + atomic_t r5c_flushing_full_stripes; + atomic_t r5c_flushing_partial_stripes; atomic_t empty_inactive_list_nr; struct llist_head released_stripes; @@ -684,6 +686,10 @@ struct r5conf { int group_cnt; int worker_cnt_per_group; struct r5l_log *log; + + struct bio_list pending_bios; + spinlock_t pending_bios_lock; + bool batch_bio_dispatch; }; @@ -788,4 +794,5 @@ extern void r5c_check_stripe_cache_usage(struct r5conf *conf); extern void r5c_check_cached_full_stripe(struct r5conf *conf); extern struct md_sysfs_entry r5c_journal_mode; extern void r5c_update_on_rdev_error(struct mddev *mddev); +extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); #endif |