diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-02-21 23:11:41 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-02-21 23:11:41 +0300 |
commit | 7a771ceac771d009f7203c40b256b0608d7ea2f8 (patch) | |
tree | 940260bccb165f47669397515c00900629c01803 | |
parent | e67bd12d6036ae3de9eeb0ba52e43691264ec850 (diff) | |
parent | d67a5f4b5947aba4bfe9a80a2b86079c215ca755 (diff) | |
download | linux-7a771ceac771d009f7203c40b256b0608d7ea2f8.tar.xz |
Merge tag 'dm-4.11-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:
- Fix dm-raid transient device failure processing and other smaller
tweaks.
- Add journal support to the DM raid target to close the 'write hole'
on raid 4/5/6.
- Fix dm-cache corruption, due to rounding bug, when cache exceeds 2TB.
- Add 'metadata2' feature to dm-cache to separate the dirty bitset out
from other cache metadata. This improves speed of shutting down a
large cache device (which implies writing out dirty bits).
- Fix a memory leak during dm-stats data structure destruction.
- Fix a DM multipath round-robin path selector performance regression
that was caused by less precise balancing across all paths.
- Lastly, introduce a DM core fix for a long-standing DM snapshot
deadlock that is rooted in the complexity of the device stack used in
conjunction with block core maintaining bios on current->bio_list to
manage recursion in generic_make_request(). A more comprehensive fix
to block core (and its hook in the cpu scheduler) would be wonderful
but this DM-specific fix is pragmatic considering how difficult it
has been to make progress on a generic fix.
* tag 'dm-4.11-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (22 commits)
dm: flush queued bios when process blocks to avoid deadlock
dm round robin: revert "use percpu 'repeat_count' and 'current_path'"
dm stats: fix a leaked s->histogram_boundaries array
dm space map metadata: constify dm_space_map structures
dm cache metadata: use cursor api in blocks_are_clean_separate_dirty()
dm persistent data: add cursor skip functions to the cursor APIs
dm cache metadata: use dm_bitset_new() to create the dirty bitset in format 2
dm bitset: add dm_bitset_new()
dm cache metadata: name the cache block that couldn't be loaded
dm cache metadata: add "metadata2" feature
dm cache metadata: use bitset cursor api to load discard bitset
dm bitset: introduce cursor api
dm btree: use GFP_NOFS in dm_btree_del()
dm space map common: memcpy the disk root to ensure it's arch aligned
dm block manager: add unlikely() annotations on dm_bufio error paths
dm cache: fix corruption seen when using cache > 2TB
dm raid: cleanup awkward branching in raid_message() option processing
dm raid: use mddev rather than rdev->mddev
dm raid: use read_disk_sb() throughout
dm raid: add raid4/5/6 journaling support
...
-rw-r--r-- | Documentation/device-mapper/cache.txt | 4 | ||||
-rw-r--r-- | Documentation/device-mapper/dm-raid.txt | 17 | ||||
-rw-r--r-- | drivers/md/dm-cache-metadata.c | 353 | ||||
-rw-r--r-- | drivers/md/dm-cache-metadata.h | 11 | ||||
-rw-r--r-- | drivers/md/dm-cache-target.c | 44 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 296 | ||||
-rw-r--r-- | drivers/md/dm-round-robin.c | 67 | ||||
-rw-r--r-- | drivers/md/dm-stats.c | 1 | ||||
-rw-r--r-- | drivers/md/dm.c | 55 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-array.c | 21 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-array.h | 1 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-bitset.c | 146 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-bitset.h | 39 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-block-manager.c | 8 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-btree.c | 18 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-btree.h | 1 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-space-map-common.c | 16 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-space-map-metadata.c | 4 |
18 files changed, 875 insertions, 227 deletions
diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt index 785eab87aa71..f228604ddbcd 100644 --- a/Documentation/device-mapper/cache.txt +++ b/Documentation/device-mapper/cache.txt @@ -207,6 +207,10 @@ Optional feature arguments are: block, then the cache block is invalidated. To enable passthrough mode the cache must be clean. + metadata2 : use version 2 of the metadata. This stores the dirty bits + in a separate btree, which improves speed of shutting + down the cache. + A policy called 'default' is always registered. This is an alias for the policy we currently think is giving best all round performance. diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index 5e3786fd9ea7..0d199353e477 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -161,6 +161,15 @@ The target is named "raid" and it accepts the following parameters: the RAID type (i.e. the allocation algorithm) as well, e.g. changing from raid5_ls to raid5_n. + [journal_dev <dev>] + This option adds a journal device to raid4/5/6 raid sets and + uses it to close the 'write hole' caused by the non-atomic updates + to the component devices which can cause data loss during recovery. + The journal device is used as writethrough thus causing writes to + be throttled versus non-journaled raid4/5/6 sets. + Takeover/reshape is not possible with a raid4/5/6 journal device; + it has to be deconfigured before requesting these. + <#raid_devs>: The number of devices composing the array. Each device consists of two entries. The first is the device containing the metadata (if any); the second is the one containing the @@ -245,6 +254,9 @@ recovery. Here is a fuller description of the individual fields: <data_offset> The current data offset to the start of the user data on each component device of a raid set (see the respective raid parameter to support out-of-place reshaping). + <journal_char> 'A' - active raid4/5/6 journal device. + 'D' - dead journal device. + '-' - no journal device. Message Interface @@ -314,3 +326,8 @@ Version History 1.9.0 Add support for RAID level takeover/reshape/region size and set size reduction. 1.9.1 Fix activation of existing RAID 4/10 mapped devices +1.9.2 Don't emit '- -' on the status table line in case the constructor + fails reading a superblock. Correctly emit 'maj:min1 maj:min2' and + 'D' on the status line. If '- -' is passed into the constructor, emit + '- -' on the table line and '-' as the status line health character. +1.10.0 Add support for raid4/5/6 journal device diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 624fe4319b24..e4c2c1a1e993 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -25,7 +25,7 @@ * defines a range of metadata versions that this module can handle. */ #define MIN_CACHE_VERSION 1 -#define MAX_CACHE_VERSION 1 +#define MAX_CACHE_VERSION 2 #define CACHE_METADATA_CACHE_SIZE 64 @@ -55,6 +55,7 @@ enum mapping_bits { /* * The data on the cache is different from that on the origin. + * This flag is only used by metadata format 1. */ M_DIRTY = 2 }; @@ -93,12 +94,18 @@ struct cache_disk_superblock { __le32 write_misses; __le32 policy_version[CACHE_POLICY_VERSION_SIZE]; + + /* + * Metadata format 2 fields. + */ + __le64 dirty_root; } __packed; struct dm_cache_metadata { atomic_t ref_count; struct list_head list; + unsigned version; struct block_device *bdev; struct dm_block_manager *bm; struct dm_space_map *metadata_sm; @@ -142,11 +149,18 @@ struct dm_cache_metadata { bool fail_io:1; /* + * Metadata format 2 fields. + */ + dm_block_t dirty_root; + struct dm_disk_bitset dirty_info; + + /* * These structures are used when loading metadata. They're too * big to put on the stack. */ struct dm_array_cursor mapping_cursor; struct dm_array_cursor hint_cursor; + struct dm_bitset_cursor dirty_cursor; }; /*------------------------------------------------------------------- @@ -170,6 +184,7 @@ static void sb_prepare_for_write(struct dm_block_validator *v, static int check_metadata_version(struct cache_disk_superblock *disk_super) { uint32_t metadata_version = le32_to_cpu(disk_super->version); + if (metadata_version < MIN_CACHE_VERSION || metadata_version > MAX_CACHE_VERSION) { DMERR("Cache metadata version %u found, but only versions between %u and %u supported.", metadata_version, MIN_CACHE_VERSION, MAX_CACHE_VERSION); @@ -310,6 +325,11 @@ static void __copy_sm_root(struct dm_cache_metadata *cmd, sizeof(cmd->metadata_space_map_root)); } +static bool separate_dirty_bits(struct dm_cache_metadata *cmd) +{ + return cmd->version >= 2; +} + static int __write_initial_superblock(struct dm_cache_metadata *cmd) { int r; @@ -341,7 +361,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) disk_super->flags = 0; memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC); - disk_super->version = cpu_to_le32(MAX_CACHE_VERSION); + disk_super->version = cpu_to_le32(cmd->version); memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version)); disk_super->policy_hint_size = 0; @@ -362,6 +382,9 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) disk_super->write_hits = cpu_to_le32(0); disk_super->write_misses = cpu_to_le32(0); + if (separate_dirty_bits(cmd)) + disk_super->dirty_root = cpu_to_le64(cmd->dirty_root); + return dm_tm_commit(cmd->tm, sblock); } @@ -382,6 +405,13 @@ static int __format_metadata(struct dm_cache_metadata *cmd) if (r < 0) goto bad; + if (separate_dirty_bits(cmd)) { + dm_disk_bitset_init(cmd->tm, &cmd->dirty_info); + r = dm_bitset_empty(&cmd->dirty_info, &cmd->dirty_root); + if (r < 0) + goto bad; + } + dm_disk_bitset_init(cmd->tm, &cmd->discard_info); r = dm_bitset_empty(&cmd->discard_info, &cmd->discard_root); if (r < 0) @@ -407,9 +437,10 @@ bad: static int __check_incompat_features(struct cache_disk_superblock *disk_super, struct dm_cache_metadata *cmd) { - uint32_t features; + uint32_t incompat_flags, features; - features = le32_to_cpu(disk_super->incompat_flags) & ~DM_CACHE_FEATURE_INCOMPAT_SUPP; + incompat_flags = le32_to_cpu(disk_super->incompat_flags); + features = incompat_flags & ~DM_CACHE_FEATURE_INCOMPAT_SUPP; if (features) { DMERR("could not access metadata due to unsupported optional features (%lx).", (unsigned long)features); @@ -470,6 +501,7 @@ static int __open_metadata(struct dm_cache_metadata *cmd) } __setup_mapping_info(cmd); + dm_disk_bitset_init(cmd->tm, &cmd->dirty_info); dm_disk_bitset_init(cmd->tm, &cmd->discard_info); sb_flags = le32_to_cpu(disk_super->flags); cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags); @@ -548,6 +580,7 @@ static unsigned long clear_clean_shutdown(unsigned long flags) static void read_superblock_fields(struct dm_cache_metadata *cmd, struct cache_disk_superblock *disk_super) { + cmd->version = le32_to_cpu(disk_super->version); cmd->flags = le32_to_cpu(disk_super->flags); cmd->root = le64_to_cpu(disk_super->mapping_root); cmd->hint_root = le64_to_cpu(disk_super->hint_root); @@ -567,6 +600,9 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd, cmd->stats.write_hits = le32_to_cpu(disk_super->write_hits); cmd->stats.write_misses = le32_to_cpu(disk_super->write_misses); + if (separate_dirty_bits(cmd)) + cmd->dirty_root = le64_to_cpu(disk_super->dirty_root); + cmd->changed = false; } @@ -625,6 +661,13 @@ static int __commit_transaction(struct dm_cache_metadata *cmd, */ BUILD_BUG_ON(sizeof(struct cache_disk_superblock) > 512); + if (separate_dirty_bits(cmd)) { + r = dm_bitset_flush(&cmd->dirty_info, cmd->dirty_root, + &cmd->dirty_root); + if (r) + return r; + } + r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root, &cmd->discard_root); if (r) @@ -649,6 +692,8 @@ static int __commit_transaction(struct dm_cache_metadata *cmd, update_flags(disk_super, mutator); disk_super->mapping_root = cpu_to_le64(cmd->root); + if (separate_dirty_bits(cmd)) + disk_super->dirty_root = cpu_to_le64(cmd->dirty_root); disk_super->hint_root = cpu_to_le64(cmd->hint_root); disk_super->discard_root = cpu_to_le64(cmd->discard_root); disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); @@ -698,7 +743,8 @@ static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned *flags) static struct dm_cache_metadata *metadata_open(struct block_device *bdev, sector_t data_block_size, bool may_format_device, - size_t policy_hint_size) + size_t policy_hint_size, + unsigned metadata_version) { int r; struct dm_cache_metadata *cmd; @@ -709,6 +755,7 @@ static struct dm_cache_metadata *metadata_open(struct block_device *bdev, return ERR_PTR(-ENOMEM); } + cmd->version = metadata_version; atomic_set(&cmd->ref_count, 1); init_rwsem(&cmd->root_lock); cmd->bdev = bdev; @@ -757,7 +804,8 @@ static struct dm_cache_metadata *lookup(struct block_device *bdev) static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev, sector_t data_block_size, bool may_format_device, - size_t policy_hint_size) + size_t policy_hint_size, + unsigned metadata_version) { struct dm_cache_metadata *cmd, *cmd2; @@ -768,7 +816,8 @@ static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev, if (cmd) return cmd; - cmd = metadata_open(bdev, data_block_size, may_format_device, policy_hint_size); + cmd = metadata_open(bdev, data_block_size, may_format_device, + policy_hint_size, metadata_version); if (!IS_ERR(cmd)) { mutex_lock(&table_lock); cmd2 = lookup(bdev); @@ -800,10 +849,11 @@ static bool same_params(struct dm_cache_metadata *cmd, sector_t data_block_size) struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, sector_t data_block_size, bool may_format_device, - size_t policy_hint_size) + size_t policy_hint_size, + unsigned metadata_version) { - struct dm_cache_metadata *cmd = lookup_or_open(bdev, data_block_size, - may_format_device, policy_hint_size); + struct dm_cache_metadata *cmd = lookup_or_open(bdev, data_block_size, may_format_device, + policy_hint_size, metadata_version); if (!IS_ERR(cmd) && !same_params(cmd, data_block_size)) { dm_cache_metadata_close(cmd); @@ -829,8 +879,8 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd) /* * Checks that the given cache block is either unmapped or clean. */ -static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b, - bool *result) +static int block_clean_combined_dirty(struct dm_cache_metadata *cmd, dm_cblock_t b, + bool *result) { int r; __le64 value; @@ -838,10 +888,8 @@ static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b, unsigned flags; r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(b), &value); - if (r) { - DMERR("block_unmapped_or_clean failed"); + if (r) return r; - } unpack_value(value, &ob, &flags); *result = !((flags & M_VALID) && (flags & M_DIRTY)); @@ -849,17 +897,19 @@ static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b, return 0; } -static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd, - dm_cblock_t begin, dm_cblock_t end, - bool *result) +static int blocks_are_clean_combined_dirty(struct dm_cache_metadata *cmd, + dm_cblock_t begin, dm_cblock_t end, + bool *result) { int r; *result = true; while (begin != end) { - r = block_unmapped_or_clean(cmd, begin, result); - if (r) + r = block_clean_combined_dirty(cmd, begin, result); + if (r) { + DMERR("block_clean_combined_dirty failed"); return r; + } if (!*result) { DMERR("cache block %llu is dirty", @@ -873,6 +923,67 @@ static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd, return 0; } +static int blocks_are_clean_separate_dirty(struct dm_cache_metadata *cmd, + dm_cblock_t begin, dm_cblock_t end, + bool *result) +{ + int r; + bool dirty_flag; + *result = true; + + r = dm_bitset_cursor_begin(&cmd->dirty_info, cmd->dirty_root, + from_cblock(begin), &cmd->dirty_cursor); + if (r) { + DMERR("%s: dm_bitset_cursor_begin for dirty failed", __func__); + return r; + } + + r = dm_bitset_cursor_skip(&cmd->dirty_cursor, from_cblock(begin)); + if (r) { + DMERR("%s: dm_bitset_cursor_skip for dirty failed", __func__); + dm_bitset_cursor_end(&cmd->dirty_cursor); + return r; + } + + while (begin != end) { + /* + * We assume that unmapped blocks have their dirty bit + * cleared. + */ + dirty_flag = dm_bitset_cursor_get_value(&cmd->dirty_cursor); + if (dirty_flag) { + DMERR("%s: cache block %llu is dirty", __func__, + (unsigned long long) from_cblock(begin)); + dm_bitset_cursor_end(&cmd->dirty_cursor); + *result = false; + return 0; + } + + r = dm_bitset_cursor_next(&cmd->dirty_cursor); + if (r) { + DMERR("%s: dm_bitset_cursor_next for dirty failed", __func__); + dm_bitset_cursor_end(&cmd->dirty_cursor); + return r; + } + + begin = to_cblock(from_cblock(begin) + 1); + } + + dm_bitset_cursor_end(&cmd->dirty_cursor); + + return 0; +} + +static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd, + dm_cblock_t begin, dm_cblock_t end, + bool *result) +{ + if (separate_dirty_bits(cmd)) + return blocks_are_clean_separate_dirty(cmd, begin, end, result); + else + return blocks_are_clean_combined_dirty(cmd, begin, end, result); +} + static bool cmd_write_lock(struct dm_cache_metadata *cmd) { down_write(&cmd->root_lock); @@ -950,8 +1061,18 @@ int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size) r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks), from_cblock(new_cache_size), &null_mapping, &cmd->root); - if (!r) - cmd->cache_blocks = new_cache_size; + if (r) + goto out; + + if (separate_dirty_bits(cmd)) { + r = dm_bitset_resize(&cmd->dirty_info, cmd->dirty_root, + from_cblock(cmd->cache_blocks), from_cblock(new_cache_size), + false, &cmd->dirty_root); + if (r) + goto out; + } + + cmd->cache_blocks = new_cache_size; cmd->changed = true; out: @@ -995,14 +1116,6 @@ static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b) from_dblock(b), &cmd->discard_root); } -static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b, - bool *is_discarded) -{ - return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root, - from_dblock(b), &cmd->discard_root, - is_discarded); -} - static int __discard(struct dm_cache_metadata *cmd, dm_dblock_t dblock, bool discard) { @@ -1032,22 +1145,38 @@ static int __load_discards(struct dm_cache_metadata *cmd, load_discard_fn fn, void *context) { int r = 0; - dm_block_t b; - bool discard; + uint32_t b; + struct dm_bitset_cursor c; - for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) { - dm_dblock_t dblock = to_dblock(b); + if (from_dblock(cmd->discard_nr_blocks) == 0) + /* nothing to do */ + return 0; - if (cmd->clean_when_opened) { - r = __is_discarded(cmd, dblock, &discard); - if (r) - return r; - } else - discard = false; + if (cmd->clean_when_opened) { + r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root, &cmd->discard_root); + if (r) + return r; - r = fn(context, cmd->discard_block_size, dblock, discard); + r = dm_bitset_cursor_begin(&cmd->discard_info, cmd->discard_root, + from_dblock(cmd->discard_nr_blocks), &c); if (r) - break; + return r; + + for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) { + r = fn(context, cmd->discard_block_size, to_dblock(b), + dm_bitset_cursor_get_value(&c)); + if (r) + break; + } + + dm_bitset_cursor_end(&c); + + } else { + for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) { + r = fn(context, cmd->discard_block_size, to_dblock(b), false); + if (r) + return r; + } } return r; @@ -1177,11 +1306,11 @@ static bool hints_array_available(struct dm_cache_metadata *cmd, hints_array_initialized(cmd); } -static int __load_mapping(struct dm_cache_metadata *cmd, - uint64_t cb, bool hints_valid, - struct dm_array_cursor *mapping_cursor, - struct dm_array_cursor *hint_cursor, - load_mapping_fn fn, void *context) +static int __load_mapping_v1(struct dm_cache_metadata *cmd, + uint64_t cb, bool hints_valid, + struct dm_array_cursor *mapping_cursor, + struct dm_array_cursor *hint_cursor, + load_mapping_fn fn, void *context) { int r = 0; @@ -1206,8 +1335,51 @@ static int __load_mapping(struct dm_cache_metadata *cmd, r = fn(context, oblock, to_cblock(cb), flags & M_DIRTY, le32_to_cpu(hint), hints_valid); - if (r) - DMERR("policy couldn't load cblock"); + if (r) { + DMERR("policy couldn't load cache block %llu", + (unsigned long long) from_cblock(to_cblock(cb))); + } + } + + return r; +} + +static int __load_mapping_v2(struct dm_cache_metadata *cmd, + uint64_t cb, bool hints_valid, + struct dm_array_cursor *mapping_cursor, + struct dm_array_cursor *hint_cursor, + struct dm_bitset_cursor *dirty_cursor, + load_mapping_fn fn, void *context) +{ + int r = 0; + + __le64 mapping; + __le32 hint = 0; + + __le64 *mapping_value_le; + __le32 *hint_value_le; + + dm_oblock_t oblock; + unsigned flags; + bool dirty; + + dm_array_cursor_get_value(mapping_cursor, (void **) &mapping_value_le); + memcpy(&mapping, mapping_value_le, sizeof(mapping)); + unpack_value(mapping, &oblock, &flags); + + if (flags & M_VALID) { + if (hints_valid) { + dm_array_cursor_get_value(hint_cursor, (void **) &hint_value_le); + memcpy(&hint, hint_value_le, sizeof(hint)); + } + + dirty = dm_bitset_cursor_get_value(dirty_cursor); + r = fn(context, oblock, to_cblock(cb), dirty, + le32_to_cpu(hint), hints_valid); + if (r) { + DMERR("policy couldn't load cache block %llu", + (unsigned long long) from_cblock(to_cblock(cb))); + } } return r; @@ -1238,10 +1410,28 @@ static int __load_mappings(struct dm_cache_metadata *cmd, } } + if (separate_dirty_bits(cmd)) { + r = dm_bitset_cursor_begin(&cmd->dirty_info, cmd->dirty_root, + from_cblock(cmd->cache_blocks), + &cmd->dirty_cursor); + if (r) { + dm_array_cursor_end(&cmd->hint_cursor); + dm_array_cursor_end(&cmd->mapping_cursor); + return r; + } + } + for (cb = 0; ; cb++) { - r = __load_mapping(cmd, cb, hints_valid, - &cmd->mapping_cursor, &cmd->hint_cursor, - fn, context); + if (separate_dirty_bits(cmd)) + r = __load_mapping_v2(cmd, cb, hints_valid, + &cmd->mapping_cursor, + &cmd->hint_cursor, + &cmd->dirty_cursor, + fn, context); + else + r = __load_mapping_v1(cmd, cb, hints_valid, + &cmd->mapping_cursor, &cmd->hint_cursor, + fn, context); if (r) goto out; @@ -1264,12 +1454,23 @@ static int __load_mappings(struct dm_cache_metadata *cmd, goto out; } } + + if (separate_dirty_bits(cmd)) { + r = dm_bitset_cursor_next(&cmd->dirty_cursor); + if (r) { + DMERR("dm_bitset_cursor_next for dirty failed"); + goto out; + } + } } out: dm_array_cursor_end(&cmd->mapping_cursor); if (hints_valid) dm_array_cursor_end(&cmd->hint_cursor); + if (separate_dirty_bits(cmd)) + dm_bitset_cursor_end(&cmd->dirty_cursor); + return r; } @@ -1352,13 +1553,55 @@ static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty } -int dm_cache_set_dirty(struct dm_cache_metadata *cmd, - dm_cblock_t cblock, bool dirty) +static int __set_dirty_bits_v1(struct dm_cache_metadata *cmd, unsigned nr_bits, unsigned long *bits) +{ + int r; + unsigned i; + for (i = 0; i < nr_bits; i++) { + r = __dirty(cmd, to_cblock(i), test_bit(i, bits)); + if (r) + return r; + } + + return 0; +} + +static int is_dirty_callback(uint32_t index, bool *value, void *context) +{ + unsigned long *bits = context; + *value = test_bit(index, bits); + return 0; +} + +static int __set_dirty_bits_v2(struct dm_cache_metadata *cmd, unsigned nr_bits, unsigned long *bits) +{ + int r = 0; + + /* nr_bits is really just a sanity check */ + if (nr_bits != from_cblock(cmd->cache_blocks)) { + DMERR("dirty bitset is wrong size"); + return -EINVAL; + } + + r = dm_bitset_del(&cmd->dirty_info, cmd->dirty_root); + if (r) + return r; + + cmd->changed = true; + return dm_bitset_new(&cmd->dirty_info, &cmd->dirty_root, nr_bits, is_dirty_callback, bits); +} + +int dm_cache_set_dirty_bits(struct dm_cache_metadata *cmd, + unsigned nr_bits, + unsigned long *bits) { int r; WRITE_LOCK(cmd); - r = __dirty(cmd, cblock, dirty); + if (separate_dirty_bits(cmd)) + r = __set_dirty_bits_v2(cmd, nr_bits, bits); + else + r = __set_dirty_bits_v1(cmd, nr_bits, bits); WRITE_UNLOCK(cmd); return r; diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 8528744195e5..4f07c08cf107 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -45,18 +45,20 @@ * As these various flags are defined they should be added to the * following masks. */ + #define DM_CACHE_FEATURE_COMPAT_SUPP 0UL #define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL #define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL /* - * Reopens or creates a new, empty metadata volume. - * Returns an ERR_PTR on failure. + * Reopens or creates a new, empty metadata volume. Returns an ERR_PTR on + * failure. If reopening then features must match. */ struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, sector_t data_block_size, bool may_format_device, - size_t policy_hint_size); + size_t policy_hint_size, + unsigned metadata_version); void dm_cache_metadata_close(struct dm_cache_metadata *cmd); @@ -91,7 +93,8 @@ int dm_cache_load_mappings(struct dm_cache_metadata *cmd, load_mapping_fn fn, void *context); -int dm_cache_set_dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty); +int dm_cache_set_dirty_bits(struct dm_cache_metadata *cmd, + unsigned nr_bits, unsigned long *bits); struct dm_cache_statistics { uint32_t read_hits; diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 894bc14469c8..9c689b34e6e7 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -179,6 +179,7 @@ enum cache_io_mode { struct cache_features { enum cache_metadata_mode mode; enum cache_io_mode io_mode; + unsigned metadata_version; }; struct cache_stats { @@ -248,7 +249,7 @@ struct cache { /* * Fields for converting from sectors to blocks. */ - uint32_t sectors_per_block; + sector_t sectors_per_block; int sectors_per_block_shift; spinlock_t lock; @@ -2534,13 +2535,14 @@ static void init_features(struct cache_features *cf) { cf->mode = CM_WRITE; cf->io_mode = CM_IO_WRITEBACK; + cf->metadata_version = 1; } static int parse_features(struct cache_args *ca, struct dm_arg_set *as, char **error) { static struct dm_arg _args[] = { - {0, 1, "Invalid number of cache feature arguments"}, + {0, 2, "Invalid number of cache feature arguments"}, }; int r; @@ -2566,6 +2568,9 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as, else if (!strcasecmp(arg, "passthrough")) cf->io_mode = CM_IO_PASSTHROUGH; + else if (!strcasecmp(arg, "metadata2")) + cf->metadata_version = 2; + else { *error = "Unrecognised cache feature requested"; return -EINVAL; @@ -2820,7 +2825,8 @@ static int cache_create(struct cache_args *ca, struct cache **result) cmd = dm_cache_metadata_open(cache->metadata_dev->bdev, ca->block_size, may_format, - dm_cache_policy_get_hint_size(cache->policy)); + dm_cache_policy_get_hint_size(cache->policy), + ca->features.metadata_version); if (IS_ERR(cmd)) { *error = "Error creating metadata object"; r = PTR_ERR(cmd); @@ -3165,21 +3171,16 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) static int write_dirty_bitset(struct cache *cache) { - unsigned i, r; + int r; if (get_cache_mode(cache) >= CM_READ_ONLY) return -EINVAL; - for (i = 0; i < from_cblock(cache->cache_size); i++) { - r = dm_cache_set_dirty(cache->cmd, to_cblock(i), - is_dirty(cache, to_cblock(i))); - if (r) { - metadata_operation_failed(cache, "dm_cache_set_dirty", r); - return r; - } - } + r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset); + if (r) + metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r); - return 0; + return r; } static int write_discard_bitset(struct cache *cache) @@ -3540,11 +3541,11 @@ static void cache_status(struct dm_target *ti, status_type_t type, residency = policy_residency(cache->policy); - DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ", + DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ", (unsigned)DM_CACHE_METADATA_BLOCK_SIZE, (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), (unsigned long long)nr_blocks_metadata, - cache->sectors_per_block, + (unsigned long long)cache->sectors_per_block, (unsigned long long) from_cblock(residency), (unsigned long long) from_cblock(cache->cache_size), (unsigned) atomic_read(&cache->stats.read_hit), @@ -3555,14 +3556,19 @@ static void cache_status(struct dm_target *ti, status_type_t type, (unsigned) atomic_read(&cache->stats.promotion), (unsigned long) atomic_read(&cache->nr_dirty)); + if (cache->features.metadata_version == 2) + DMEMIT("2 metadata2 "); + else + DMEMIT("1 "); + if (writethrough_mode(&cache->features)) - DMEMIT("1 writethrough "); + DMEMIT("writethrough "); else if (passthrough_mode(&cache->features)) - DMEMIT("1 passthrough "); + DMEMIT("passthrough "); else if (writeback_mode(&cache->features)) - DMEMIT("1 writeback "); + DMEMIT("writeback "); else { DMERR("%s: internal error: unknown io mode: %d", @@ -3810,7 +3816,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 9, 0}, + .version = {1, 10, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index b8f978e551d7..5c9e95d66f3b 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -24,6 +24,11 @@ */ #define MIN_FREE_RESHAPE_SPACE to_sector(4*4096) +/* + * Minimum journal space 4 MiB in sectors. + */ +#define MIN_RAID456_JOURNAL_SPACE (4*2048) + static bool devices_handle_discard_safely = false; /* @@ -73,6 +78,9 @@ struct raid_dev { #define __CTR_FLAG_DATA_OFFSET 13 /* 2 */ /* Only with reshapable raid4/5/6/10! */ #define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */ +/* New for v1.10.0 */ +#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6! */ + /* * Flags for rs->ctr_flags field. */ @@ -91,6 +99,7 @@ struct raid_dev { #define CTR_FLAG_DELTA_DISKS (1 << __CTR_FLAG_DELTA_DISKS) #define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET) #define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS) +#define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV) /* * Definitions of various constructor flags to @@ -163,7 +172,8 @@ struct raid_dev { CTR_FLAG_STRIPE_CACHE | \ CTR_FLAG_REGION_SIZE | \ CTR_FLAG_DELTA_DISKS | \ - CTR_FLAG_DATA_OFFSET) + CTR_FLAG_DATA_OFFSET | \ + CTR_FLAG_JOURNAL_DEV) #define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \ CTR_FLAG_REBUILD | \ @@ -173,7 +183,8 @@ struct raid_dev { CTR_FLAG_STRIPE_CACHE | \ CTR_FLAG_REGION_SIZE | \ CTR_FLAG_DELTA_DISKS | \ - CTR_FLAG_DATA_OFFSET) + CTR_FLAG_DATA_OFFSET | \ + CTR_FLAG_JOURNAL_DEV) /* ...valid options definitions per raid level */ /* @@ -222,6 +233,12 @@ struct raid_set { struct raid_type *raid_type; struct dm_target_callbacks callbacks; + /* Optional raid4/5/6 journal device */ + struct journal_dev { + struct dm_dev *dev; + struct md_rdev rdev; + } journal_dev; + struct raid_dev dev[0]; }; @@ -306,6 +323,7 @@ static struct arg_name_flag { { CTR_FLAG_DATA_OFFSET, "data_offset"}, { CTR_FLAG_DELTA_DISKS, "delta_disks"}, { CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"}, + { CTR_FLAG_JOURNAL_DEV, "journal_dev" }, }; /* Return argument name string for given @flag */ @@ -370,7 +388,7 @@ static bool rs_is_reshapable(struct raid_set *rs) /* Return true, if raid set in @rs is recovering */ static bool rs_is_recovering(struct raid_set *rs) { - return rs->md.recovery_cp < rs->dev[0].rdev.sectors; + return rs->md.recovery_cp < rs->md.dev_sectors; } /* Return true, if raid set in @rs is reshaping */ @@ -627,7 +645,8 @@ static void rs_set_capacity(struct raid_set *rs) * is unintended in case of out-of-place reshaping */ rdev_for_each(rdev, mddev) - rdev->sectors = mddev->dev_sectors; + if (!test_bit(Journal, &rdev->flags)) + rdev->sectors = mddev->dev_sectors; set_capacity(gendisk, mddev->array_sectors); revalidate_disk(gendisk); @@ -713,6 +732,11 @@ static void raid_set_free(struct raid_set *rs) { int i; + if (rs->journal_dev.dev) { + md_rdev_clear(&rs->journal_dev.rdev); + dm_put_device(rs->ti, rs->journal_dev.dev); + } + for (i = 0; i < rs->raid_disks; i++) { if (rs->dev[i].meta_dev) dm_put_device(rs->ti, rs->dev[i].meta_dev); @@ -760,10 +784,11 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as) rs->dev[i].data_dev = NULL; /* - * There are no offsets, since there is a separate device - * for data and metadata. + * There are no offsets initially. + * Out of place reshape will set them accordingly. */ rs->dev[i].rdev.data_offset = 0; + rs->dev[i].rdev.new_data_offset = 0; rs->dev[i].rdev.mddev = &rs->md; arg = dm_shift_arg(as); @@ -821,6 +846,9 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as) rebuild++; } + if (rs->journal_dev.dev) + list_add_tail(&rs->journal_dev.rdev.same_set, &rs->md.disks); + if (metadata_available) { rs->md.external = 0; rs->md.persistent = 1; @@ -1026,6 +1054,8 @@ too_many: * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) * [stripe_cache <sectors>] Stripe cache size for higher RAIDs * [region_size <sectors>] Defines granularity of bitmap + * [journal_dev <dev>] raid4/5/6 journaling deviice + * (i.e. write hole closing log) * * RAID10-only options: * [raid10_copies <# copies>] Number of copies. (Default: 2) @@ -1133,7 +1163,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, /* * Parameters that take a string value are checked here. */ - + /* "raid10_format {near|offset|far} */ if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT))) { if (test_and_set_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) { rs->ti->error = "Only one 'raid10_format' argument pair allowed"; @@ -1151,6 +1181,41 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, continue; } + /* "journal_dev dev" */ + if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) { + int r; + struct md_rdev *jdev; + + if (test_and_set_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) { + rs->ti->error = "Only one raid4/5/6 set journaling device allowed"; + return -EINVAL; + } + if (!rt_is_raid456(rt)) { + rs->ti->error = "'journal_dev' is an invalid parameter for this RAID type"; + return -EINVAL; + } + r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table), + &rs->journal_dev.dev); + if (r) { + rs->ti->error = "raid4/5/6 journal device lookup failure"; + return r; + } + jdev = &rs->journal_dev.rdev; + md_rdev_init(jdev); + jdev->mddev = &rs->md; + jdev->bdev = rs->journal_dev.dev->bdev; + jdev->sectors = to_sector(i_size_read(jdev->bdev->bd_inode)); + if (jdev->sectors < MIN_RAID456_JOURNAL_SPACE) { + rs->ti->error = "No space for raid4/5/6 journal"; + return -ENOSPC; + } + set_bit(Journal, &jdev->flags); + continue; + } + + /* + * Parameters with number values from here on. + */ if (kstrtoint(arg, 10, &value) < 0) { rs->ti->error = "Bad numerical argument given in raid params"; return -EINVAL; @@ -1425,6 +1490,25 @@ static unsigned int rs_data_stripes(struct raid_set *rs) return rs->raid_disks - rs->raid_type->parity_devs; } +/* + * Retrieve rdev->sectors from any valid raid device of @rs + * to allow userpace to pass in arbitray "- -" device tupples. + */ +static sector_t __rdev_sectors(struct raid_set *rs) +{ + int i; + + for (i = 0; i < rs->md.raid_disks; i++) { + struct md_rdev *rdev = &rs->dev[i].rdev; + + if (!test_bit(Journal, &rdev->flags) && + rdev->bdev && rdev->sectors) + return rdev->sectors; + } + + BUG(); /* Constructor ensures we got some. */ +} + /* Calculate the sectors per device and per array used for @rs */ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) { @@ -1468,7 +1552,8 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) array_sectors = (data_stripes + delta_disks) * dev_sectors; rdev_for_each(rdev, mddev) - rdev->sectors = dev_sectors; + if (!test_bit(Journal, &rdev->flags)) + rdev->sectors = dev_sectors; mddev->array_sectors = array_sectors; mddev->dev_sectors = dev_sectors; @@ -1510,9 +1595,9 @@ static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) else if (dev_sectors == MaxSector) /* Prevent recovery */ __rs_setup_recovery(rs, MaxSector); - else if (rs->dev[0].rdev.sectors < dev_sectors) + else if (__rdev_sectors(rs) < dev_sectors) /* Grown raid set */ - __rs_setup_recovery(rs, rs->dev[0].rdev.sectors); + __rs_setup_recovery(rs, __rdev_sectors(rs)); else __rs_setup_recovery(rs, MaxSector); } @@ -1851,18 +1936,21 @@ static int rs_check_reshape(struct raid_set *rs) return -EPERM; } -static int read_disk_sb(struct md_rdev *rdev, int size) +static int read_disk_sb(struct md_rdev *rdev, int size, bool force_reload) { BUG_ON(!rdev->sb_page); - if (rdev->sb_loaded) + if (rdev->sb_loaded && !force_reload) return 0; + rdev->sb_loaded = 0; + if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) { DMERR("Failed to read superblock of device at position %d", rdev->raid_disk); md_error(rdev->mddev, rdev); - return -EINVAL; + set_bit(Faulty, &rdev->flags); + return -EIO; } rdev->sb_loaded = 1; @@ -1990,7 +2078,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) return -EINVAL; } - r = read_disk_sb(rdev, rdev->sb_size); + r = read_disk_sb(rdev, rdev->sb_size, false); if (r) return r; @@ -2146,6 +2234,9 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) */ d = 0; rdev_for_each(r, mddev) { + if (test_bit(Journal, &rdev->flags)) + continue; + if (test_bit(FirstUse, &r->flags)) new_devs++; @@ -2201,7 +2292,8 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) */ sb_retrieve_failed_devices(sb, failed_devices); rdev_for_each(r, mddev) { - if (!r->sb_page) + if (test_bit(Journal, &rdev->flags) || + !r->sb_page) continue; sb2 = page_address(r->sb_page); sb2->failed_devices = 0; @@ -2253,7 +2345,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev) struct mddev *mddev = &rs->md; struct dm_raid_superblock *sb; - if (rs_is_raid0(rs) || !rdev->sb_page) + if (rs_is_raid0(rs) || !rdev->sb_page || rdev->raid_disk < 0) return 0; sb = page_address(rdev->sb_page); @@ -2278,7 +2370,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev) /* Enable bitmap creation for RAID levels != 0 */ mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096); - rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; + mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; if (!test_and_clear_bit(FirstUse, &rdev->flags)) { /* Retrieve device size stored in superblock to be prepared for shrink */ @@ -2316,21 +2408,22 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev) static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) { int r; - struct raid_dev *dev; - struct md_rdev *rdev, *tmp, *freshest; + struct md_rdev *rdev, *freshest; struct mddev *mddev = &rs->md; freshest = NULL; - rdev_for_each_safe(rdev, tmp, mddev) { + rdev_for_each(rdev, mddev) { + if (test_bit(Journal, &rdev->flags)) + continue; + /* * Skipping super_load due to CTR_FLAG_SYNC will cause * the array to undergo initialization again as * though it were new. This is the intended effect * of the "sync" directive. * - * When reshaping capability is added, we must ensure - * that the "sync" directive is disallowed during the - * reshape. + * With reshaping capability added, we must ensure that + * that the "sync" directive is disallowed during the reshape. */ if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags)) continue; @@ -2347,6 +2440,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) case 0: break; default: + /* This is a failure to read the superblock from the metadata device. */ /* * We have to keep any raid0 data/metadata device pairs or * the MD raid0 personality will fail to start the array. @@ -2354,33 +2448,16 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) if (rs_is_raid0(rs)) continue; - dev = container_of(rdev, struct raid_dev, rdev); - if (dev->meta_dev) - dm_put_device(ti, dev->meta_dev); - - dev->meta_dev = NULL; - rdev->meta_bdev = NULL; - - if (rdev->sb_page) - put_page(rdev->sb_page); - - rdev->sb_page = NULL; - - rdev->sb_loaded = 0; - /* - * We might be able to salvage the data device - * even though the meta device has failed. For - * now, we behave as though '- -' had been - * set for this device in the table. + * We keep the dm_devs to be able to emit the device tuple + * properly on the table line in raid_status() (rather than + * mistakenly acting as if '- -' got passed into the constructor). + * + * The rdev has to stay on the same_set list to allow for + * the attempt to restore faulty devices on second resume. */ - if (dev->data_dev) - dm_put_device(ti, dev->data_dev); - - dev->data_dev = NULL; - rdev->bdev = NULL; - - list_del(&rdev->same_set); + rdev->raid_disk = rdev->saved_raid_disk = -1; + break; } } @@ -2401,7 +2478,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) return -EINVAL; rdev_for_each(rdev, mddev) - if ((rdev != freshest) && super_validate(rs, rdev)) + if (!test_bit(Journal, &rdev->flags) && + rdev != freshest && + super_validate(rs, rdev)) return -EINVAL; return 0; } @@ -2488,10 +2567,12 @@ static int rs_adjust_data_offsets(struct raid_set *rs) return -ENOSPC; } out: - /* Adjust data offsets on all rdevs */ + /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */ rdev_for_each(rdev, &rs->md) { - rdev->data_offset = data_offset; - rdev->new_data_offset = new_data_offset; + if (!test_bit(Journal, &rdev->flags)) { + rdev->data_offset = data_offset; + rdev->new_data_offset = new_data_offset; + } } return 0; @@ -2504,8 +2585,10 @@ static void __reorder_raid_disk_indexes(struct raid_set *rs) struct md_rdev *rdev; rdev_for_each(rdev, &rs->md) { - rdev->raid_disk = i++; - rdev->saved_raid_disk = rdev->new_raid_disk = -1; + if (!test_bit(Journal, &rdev->flags)) { + rdev->raid_disk = i++; + rdev->saved_raid_disk = rdev->new_raid_disk = -1; + } } } @@ -2845,7 +2928,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) goto bad; - calculated_dev_sectors = rs->dev[0].rdev.sectors; + calculated_dev_sectors = rs->md.dev_sectors; /* * Backup any new raid set level, layout, ... @@ -2858,7 +2941,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) goto bad; - resize = calculated_dev_sectors != rs->dev[0].rdev.sectors; + resize = calculated_dev_sectors != __rdev_sectors(rs); INIT_WORK(&rs->md.event_work, do_table_event); ti->private = rs; @@ -2902,6 +2985,13 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } + /* We can't takeover a journaled raid4/5/6 */ + if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) { + ti->error = "Can't takeover a journaled raid4/5/6 set"; + r = -EPERM; + goto bad; + } + /* * If a takeover is needed, userspace sets any additional * devices to rebuild and we can check for a valid request here. @@ -2924,6 +3014,18 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) rs_set_new(rs); } else if (rs_reshape_requested(rs)) { /* + * No need to check for 'ongoing' takeover here, because takeover + * is an instant operation as oposed to an ongoing reshape. + */ + + /* We can't reshape a journaled raid4/5/6 */ + if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) { + ti->error = "Can't reshape a journaled raid4/5/6 set"; + r = -EPERM; + goto bad; + } + + /* * We can only prepare for a reshape here, because the * raid set needs to run to provide the repective reshape * check functions via its MD personality instance. @@ -3071,18 +3173,23 @@ static const char *decipher_sync_action(struct mddev *mddev) } /* - * Return status string @rdev + * Return status string for @rdev * * Status characters: * - * 'D' = Dead/Failed device + * 'D' = Dead/Failed raid set component or raid4/5/6 journal device * 'a' = Alive but not in-sync - * 'A' = Alive and in-sync + * 'A' = Alive and in-sync raid set component or alive raid4/5/6 journal device + * '-' = Non-existing device (i.e. uspace passed '- -' into the ctr) */ static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync) { - if (test_bit(Faulty, &rdev->flags)) + if (!rdev->bdev) + return "-"; + else if (test_bit(Faulty, &rdev->flags)) return "D"; + else if (test_bit(Journal, &rdev->flags)) + return "A"; else if (!array_in_sync || !test_bit(In_sync, &rdev->flags)) return "a"; else @@ -3151,7 +3258,8 @@ static sector_t rs_get_progress(struct raid_set *rs, * being initialized. */ rdev_for_each(rdev, mddev) - if (!test_bit(In_sync, &rdev->flags)) + if (!test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags)) *array_in_sync = true; #if 0 r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */ @@ -3183,7 +3291,6 @@ static void raid_status(struct dm_target *ti, status_type_t type, sector_t progress, resync_max_sectors, resync_mismatches; const char *sync_action; struct raid_type *rt; - struct md_rdev *rdev; switch (type) { case STATUSTYPE_INFO: @@ -3204,9 +3311,9 @@ static void raid_status(struct dm_target *ti, status_type_t type, atomic64_read(&mddev->resync_mismatches) : 0; sync_action = decipher_sync_action(&rs->md); - /* HM FIXME: do we want another state char for raid0? It shows 'D' or 'A' now */ - rdev_for_each(rdev, mddev) - DMEMIT(__raid_dev_status(rdev, array_in_sync)); + /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */ + for (i = 0; i < rs->raid_disks; i++) + DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync)); /* * In-sync/Reshape ratio: @@ -3252,6 +3359,12 @@ static void raid_status(struct dm_target *ti, status_type_t type, * so retrieving it from the first raid disk is sufficient. */ DMEMIT(" %llu", (unsigned long long) rs->dev[0].rdev.data_offset); + + /* + * v1.10.0+: + */ + DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? + __raid_dev_status(&rs->journal_dev.rdev, 0) : "-"); break; case STATUSTYPE_TABLE: @@ -3265,7 +3378,8 @@ static void raid_status(struct dm_target *ti, status_type_t type, raid_param_cnt += rebuild_disks * 2 + write_mostly_params + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) + - hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2; + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 + + (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0); /* Emit table line */ DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors); if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) @@ -3312,6 +3426,9 @@ static void raid_status(struct dm_target *ti, status_type_t type, if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags)) DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE), mddev->sync_speed_min); + if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) + DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV), + __get_dev_name(rs->journal_dev.dev)); DMEMIT(" %d", rs->raid_disks); for (i = 0; i < rs->raid_disks; i++) DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev), @@ -3347,10 +3464,11 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv) else { if (!strcasecmp(argv[0], "check")) set_bit(MD_RECOVERY_CHECK, &mddev->recovery); - else if (!!strcasecmp(argv[0], "repair")) + else if (!strcasecmp(argv[0], "repair")) { + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + } else return -EINVAL; - set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); } if (mddev->ro == 2) { /* A write to sync_action is enough to justify @@ -3427,11 +3545,14 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs) memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices)); - for (i = 0; i < rs->md.raid_disks; i++) { + for (i = 0; i < mddev->raid_disks; i++) { r = &rs->dev[i].rdev; - if (test_bit(Faulty, &r->flags) && r->sb_page && - sync_page_io(r, 0, r->sb_size, r->sb_page, - REQ_OP_READ, 0, true)) { + /* HM FIXME: enhance journal device recovery processing */ + if (test_bit(Journal, &r->flags)) + continue; + + if (test_bit(Faulty, &r->flags) && + r->meta_bdev && !read_disk_sb(r, r->sb_size, true)) { DMINFO("Faulty %s device #%d has readable super block." " Attempting to revive it.", rs->raid_type->name, i); @@ -3445,22 +3566,26 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs) * '>= 0' - meaning we must call this function * ourselves. */ - if ((r->raid_disk >= 0) && - (mddev->pers->hot_remove_disk(mddev, r) != 0)) - /* Failed to revive this device, try next */ - continue; - - r->raid_disk = i; - r->saved_raid_disk = i; flags = r->flags; + clear_bit(In_sync, &r->flags); /* Mandatory for hot remove. */ + if (r->raid_disk >= 0) { + if (mddev->pers->hot_remove_disk(mddev, r)) { + /* Failed to revive this device, try next */ + r->flags = flags; + continue; + } + } else + r->raid_disk = r->saved_raid_disk = i; + clear_bit(Faulty, &r->flags); clear_bit(WriteErrorSeen, &r->flags); - clear_bit(In_sync, &r->flags); + if (mddev->pers->hot_add_disk(mddev, r)) { - r->raid_disk = -1; - r->saved_raid_disk = -1; + /* Failed to revive this device, try next */ + r->raid_disk = r->saved_raid_disk = -1; r->flags = flags; } else { + clear_bit(In_sync, &r->flags); r->recovery_offset = 0; set_bit(i, (void *) cleared_failed_devices); cleared = true; @@ -3473,6 +3598,9 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs) uint64_t failed_devices[DISKS_ARRAY_ELEMS]; rdev_for_each(r, &rs->md) { + if (test_bit(Journal, &r->flags)) + continue; + sb = page_address(r->sb_page); sb_retrieve_failed_devices(sb, failed_devices); @@ -3651,7 +3779,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 9, 1}, + .version = {1, 10, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index 6c25213ab38c..bdbb7e6e8212 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c @@ -17,8 +17,8 @@ #include <linux/module.h> #define DM_MSG_PREFIX "multipath round-robin" -#define RR_MIN_IO 1000 -#define RR_VERSION "1.1.0" +#define RR_MIN_IO 1 +#define RR_VERSION "1.2.0" /*----------------------------------------------------------------- * Path-handling code, paths are held in lists @@ -47,44 +47,19 @@ struct selector { struct list_head valid_paths; struct list_head invalid_paths; spinlock_t lock; - struct dm_path * __percpu *current_path; - struct percpu_counter repeat_count; }; -static void set_percpu_current_path(struct selector *s, struct dm_path *path) -{ - int cpu; - - for_each_possible_cpu(cpu) - *per_cpu_ptr(s->current_path, cpu) = path; -} - static struct selector *alloc_selector(void) { struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return NULL; - - INIT_LIST_HEAD(&s->valid_paths); - INIT_LIST_HEAD(&s->invalid_paths); - spin_lock_init(&s->lock); - - s->current_path = alloc_percpu(struct dm_path *); - if (!s->current_path) - goto out_current_path; - set_percpu_current_path(s, NULL); - - if (percpu_counter_init(&s->repeat_count, 0, GFP_KERNEL)) - goto out_repeat_count; + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->invalid_paths); + spin_lock_init(&s->lock); + } return s; - -out_repeat_count: - free_percpu(s->current_path); -out_current_path: - kfree(s); - return NULL;; } static int rr_create(struct path_selector *ps, unsigned argc, char **argv) @@ -105,8 +80,6 @@ static void rr_destroy(struct path_selector *ps) free_paths(&s->valid_paths); free_paths(&s->invalid_paths); - free_percpu(s->current_path); - percpu_counter_destroy(&s->repeat_count); kfree(s); ps->context = NULL; } @@ -157,6 +130,11 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path, return -EINVAL; } + if (repeat_count > 1) { + DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead"); + repeat_count = 1; + } + /* allocate the path */ pi = kmalloc(sizeof(*pi), GFP_KERNEL); if (!pi) { @@ -183,9 +161,6 @@ static void rr_fail_path(struct path_selector *ps, struct dm_path *p) struct path_info *pi = p->pscontext; spin_lock_irqsave(&s->lock, flags); - if (p == *this_cpu_ptr(s->current_path)) - set_percpu_current_path(s, NULL); - list_move(&pi->list, &s->invalid_paths); spin_unlock_irqrestore(&s->lock, flags); } @@ -208,29 +183,15 @@ static struct dm_path *rr_select_path(struct path_selector *ps, size_t nr_bytes) unsigned long flags; struct selector *s = ps->context; struct path_info *pi = NULL; - struct dm_path *current_path = NULL; - - local_irq_save(flags); - current_path = *this_cpu_ptr(s->current_path); - if (current_path) { - percpu_counter_dec(&s->repeat_count); - if (percpu_counter_read_positive(&s->repeat_count) > 0) { - local_irq_restore(flags); - return current_path; - } - } - spin_lock(&s->lock); + spin_lock_irqsave(&s->lock, flags); if (!list_empty(&s->valid_paths)) { pi = list_entry(s->valid_paths.next, struct path_info, list); list_move_tail(&pi->list, &s->valid_paths); - percpu_counter_set(&s->repeat_count, pi->repeat_count); - set_percpu_current_path(s, pi->path); - current_path = pi->path; } spin_unlock_irqrestore(&s->lock, flags); - return current_path; + return pi ? pi->path : NULL; } static struct path_selector_type rr_ps = { diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 38b05f23b96c..0250e7e521ab 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -175,6 +175,7 @@ static void dm_stat_free(struct rcu_head *head) int cpu; struct dm_stat *s = container_of(head, struct dm_stat, rcu_head); + kfree(s->histogram_boundaries); kfree(s->program_id); kfree(s->aux_data); for_each_possible_cpu(cpu) { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 5bd9ab06a562..9f37d7fc2786 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -974,10 +974,61 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) } EXPORT_SYMBOL_GPL(dm_accept_partial_bio); +/* + * Flush current->bio_list when the target map method blocks. + * This fixes deadlocks in snapshot and possibly in other targets. + */ +struct dm_offload { + struct blk_plug plug; + struct blk_plug_cb cb; +}; + +static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule) +{ + struct dm_offload *o = container_of(cb, struct dm_offload, cb); + struct bio_list list; + struct bio *bio; + + INIT_LIST_HEAD(&o->cb.list); + + if (unlikely(!current->bio_list)) + return; + + list = *current->bio_list; + bio_list_init(current->bio_list); + + while ((bio = bio_list_pop(&list))) { + struct bio_set *bs = bio->bi_pool; + if (unlikely(!bs) || bs == fs_bio_set) { + bio_list_add(current->bio_list, bio); + continue; + } + + spin_lock(&bs->rescue_lock); + bio_list_add(&bs->rescue_list, bio); + queue_work(bs->rescue_workqueue, &bs->rescue_work); + spin_unlock(&bs->rescue_lock); + } +} + +static void dm_offload_start(struct dm_offload *o) +{ + blk_start_plug(&o->plug); + o->cb.callback = flush_current_bio_list; + list_add(&o->cb.list, ¤t->plug->cb_list); +} + +static void dm_offload_end(struct dm_offload *o) +{ + list_del(&o->cb.list); + blk_finish_plug(&o->plug); +} + static void __map_bio(struct dm_target_io *tio) { int r; sector_t sector; + struct dm_offload o; struct bio *clone = &tio->clone; struct dm_target *ti = tio->ti; @@ -990,7 +1041,11 @@ static void __map_bio(struct dm_target_io *tio) */ atomic_inc(&tio->io->io_count); sector = clone->bi_iter.bi_sector; + + dm_offload_start(&o); r = ti->type->map(ti, clone); + dm_offload_end(&o); + if (r == DM_MAPIO_REMAPPED) { /* the bio has been remapped so dispatch it */ diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 7938cd21fa4c..185dc60360b5 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -976,6 +976,27 @@ int dm_array_cursor_next(struct dm_array_cursor *c) } EXPORT_SYMBOL_GPL(dm_array_cursor_next); +int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count) +{ + int r; + + do { + uint32_t remaining = le32_to_cpu(c->ab->nr_entries) - c->index; + + if (count < remaining) { + c->index += count; + return 0; + } + + count -= remaining; + r = dm_array_cursor_next(c); + + } while (!r); + + return r; +} +EXPORT_SYMBOL_GPL(dm_array_cursor_skip); + void dm_array_cursor_get_value(struct dm_array_cursor *c, void **value_le) { *value_le = element_at(c->info, c->ab, c->index); diff --git a/drivers/md/persistent-data/dm-array.h b/drivers/md/persistent-data/dm-array.h index 27ee49a55473..d7d2d579c662 100644 --- a/drivers/md/persistent-data/dm-array.h +++ b/drivers/md/persistent-data/dm-array.h @@ -207,6 +207,7 @@ void dm_array_cursor_end(struct dm_array_cursor *c); uint32_t dm_array_cursor_index(struct dm_array_cursor *c); int dm_array_cursor_next(struct dm_array_cursor *c); +int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count); /* * value_le is only valid while the cursor points at the current value. diff --git a/drivers/md/persistent-data/dm-bitset.c b/drivers/md/persistent-data/dm-bitset.c index 36f7cc2c7109..b7208d82e748 100644 --- a/drivers/md/persistent-data/dm-bitset.c +++ b/drivers/md/persistent-data/dm-bitset.c @@ -39,6 +39,48 @@ int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *root) } EXPORT_SYMBOL_GPL(dm_bitset_empty); +struct packer_context { + bit_value_fn fn; + unsigned nr_bits; + void *context; +}; + +static int pack_bits(uint32_t index, void *value, void *context) +{ + int r; + struct packer_context *p = context; + unsigned bit, nr = min(64u, p->nr_bits - (index * 64)); + uint64_t word = 0; + bool bv; + + for (bit = 0; bit < nr; bit++) { + r = p->fn(index * 64 + bit, &bv, p->context); + if (r) + return r; + + if (bv) + set_bit(bit, (unsigned long *) &word); + else + clear_bit(bit, (unsigned long *) &word); + } + + *((__le64 *) value) = cpu_to_le64(word); + + return 0; +} + +int dm_bitset_new(struct dm_disk_bitset *info, dm_block_t *root, + uint32_t size, bit_value_fn fn, void *context) +{ + struct packer_context p; + p.fn = fn; + p.nr_bits = size; + p.context = context; + + return dm_array_new(&info->array_info, root, dm_div_up(size, 64), pack_bits, &p); +} +EXPORT_SYMBOL_GPL(dm_bitset_new); + int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t root, uint32_t old_nr_entries, uint32_t new_nr_entries, bool default_value, dm_block_t *new_root) @@ -168,4 +210,108 @@ int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root, } EXPORT_SYMBOL_GPL(dm_bitset_test_bit); +static int cursor_next_array_entry(struct dm_bitset_cursor *c) +{ + int r; + __le64 *value; + + r = dm_array_cursor_next(&c->cursor); + if (r) + return r; + + dm_array_cursor_get_value(&c->cursor, (void **) &value); + c->array_index++; + c->bit_index = 0; + c->current_bits = le64_to_cpu(*value); + return 0; +} + +int dm_bitset_cursor_begin(struct dm_disk_bitset *info, + dm_block_t root, uint32_t nr_entries, + struct dm_bitset_cursor *c) +{ + int r; + __le64 *value; + + if (!nr_entries) + return -ENODATA; + + c->info = info; + c->entries_remaining = nr_entries; + + r = dm_array_cursor_begin(&info->array_info, root, &c->cursor); + if (r) + return r; + + dm_array_cursor_get_value(&c->cursor, (void **) &value); + c->array_index = 0; + c->bit_index = 0; + c->current_bits = le64_to_cpu(*value); + + return r; +} +EXPORT_SYMBOL_GPL(dm_bitset_cursor_begin); + +void dm_bitset_cursor_end(struct dm_bitset_cursor *c) +{ + return dm_array_cursor_end(&c->cursor); +} +EXPORT_SYMBOL_GPL(dm_bitset_cursor_end); + +int dm_bitset_cursor_next(struct dm_bitset_cursor *c) +{ + int r = 0; + + if (!c->entries_remaining) + return -ENODATA; + + c->entries_remaining--; + if (++c->bit_index > 63) + r = cursor_next_array_entry(c); + + return r; +} +EXPORT_SYMBOL_GPL(dm_bitset_cursor_next); + +int dm_bitset_cursor_skip(struct dm_bitset_cursor *c, uint32_t count) +{ + int r; + __le64 *value; + uint32_t nr_array_skip; + uint32_t remaining_in_word = 64 - c->bit_index; + + if (c->entries_remaining < count) + return -ENODATA; + + if (count < remaining_in_word) { + c->bit_index += count; + c->entries_remaining -= count; + return 0; + + } else { + c->entries_remaining -= remaining_in_word; + count -= remaining_in_word; + } + + nr_array_skip = (count / 64) + 1; + r = dm_array_cursor_skip(&c->cursor, nr_array_skip); + if (r) + return r; + + dm_array_cursor_get_value(&c->cursor, (void **) &value); + c->entries_remaining -= count; + c->array_index += nr_array_skip; + c->bit_index = count & 63; + c->current_bits = le64_to_cpu(*value); + + return 0; +} +EXPORT_SYMBOL_GPL(dm_bitset_cursor_skip); + +bool dm_bitset_cursor_get_value(struct dm_bitset_cursor *c) +{ + return test_bit(c->bit_index, (unsigned long *) &c->current_bits); +} +EXPORT_SYMBOL_GPL(dm_bitset_cursor_get_value); + /*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-bitset.h b/drivers/md/persistent-data/dm-bitset.h index c2287d672ef5..df888da04ee1 100644 --- a/drivers/md/persistent-data/dm-bitset.h +++ b/drivers/md/persistent-data/dm-bitset.h @@ -93,6 +93,22 @@ void dm_disk_bitset_init(struct dm_transaction_manager *tm, int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *new_root); /* + * Creates a new bitset populated with values provided by a callback + * function. This is more efficient than creating an empty bitset, + * resizing, and then setting values since that process incurs a lot of + * copying. + * + * info - describes the array + * root - the root block of the array on disk + * size - the number of entries in the array + * fn - the callback + * context - passed to the callback + */ +typedef int (*bit_value_fn)(uint32_t index, bool *value, void *context); +int dm_bitset_new(struct dm_disk_bitset *info, dm_block_t *root, + uint32_t size, bit_value_fn fn, void *context); + +/* * Resize the bitset. * * info - describes the bitset @@ -161,6 +177,29 @@ int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root, int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root, dm_block_t *new_root); +struct dm_bitset_cursor { + struct dm_disk_bitset *info; + struct dm_array_cursor cursor; + + uint32_t entries_remaining; + uint32_t array_index; + uint32_t bit_index; + uint64_t current_bits; +}; + +/* + * Make sure you've flush any dm_disk_bitset and updated the root before + * using this. + */ +int dm_bitset_cursor_begin(struct dm_disk_bitset *info, + dm_block_t root, uint32_t nr_entries, + struct dm_bitset_cursor *c); +void dm_bitset_cursor_end(struct dm_bitset_cursor *c); + +int dm_bitset_cursor_next(struct dm_bitset_cursor *c); +int dm_bitset_cursor_skip(struct dm_bitset_cursor *c, uint32_t count); +bool dm_bitset_cursor_get_value(struct dm_bitset_cursor *c); + /*----------------------------------------------------------------*/ #endif /* _LINUX_DM_BITSET_H */ diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 758d90cc2733..0863905dee02 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -462,7 +462,7 @@ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, int r; p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result); - if (IS_ERR(p)) + if (unlikely(IS_ERR(p))) return PTR_ERR(p); aux = dm_bufio_get_aux_data(to_buffer(*result)); @@ -498,7 +498,7 @@ int dm_bm_write_lock(struct dm_block_manager *bm, return -EPERM; p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result); - if (IS_ERR(p)) + if (unlikely(IS_ERR(p))) return PTR_ERR(p); aux = dm_bufio_get_aux_data(to_buffer(*result)); @@ -531,7 +531,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm, int r; p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result); - if (IS_ERR(p)) + if (unlikely(IS_ERR(p))) return PTR_ERR(p); if (unlikely(!p)) return -EWOULDBLOCK; @@ -567,7 +567,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, return -EPERM; p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result); - if (IS_ERR(p)) + if (unlikely(IS_ERR(p))) return PTR_ERR(p); memset(p, 0, dm_bm_block_size(bm)); diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 20a40329d84a..02e2ee0d8a00 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -272,7 +272,12 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root) int r; struct del_stack *s; - s = kmalloc(sizeof(*s), GFP_NOIO); + /* + * dm_btree_del() is called via an ioctl, as such should be + * considered an FS op. We can't recurse back into the FS, so we + * allocate GFP_NOFS. + */ + s = kmalloc(sizeof(*s), GFP_NOFS); if (!s) return -ENOMEM; s->info = info; @@ -1139,6 +1144,17 @@ int dm_btree_cursor_next(struct dm_btree_cursor *c) } EXPORT_SYMBOL_GPL(dm_btree_cursor_next); +int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count) +{ + int r = 0; + + while (count-- && !r) + r = dm_btree_cursor_next(c); + + return r; +} +EXPORT_SYMBOL_GPL(dm_btree_cursor_skip); + int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le) { if (c->depth) { diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h index db9bd26adf31..3dc5bb1a4748 100644 --- a/drivers/md/persistent-data/dm-btree.h +++ b/drivers/md/persistent-data/dm-btree.h @@ -209,6 +209,7 @@ int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root, bool prefetch_leaves, struct dm_btree_cursor *c); void dm_btree_cursor_end(struct dm_btree_cursor *c); int dm_btree_cursor_next(struct dm_btree_cursor *c); +int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count); int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le); #endif /* _LINUX_DM_BTREE_H */ diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index 4c28608a0c94..829b4ce057d8 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -626,13 +626,19 @@ int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm, void *root_le, size_t len) { int r; - struct disk_sm_root *smr = root_le; + struct disk_sm_root smr; if (len < sizeof(struct disk_sm_root)) { DMERR("sm_metadata root too small"); return -ENOMEM; } + /* + * We don't know the alignment of the root_le buffer, so need to + * copy into a new structure. + */ + memcpy(&smr, root_le, sizeof(smr)); + r = sm_ll_init(ll, tm); if (r < 0) return r; @@ -644,10 +650,10 @@ int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm, ll->max_entries = metadata_ll_max_entries; ll->commit = metadata_ll_commit; - ll->nr_blocks = le64_to_cpu(smr->nr_blocks); - ll->nr_allocated = le64_to_cpu(smr->nr_allocated); - ll->bitmap_root = le64_to_cpu(smr->bitmap_root); - ll->ref_count_root = le64_to_cpu(smr->ref_count_root); + ll->nr_blocks = le64_to_cpu(smr.nr_blocks); + ll->nr_allocated = le64_to_cpu(smr.nr_allocated); + ll->bitmap_root = le64_to_cpu(smr.bitmap_root); + ll->ref_count_root = le64_to_cpu(smr.ref_count_root); return ll->open_index(ll); } diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 20557e2c60c6..4aed69d9dd17 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -544,7 +544,7 @@ static int sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks); -static struct dm_space_map ops = { +static const struct dm_space_map ops = { .destroy = sm_metadata_destroy, .extend = sm_metadata_extend, .get_nr_blocks = sm_metadata_get_nr_blocks, @@ -671,7 +671,7 @@ static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where, return -EINVAL; } -static struct dm_space_map bootstrap_ops = { +static const struct dm_space_map bootstrap_ops = { .destroy = sm_bootstrap_destroy, .extend = sm_bootstrap_extend, .get_nr_blocks = sm_bootstrap_get_nr_blocks, |