summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2017-02-21 23:11:41 +0300
committer Linus Torvalds <torvalds@linux-foundation.org> 2017-02-21 23:11:41 +0300
commit 7a771ceac771d009f7203c40b256b0608d7ea2f8 (patch)
tree 940260bccb165f47669397515c00900629c01803
parent e67bd12d6036ae3de9eeb0ba52e43691264ec850 (diff)
parent d67a5f4b5947aba4bfe9a80a2b86079c215ca755 (diff)
download linux-7a771ceac771d009f7203c40b256b0608d7ea2f8.tar.xz
Merge tag 'dm-4.11-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:

 - Fix dm-raid transient device failure processing and other smaller
   tweaks.

 - Add journal support to the DM raid target to close the 'write hole'
   on raid 4/5/6.

 - Fix dm-cache corruption, due to rounding bug, when cache exceeds 2TB.

 - Add 'metadata2' feature to dm-cache to separate the dirty bitset out
   from other cache metadata. This improves speed of shutting down a
   large cache device (which implies writing out dirty bits).

 - Fix a memory leak during dm-stats data structure destruction.

 - Fix a DM multipath round-robin path selector performance regression
   that was caused by less precise balancing across all paths.

 - Lastly, introduce a DM core fix for a long-standing DM snapshot
   deadlock that is rooted in the complexity of the device stack used in
   conjunction with block core maintaining bios on current->bio_list to
   manage recursion in generic_make_request(). A more comprehensive fix
   to block core (and its hook in the cpu scheduler) would be wonderful
   but this DM-specific fix is pragmatic considering how difficult it
   has been to make progress on a generic fix.
* tag 'dm-4.11-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (22 commits)
  dm: flush queued bios when process blocks to avoid deadlock
  dm round robin: revert "use percpu 'repeat_count' and 'current_path'"
  dm stats: fix a leaked s->histogram_boundaries array
  dm space map metadata: constify dm_space_map structures
  dm cache metadata: use cursor api in blocks_are_clean_separate_dirty()
  dm persistent data: add cursor skip functions to the cursor APIs
  dm cache metadata: use dm_bitset_new() to create the dirty bitset in format 2
  dm bitset: add dm_bitset_new()
  dm cache metadata: name the cache block that couldn't be loaded
  dm cache metadata: add "metadata2" feature
  dm cache metadata: use bitset cursor api to load discard bitset
  dm bitset: introduce cursor api
  dm btree: use GFP_NOFS in dm_btree_del()
  dm space map common: memcpy the disk root to ensure it's arch aligned
  dm block manager: add unlikely() annotations on dm_bufio error paths
  dm cache: fix corruption seen when using cache > 2TB
  dm raid: cleanup awkward branching in raid_message() option processing
  dm raid: use mddev rather than rdev->mddev
  dm raid: use read_disk_sb() throughout
  dm raid: add raid4/5/6 journaling support
  ...
-rw-r--r-- Documentation/device-mapper/cache.txt | 4
-rw-r--r-- Documentation/device-mapper/dm-raid.txt | 17
-rw-r--r-- drivers/md/dm-cache-metadata.c | 353
-rw-r--r-- drivers/md/dm-cache-metadata.h | 11
-rw-r--r-- drivers/md/dm-cache-target.c | 44
-rw-r--r-- drivers/md/dm-raid.c | 296
-rw-r--r-- drivers/md/dm-round-robin.c | 67
-rw-r--r-- drivers/md/dm-stats.c | 1
-rw-r--r-- drivers/md/dm.c | 55
-rw-r--r-- drivers/md/persistent-data/dm-array.c | 21
-rw-r--r-- drivers/md/persistent-data/dm-array.h | 1
-rw-r--r-- drivers/md/persistent-data/dm-bitset.c | 146
-rw-r--r-- drivers/md/persistent-data/dm-bitset.h | 39
-rw-r--r-- drivers/md/persistent-data/dm-block-manager.c | 8
-rw-r--r-- drivers/md/persistent-data/dm-btree.c | 18
-rw-r--r-- drivers/md/persistent-data/dm-btree.h | 1
-rw-r--r-- drivers/md/persistent-data/dm-space-map-common.c | 16
-rw-r--r-- drivers/md/persistent-data/dm-space-map-metadata.c | 4
18 files changed, 875 insertions, 227 deletions
diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt
index 785eab87aa71..f228604ddbcd 100644
--- a/Documentation/device-mapper/cache.txt
+++ b/Documentation/device-mapper/cache.txt
@@ -207,6 +207,10 @@ Optional feature arguments are:
block, then the cache block is invalidated.
To enable passthrough mode the cache must be clean.
+ metadata2 : use version 2 of the metadata. This stores the dirty bits
+ in a separate btree, which improves speed of shutting
+ down the cache.
+
A policy called 'default' is always registered. This is an alias for
the policy we currently think is giving best all round performance.
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index 5e3786fd9ea7..0d199353e477 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -161,6 +161,15 @@ The target is named "raid" and it accepts the following parameters:
the RAID type (i.e. the allocation algorithm) as well, e.g.
changing from raid5_ls to raid5_n.
+ [journal_dev <dev>]
+ This option adds a journal device to raid4/5/6 raid sets and
+ uses it to close the 'write hole' caused by the non-atomic updates
+ to the component devices which can cause data loss during recovery.
+ The journal device is used as writethrough thus causing writes to
+ be throttled versus non-journaled raid4/5/6 sets.
+ Takeover/reshape is not possible with a raid4/5/6 journal device;
+ it has to be deconfigured before requesting these.
+
<#raid_devs>: The number of devices composing the array.
Each device consists of two entries. The first is the device
containing the metadata (if any); the second is the one containing the
@@ -245,6 +254,9 @@ recovery. Here is a fuller description of the individual fields:
<data_offset> The current data offset to the start of the user data on
each component device of a raid set (see the respective
raid parameter to support out-of-place reshaping).
+ <journal_char> 'A' - active raid4/5/6 journal device.
+ 'D' - dead journal device.
+ '-' - no journal device.
Message Interface
@@ -314,3 +326,8 @@ Version History
1.9.0 Add support for RAID level takeover/reshape/region size
and set size reduction.
1.9.1 Fix activation of existing RAID 4/10 mapped devices
+1.9.2 Don't emit '- -' on the status table line in case the constructor
+ fails reading a superblock. Correctly emit 'maj:min1 maj:min2' and
+ 'D' on the status line. If '- -' is passed into the constructor, emit
+ '- -' on the table line and '-' as the status line health character.
+1.10.0 Add support for raid4/5/6 journal device
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 624fe4319b24..e4c2c1a1e993 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -25,7 +25,7 @@
* defines a range of metadata versions that this module can handle.
*/
#define MIN_CACHE_VERSION 1
-#define MAX_CACHE_VERSION 1
+#define MAX_CACHE_VERSION 2
#define CACHE_METADATA_CACHE_SIZE 64
@@ -55,6 +55,7 @@ enum mapping_bits {
/*
* The data on the cache is different from that on the origin.
+ * This flag is only used by metadata format 1.
*/
M_DIRTY = 2
};
@@ -93,12 +94,18 @@ struct cache_disk_superblock {
__le32 write_misses;
__le32 policy_version[CACHE_POLICY_VERSION_SIZE];
+
+ /*
+ * Metadata format 2 fields.
+ */
+ __le64 dirty_root;
} __packed;
struct dm_cache_metadata {
atomic_t ref_count;
struct list_head list;
+ unsigned version;
struct block_device *bdev;
struct dm_block_manager *bm;
struct dm_space_map *metadata_sm;
@@ -142,11 +149,18 @@ struct dm_cache_metadata {
bool fail_io:1;
/*
+ * Metadata format 2 fields.
+ */
+ dm_block_t dirty_root;
+ struct dm_disk_bitset dirty_info;
+
+ /*
* These structures are used when loading metadata. They're too
* big to put on the stack.
*/
struct dm_array_cursor mapping_cursor;
struct dm_array_cursor hint_cursor;
+ struct dm_bitset_cursor dirty_cursor;
};
/*-------------------------------------------------------------------
@@ -170,6 +184,7 @@ static void sb_prepare_for_write(struct dm_block_validator *v,
static int check_metadata_version(struct cache_disk_superblock *disk_super)
{
uint32_t metadata_version = le32_to_cpu(disk_super->version);
+
if (metadata_version < MIN_CACHE_VERSION || metadata_version > MAX_CACHE_VERSION) {
DMERR("Cache metadata version %u found, but only versions between %u and %u supported.",
metadata_version, MIN_CACHE_VERSION, MAX_CACHE_VERSION);
@@ -310,6 +325,11 @@ static void __copy_sm_root(struct dm_cache_metadata *cmd,
sizeof(cmd->metadata_space_map_root));
}
+static bool separate_dirty_bits(struct dm_cache_metadata *cmd)
+{
+ return cmd->version >= 2;
+}
+
static int __write_initial_superblock(struct dm_cache_metadata *cmd)
{
int r;
@@ -341,7 +361,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
disk_super->flags = 0;
memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC);
- disk_super->version = cpu_to_le32(MAX_CACHE_VERSION);
+ disk_super->version = cpu_to_le32(cmd->version);
memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version));
disk_super->policy_hint_size = 0;
@@ -362,6 +382,9 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
disk_super->write_hits = cpu_to_le32(0);
disk_super->write_misses = cpu_to_le32(0);
+ if (separate_dirty_bits(cmd))
+ disk_super->dirty_root = cpu_to_le64(cmd->dirty_root);
+
return dm_tm_commit(cmd->tm, sblock);
}
@@ -382,6 +405,13 @@ static int __format_metadata(struct dm_cache_metadata *cmd)
if (r < 0)
goto bad;
+ if (separate_dirty_bits(cmd)) {
+ dm_disk_bitset_init(cmd->tm, &cmd->dirty_info);
+ r = dm_bitset_empty(&cmd->dirty_info, &cmd->dirty_root);
+ if (r < 0)
+ goto bad;
+ }
+
dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
r = dm_bitset_empty(&cmd->discard_info, &cmd->discard_root);
if (r < 0)
@@ -407,9 +437,10 @@ bad:
static int __check_incompat_features(struct cache_disk_superblock *disk_super,
struct dm_cache_metadata *cmd)
{
- uint32_t features;
+ uint32_t incompat_flags, features;
- features = le32_to_cpu(disk_super->incompat_flags) & ~DM_CACHE_FEATURE_INCOMPAT_SUPP;
+ incompat_flags = le32_to_cpu(disk_super->incompat_flags);
+ features = incompat_flags & ~DM_CACHE_FEATURE_INCOMPAT_SUPP;
if (features) {
DMERR("could not access metadata due to unsupported optional features (%lx).",
(unsigned long)features);
@@ -470,6 +501,7 @@ static int __open_metadata(struct dm_cache_metadata *cmd)
}
__setup_mapping_info(cmd);
+ dm_disk_bitset_init(cmd->tm, &cmd->dirty_info);
dm_disk_bitset_init(cmd->tm, &cmd->discard_info);
sb_flags = le32_to_cpu(disk_super->flags);
cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags);
@@ -548,6 +580,7 @@ static unsigned long clear_clean_shutdown(unsigned long flags)
static void read_superblock_fields(struct dm_cache_metadata *cmd,
struct cache_disk_superblock *disk_super)
{
+ cmd->version = le32_to_cpu(disk_super->version);
cmd->flags = le32_to_cpu(disk_super->flags);
cmd->root = le64_to_cpu(disk_super->mapping_root);
cmd->hint_root = le64_to_cpu(disk_super->hint_root);
@@ -567,6 +600,9 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd,
cmd->stats.write_hits = le32_to_cpu(disk_super->write_hits);
cmd->stats.write_misses = le32_to_cpu(disk_super->write_misses);
+ if (separate_dirty_bits(cmd))
+ cmd->dirty_root = le64_to_cpu(disk_super->dirty_root);
+
cmd->changed = false;
}
@@ -625,6 +661,13 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
*/
BUILD_BUG_ON(sizeof(struct cache_disk_superblock) > 512);
+ if (separate_dirty_bits(cmd)) {
+ r = dm_bitset_flush(&cmd->dirty_info, cmd->dirty_root,
+ &cmd->dirty_root);
+ if (r)
+ return r;
+ }
+
r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root,
&cmd->discard_root);
if (r)
@@ -649,6 +692,8 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
update_flags(disk_super, mutator);
disk_super->mapping_root = cpu_to_le64(cmd->root);
+ if (separate_dirty_bits(cmd))
+ disk_super->dirty_root = cpu_to_le64(cmd->dirty_root);
disk_super->hint_root = cpu_to_le64(cmd->hint_root);
disk_super->discard_root = cpu_to_le64(cmd->discard_root);
disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
@@ -698,7 +743,8 @@ static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned *flags)
static struct dm_cache_metadata *metadata_open(struct block_device *bdev,
sector_t data_block_size,
bool may_format_device,
- size_t policy_hint_size)
+ size_t policy_hint_size,
+ unsigned metadata_version)
{
int r;
struct dm_cache_metadata *cmd;
@@ -709,6 +755,7 @@ static struct dm_cache_metadata *metadata_open(struct block_device *bdev,
return ERR_PTR(-ENOMEM);
}
+ cmd->version = metadata_version;
atomic_set(&cmd->ref_count, 1);
init_rwsem(&cmd->root_lock);
cmd->bdev = bdev;
@@ -757,7 +804,8 @@ static struct dm_cache_metadata *lookup(struct block_device *bdev)
static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev,
sector_t data_block_size,
bool may_format_device,
- size_t policy_hint_size)
+ size_t policy_hint_size,
+ unsigned metadata_version)
{
struct dm_cache_metadata *cmd, *cmd2;
@@ -768,7 +816,8 @@ static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev,
if (cmd)
return cmd;
- cmd = metadata_open(bdev, data_block_size, may_format_device, policy_hint_size);
+ cmd = metadata_open(bdev, data_block_size, may_format_device,
+ policy_hint_size, metadata_version);
if (!IS_ERR(cmd)) {
mutex_lock(&table_lock);
cmd2 = lookup(bdev);
@@ -800,10 +849,11 @@ static bool same_params(struct dm_cache_metadata *cmd, sector_t data_block_size)
struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
sector_t data_block_size,
bool may_format_device,
- size_t policy_hint_size)
+ size_t policy_hint_size,
+ unsigned metadata_version)
{
- struct dm_cache_metadata *cmd = lookup_or_open(bdev, data_block_size,
- may_format_device, policy_hint_size);
+ struct dm_cache_metadata *cmd = lookup_or_open(bdev, data_block_size, may_format_device,
+ policy_hint_size, metadata_version);
if (!IS_ERR(cmd) && !same_params(cmd, data_block_size)) {
dm_cache_metadata_close(cmd);
@@ -829,8 +879,8 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
/*
* Checks that the given cache block is either unmapped or clean.
*/
-static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b,
- bool *result)
+static int block_clean_combined_dirty(struct dm_cache_metadata *cmd, dm_cblock_t b,
+ bool *result)
{
int r;
__le64 value;
@@ -838,10 +888,8 @@ static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b,
unsigned flags;
r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(b), &value);
- if (r) {
- DMERR("block_unmapped_or_clean failed");
+ if (r)
return r;
- }
unpack_value(value, &ob, &flags);
*result = !((flags & M_VALID) && (flags & M_DIRTY));
@@ -849,17 +897,19 @@ static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b,
return 0;
}
-static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
- dm_cblock_t begin, dm_cblock_t end,
- bool *result)
+static int blocks_are_clean_combined_dirty(struct dm_cache_metadata *cmd,
+ dm_cblock_t begin, dm_cblock_t end,
+ bool *result)
{
int r;
*result = true;
while (begin != end) {
- r = block_unmapped_or_clean(cmd, begin, result);
- if (r)
+ r = block_clean_combined_dirty(cmd, begin, result);
+ if (r) {
+ DMERR("block_clean_combined_dirty failed");
return r;
+ }
if (!*result) {
DMERR("cache block %llu is dirty",
@@ -873,6 +923,67 @@ static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
return 0;
}
+static int blocks_are_clean_separate_dirty(struct dm_cache_metadata *cmd,
+ dm_cblock_t begin, dm_cblock_t end,
+ bool *result)
+{
+ int r;
+ bool dirty_flag;
+ *result = true;
+
+ r = dm_bitset_cursor_begin(&cmd->dirty_info, cmd->dirty_root,
+ from_cblock(begin), &cmd->dirty_cursor);
+ if (r) {
+ DMERR("%s: dm_bitset_cursor_begin for dirty failed", __func__);
+ return r;
+ }
+
+ r = dm_bitset_cursor_skip(&cmd->dirty_cursor, from_cblock(begin));
+ if (r) {
+ DMERR("%s: dm_bitset_cursor_skip for dirty failed", __func__);
+ dm_bitset_cursor_end(&cmd->dirty_cursor);
+ return r;
+ }
+
+ while (begin != end) {
+ /*
+ * We assume that unmapped blocks have their dirty bit
+ * cleared.
+ */
+ dirty_flag = dm_bitset_cursor_get_value(&cmd->dirty_cursor);
+ if (dirty_flag) {
+ DMERR("%s: cache block %llu is dirty", __func__,
+ (unsigned long long) from_cblock(begin));
+ dm_bitset_cursor_end(&cmd->dirty_cursor);
+ *result = false;
+ return 0;
+ }
+
+ r = dm_bitset_cursor_next(&cmd->dirty_cursor);
+ if (r) {
+ DMERR("%s: dm_bitset_cursor_next for dirty failed", __func__);
+ dm_bitset_cursor_end(&cmd->dirty_cursor);
+ return r;
+ }
+
+ begin = to_cblock(from_cblock(begin) + 1);
+ }
+
+ dm_bitset_cursor_end(&cmd->dirty_cursor);
+
+ return 0;
+}
+
+static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
+ dm_cblock_t begin, dm_cblock_t end,
+ bool *result)
+{
+ if (separate_dirty_bits(cmd))
+ return blocks_are_clean_separate_dirty(cmd, begin, end, result);
+ else
+ return blocks_are_clean_combined_dirty(cmd, begin, end, result);
+}
+
static bool cmd_write_lock(struct dm_cache_metadata *cmd)
{
down_write(&cmd->root_lock);
@@ -950,8 +1061,18 @@ int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks),
from_cblock(new_cache_size),
&null_mapping, &cmd->root);
- if (!r)
- cmd->cache_blocks = new_cache_size;
+ if (r)
+ goto out;
+
+ if (separate_dirty_bits(cmd)) {
+ r = dm_bitset_resize(&cmd->dirty_info, cmd->dirty_root,
+ from_cblock(cmd->cache_blocks), from_cblock(new_cache_size),
+ false, &cmd->dirty_root);
+ if (r)
+ goto out;
+ }
+
+ cmd->cache_blocks = new_cache_size;
cmd->changed = true;
out:
@@ -995,14 +1116,6 @@ static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
from_dblock(b), &cmd->discard_root);
}
-static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b,
- bool *is_discarded)
-{
- return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root,
- from_dblock(b), &cmd->discard_root,
- is_discarded);
-}
-
static int __discard(struct dm_cache_metadata *cmd,
dm_dblock_t dblock, bool discard)
{
@@ -1032,22 +1145,38 @@ static int __load_discards(struct dm_cache_metadata *cmd,
load_discard_fn fn, void *context)
{
int r = 0;
- dm_block_t b;
- bool discard;
+ uint32_t b;
+ struct dm_bitset_cursor c;
- for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
- dm_dblock_t dblock = to_dblock(b);
+ if (from_dblock(cmd->discard_nr_blocks) == 0)
+ /* nothing to do */
+ return 0;
- if (cmd->clean_when_opened) {
- r = __is_discarded(cmd, dblock, &discard);
- if (r)
- return r;
- } else
- discard = false;
+ if (cmd->clean_when_opened) {
+ r = dm_bitset_flush(&cmd->discard_info, cmd->discard_root, &cmd->discard_root);
+ if (r)
+ return r;
- r = fn(context, cmd->discard_block_size, dblock, discard);
+ r = dm_bitset_cursor_begin(&cmd->discard_info, cmd->discard_root,
+ from_dblock(cmd->discard_nr_blocks), &c);
if (r)
- break;
+ return r;
+
+ for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
+ r = fn(context, cmd->discard_block_size, to_dblock(b),
+ dm_bitset_cursor_get_value(&c));
+ if (r)
+ break;
+ }
+
+ dm_bitset_cursor_end(&c);
+
+ } else {
+ for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) {
+ r = fn(context, cmd->discard_block_size, to_dblock(b), false);
+ if (r)
+ return r;
+ }
}
return r;
@@ -1177,11 +1306,11 @@ static bool hints_array_available(struct dm_cache_metadata *cmd,
hints_array_initialized(cmd);
}
-static int __load_mapping(struct dm_cache_metadata *cmd,
- uint64_t cb, bool hints_valid,
- struct dm_array_cursor *mapping_cursor,
- struct dm_array_cursor *hint_cursor,
- load_mapping_fn fn, void *context)
+static int __load_mapping_v1(struct dm_cache_metadata *cmd,
+ uint64_t cb, bool hints_valid,
+ struct dm_array_cursor *mapping_cursor,
+ struct dm_array_cursor *hint_cursor,
+ load_mapping_fn fn, void *context)
{
int r = 0;
@@ -1206,8 +1335,51 @@ static int __load_mapping(struct dm_cache_metadata *cmd,
r = fn(context, oblock, to_cblock(cb), flags & M_DIRTY,
le32_to_cpu(hint), hints_valid);
- if (r)
- DMERR("policy couldn't load cblock");
+ if (r) {
+ DMERR("policy couldn't load cache block %llu",
+ (unsigned long long) from_cblock(to_cblock(cb)));
+ }
+ }
+
+ return r;
+}
+
+static int __load_mapping_v2(struct dm_cache_metadata *cmd,
+ uint64_t cb, bool hints_valid,
+ struct dm_array_cursor *mapping_cursor,
+ struct dm_array_cursor *hint_cursor,
+ struct dm_bitset_cursor *dirty_cursor,
+ load_mapping_fn fn, void *context)
+{
+ int r = 0;
+
+ __le64 mapping;
+ __le32 hint = 0;
+
+ __le64 *mapping_value_le;
+ __le32 *hint_value_le;
+
+ dm_oblock_t oblock;
+ unsigned flags;
+ bool dirty;
+
+ dm_array_cursor_get_value(mapping_cursor, (void **) &mapping_value_le);
+ memcpy(&mapping, mapping_value_le, sizeof(mapping));
+ unpack_value(mapping, &oblock, &flags);
+
+ if (flags & M_VALID) {
+ if (hints_valid) {
+ dm_array_cursor_get_value(hint_cursor, (void **) &hint_value_le);
+ memcpy(&hint, hint_value_le, sizeof(hint));
+ }
+
+ dirty = dm_bitset_cursor_get_value(dirty_cursor);
+ r = fn(context, oblock, to_cblock(cb), dirty,
+ le32_to_cpu(hint), hints_valid);
+ if (r) {
+ DMERR("policy couldn't load cache block %llu",
+ (unsigned long long) from_cblock(to_cblock(cb)));
+ }
}
return r;
@@ -1238,10 +1410,28 @@ static int __load_mappings(struct dm_cache_metadata *cmd,
}
}
+ if (separate_dirty_bits(cmd)) {
+ r = dm_bitset_cursor_begin(&cmd->dirty_info, cmd->dirty_root,
+ from_cblock(cmd->cache_blocks),
+ &cmd->dirty_cursor);
+ if (r) {
+ dm_array_cursor_end(&cmd->hint_cursor);
+ dm_array_cursor_end(&cmd->mapping_cursor);
+ return r;
+ }
+ }
+
for (cb = 0; ; cb++) {
- r = __load_mapping(cmd, cb, hints_valid,
- &cmd->mapping_cursor, &cmd->hint_cursor,
- fn, context);
+ if (separate_dirty_bits(cmd))
+ r = __load_mapping_v2(cmd, cb, hints_valid,
+ &cmd->mapping_cursor,
+ &cmd->hint_cursor,
+ &cmd->dirty_cursor,
+ fn, context);
+ else
+ r = __load_mapping_v1(cmd, cb, hints_valid,
+ &cmd->mapping_cursor, &cmd->hint_cursor,
+ fn, context);
if (r)
goto out;
@@ -1264,12 +1454,23 @@ static int __load_mappings(struct dm_cache_metadata *cmd,
goto out;
}
}
+
+ if (separate_dirty_bits(cmd)) {
+ r = dm_bitset_cursor_next(&cmd->dirty_cursor);
+ if (r) {
+ DMERR("dm_bitset_cursor_next for dirty failed");
+ goto out;
+ }
+ }
}
out:
dm_array_cursor_end(&cmd->mapping_cursor);
if (hints_valid)
dm_array_cursor_end(&cmd->hint_cursor);
+ if (separate_dirty_bits(cmd))
+ dm_bitset_cursor_end(&cmd->dirty_cursor);
+
return r;
}
@@ -1352,13 +1553,55 @@ static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty
}
-int dm_cache_set_dirty(struct dm_cache_metadata *cmd,
- dm_cblock_t cblock, bool dirty)
+static int __set_dirty_bits_v1(struct dm_cache_metadata *cmd, unsigned nr_bits, unsigned long *bits)
+{
+ int r;
+ unsigned i;
+ for (i = 0; i < nr_bits; i++) {
+ r = __dirty(cmd, to_cblock(i), test_bit(i, bits));
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
+static int is_dirty_callback(uint32_t index, bool *value, void *context)
+{
+ unsigned long *bits = context;
+ *value = test_bit(index, bits);
+ return 0;
+}
+
+static int __set_dirty_bits_v2(struct dm_cache_metadata *cmd, unsigned nr_bits, unsigned long *bits)
+{
+ int r = 0;
+
+ /* nr_bits is really just a sanity check */
+ if (nr_bits != from_cblock(cmd->cache_blocks)) {
+ DMERR("dirty bitset is wrong size");
+ return -EINVAL;
+ }
+
+ r = dm_bitset_del(&cmd->dirty_info, cmd->dirty_root);
+ if (r)
+ return r;
+
+ cmd->changed = true;
+ return dm_bitset_new(&cmd->dirty_info, &cmd->dirty_root, nr_bits, is_dirty_callback, bits);
+}
+
+int dm_cache_set_dirty_bits(struct dm_cache_metadata *cmd,
+ unsigned nr_bits,
+ unsigned long *bits)
{
int r;
WRITE_LOCK(cmd);
- r = __dirty(cmd, cblock, dirty);
+ if (separate_dirty_bits(cmd))
+ r = __set_dirty_bits_v2(cmd, nr_bits, bits);
+ else
+ r = __set_dirty_bits_v1(cmd, nr_bits, bits);
WRITE_UNLOCK(cmd);
return r;
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index 8528744195e5..4f07c08cf107 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -45,18 +45,20 @@
* As these various flags are defined they should be added to the
* following masks.
*/
+
#define DM_CACHE_FEATURE_COMPAT_SUPP 0UL
#define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL
#define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL
/*
- * Reopens or creates a new, empty metadata volume.
- * Returns an ERR_PTR on failure.
+ * Reopens or creates a new, empty metadata volume. Returns an ERR_PTR on
+ * failure. If reopening then features must match.
*/
struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
sector_t data_block_size,
bool may_format_device,
- size_t policy_hint_size);
+ size_t policy_hint_size,
+ unsigned metadata_version);
void dm_cache_metadata_close(struct dm_cache_metadata *cmd);
@@ -91,7 +93,8 @@ int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
load_mapping_fn fn,
void *context);
-int dm_cache_set_dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty);
+int dm_cache_set_dirty_bits(struct dm_cache_metadata *cmd,
+ unsigned nr_bits, unsigned long *bits);
struct dm_cache_statistics {
uint32_t read_hits;
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 894bc14469c8..9c689b34e6e7 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -179,6 +179,7 @@ enum cache_io_mode {
struct cache_features {
enum cache_metadata_mode mode;
enum cache_io_mode io_mode;
+ unsigned metadata_version;
};
struct cache_stats {
@@ -248,7 +249,7 @@ struct cache {
/*
* Fields for converting from sectors to blocks.
*/
- uint32_t sectors_per_block;
+ sector_t sectors_per_block;
int sectors_per_block_shift;
spinlock_t lock;
@@ -2534,13 +2535,14 @@ static void init_features(struct cache_features *cf)
{
cf->mode = CM_WRITE;
cf->io_mode = CM_IO_WRITEBACK;
+ cf->metadata_version = 1;
}
static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
char **error)
{
static struct dm_arg _args[] = {
- {0, 1, "Invalid number of cache feature arguments"},
+ {0, 2, "Invalid number of cache feature arguments"},
};
int r;
@@ -2566,6 +2568,9 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
else if (!strcasecmp(arg, "passthrough"))
cf->io_mode = CM_IO_PASSTHROUGH;
+ else if (!strcasecmp(arg, "metadata2"))
+ cf->metadata_version = 2;
+
else {
*error = "Unrecognised cache feature requested";
return -EINVAL;
@@ -2820,7 +2825,8 @@ static int cache_create(struct cache_args *ca, struct cache **result)
cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
ca->block_size, may_format,
- dm_cache_policy_get_hint_size(cache->policy));
+ dm_cache_policy_get_hint_size(cache->policy),
+ ca->features.metadata_version);
if (IS_ERR(cmd)) {
*error = "Error creating metadata object";
r = PTR_ERR(cmd);
@@ -3165,21 +3171,16 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
static int write_dirty_bitset(struct cache *cache)
{
- unsigned i, r;
+ int r;
if (get_cache_mode(cache) >= CM_READ_ONLY)
return -EINVAL;
- for (i = 0; i < from_cblock(cache->cache_size); i++) {
- r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
- is_dirty(cache, to_cblock(i)));
- if (r) {
- metadata_operation_failed(cache, "dm_cache_set_dirty", r);
- return r;
- }
- }
+ r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset);
+ if (r)
+ metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r);
- return 0;
+ return r;
}
static int write_discard_bitset(struct cache *cache)
@@ -3540,11 +3541,11 @@ static void cache_status(struct dm_target *ti, status_type_t type,
residency = policy_residency(cache->policy);
- DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
+ DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ",
(unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
(unsigned long long)nr_blocks_metadata,
- cache->sectors_per_block,
+ (unsigned long long)cache->sectors_per_block,
(unsigned long long) from_cblock(residency),
(unsigned long long) from_cblock(cache->cache_size),
(unsigned) atomic_read(&cache->stats.read_hit),
@@ -3555,14 +3556,19 @@ static void cache_status(struct dm_target *ti, status_type_t type,
(unsigned) atomic_read(&cache->stats.promotion),
(unsigned long) atomic_read(&cache->nr_dirty));
+ if (cache->features.metadata_version == 2)
+ DMEMIT("2 metadata2 ");
+ else
+ DMEMIT("1 ");
+
if (writethrough_mode(&cache->features))
- DMEMIT("1 writethrough ");
+ DMEMIT("writethrough ");
else if (passthrough_mode(&cache->features))
- DMEMIT("1 passthrough ");
+ DMEMIT("passthrough ");
else if (writeback_mode(&cache->features))
- DMEMIT("1 writeback ");
+ DMEMIT("writeback ");
else {
DMERR("%s: internal error: unknown io mode: %d",
@@ -3810,7 +3816,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
- .version = {1, 9, 0},
+ .version = {1, 10, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index b8f978e551d7..5c9e95d66f3b 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -24,6 +24,11 @@
*/
#define MIN_FREE_RESHAPE_SPACE to_sector(4*4096)
+/*
+ * Minimum journal space 4 MiB in sectors.
+ */
+#define MIN_RAID456_JOURNAL_SPACE (4*2048)
+
static bool devices_handle_discard_safely = false;
/*
@@ -73,6 +78,9 @@ struct raid_dev {
#define __CTR_FLAG_DATA_OFFSET 13 /* 2 */ /* Only with reshapable raid4/5/6/10! */
#define __CTR_FLAG_RAID10_USE_NEAR_SETS 14 /* 2 */ /* Only with raid10! */
+/* New for v1.10.0 */
+#define __CTR_FLAG_JOURNAL_DEV 15 /* 2 */ /* Only with raid4/5/6! */
+
/*
* Flags for rs->ctr_flags field.
*/
@@ -91,6 +99,7 @@ struct raid_dev {
#define CTR_FLAG_DELTA_DISKS (1 << __CTR_FLAG_DELTA_DISKS)
#define CTR_FLAG_DATA_OFFSET (1 << __CTR_FLAG_DATA_OFFSET)
#define CTR_FLAG_RAID10_USE_NEAR_SETS (1 << __CTR_FLAG_RAID10_USE_NEAR_SETS)
+#define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV)
/*
* Definitions of various constructor flags to
@@ -163,7 +172,8 @@ struct raid_dev {
CTR_FLAG_STRIPE_CACHE | \
CTR_FLAG_REGION_SIZE | \
CTR_FLAG_DELTA_DISKS | \
- CTR_FLAG_DATA_OFFSET)
+ CTR_FLAG_DATA_OFFSET | \
+ CTR_FLAG_JOURNAL_DEV)
#define RAID6_VALID_FLAGS (CTR_FLAG_SYNC | \
CTR_FLAG_REBUILD | \
@@ -173,7 +183,8 @@ struct raid_dev {
CTR_FLAG_STRIPE_CACHE | \
CTR_FLAG_REGION_SIZE | \
CTR_FLAG_DELTA_DISKS | \
- CTR_FLAG_DATA_OFFSET)
+ CTR_FLAG_DATA_OFFSET | \
+ CTR_FLAG_JOURNAL_DEV)
/* ...valid options definitions per raid level */
/*
@@ -222,6 +233,12 @@ struct raid_set {
struct raid_type *raid_type;
struct dm_target_callbacks callbacks;
+ /* Optional raid4/5/6 journal device */
+ struct journal_dev {
+ struct dm_dev *dev;
+ struct md_rdev rdev;
+ } journal_dev;
+
struct raid_dev dev[0];
};
@@ -306,6 +323,7 @@ static struct arg_name_flag {
{ CTR_FLAG_DATA_OFFSET, "data_offset"},
{ CTR_FLAG_DELTA_DISKS, "delta_disks"},
{ CTR_FLAG_RAID10_USE_NEAR_SETS, "raid10_use_near_sets"},
+ { CTR_FLAG_JOURNAL_DEV, "journal_dev" },
};
/* Return argument name string for given @flag */
@@ -370,7 +388,7 @@ static bool rs_is_reshapable(struct raid_set *rs)
/* Return true, if raid set in @rs is recovering */
static bool rs_is_recovering(struct raid_set *rs)
{
- return rs->md.recovery_cp < rs->dev[0].rdev.sectors;
+ return rs->md.recovery_cp < rs->md.dev_sectors;
}
/* Return true, if raid set in @rs is reshaping */
@@ -627,7 +645,8 @@ static void rs_set_capacity(struct raid_set *rs)
* is unintended in case of out-of-place reshaping
*/
rdev_for_each(rdev, mddev)
- rdev->sectors = mddev->dev_sectors;
+ if (!test_bit(Journal, &rdev->flags))
+ rdev->sectors = mddev->dev_sectors;
set_capacity(gendisk, mddev->array_sectors);
revalidate_disk(gendisk);
@@ -713,6 +732,11 @@ static void raid_set_free(struct raid_set *rs)
{
int i;
+ if (rs->journal_dev.dev) {
+ md_rdev_clear(&rs->journal_dev.rdev);
+ dm_put_device(rs->ti, rs->journal_dev.dev);
+ }
+
for (i = 0; i < rs->raid_disks; i++) {
if (rs->dev[i].meta_dev)
dm_put_device(rs->ti, rs->dev[i].meta_dev);
@@ -760,10 +784,11 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
rs->dev[i].data_dev = NULL;
/*
- * There are no offsets, since there is a separate device
- * for data and metadata.
+ * There are no offsets initially.
+ * Out of place reshape will set them accordingly.
*/
rs->dev[i].rdev.data_offset = 0;
+ rs->dev[i].rdev.new_data_offset = 0;
rs->dev[i].rdev.mddev = &rs->md;
arg = dm_shift_arg(as);
@@ -821,6 +846,9 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as)
rebuild++;
}
+ if (rs->journal_dev.dev)
+ list_add_tail(&rs->journal_dev.rdev.same_set, &rs->md.disks);
+
if (metadata_available) {
rs->md.external = 0;
rs->md.persistent = 1;
@@ -1026,6 +1054,8 @@ too_many:
* [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
* [stripe_cache <sectors>] Stripe cache size for higher RAIDs
* [region_size <sectors>] Defines granularity of bitmap
+ * [journal_dev <dev>] raid4/5/6 journaling device
+ * (i.e. write hole closing log)
*
* RAID10-only options:
* [raid10_copies <# copies>] Number of copies. (Default: 2)
@@ -1133,7 +1163,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
/*
* Parameters that take a string value are checked here.
*/
-
+ /* "raid10_format {near|offset|far}" */
if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_RAID10_FORMAT))) {
if (test_and_set_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags)) {
rs->ti->error = "Only one 'raid10_format' argument pair allowed";
@@ -1151,6 +1181,41 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as,
continue;
}
+ /* "journal_dev dev" */
+ if (!strcasecmp(key, dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV))) {
+ int r;
+ struct md_rdev *jdev;
+
+ if (test_and_set_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
+ rs->ti->error = "Only one raid4/5/6 set journaling device allowed";
+ return -EINVAL;
+ }
+ if (!rt_is_raid456(rt)) {
+ rs->ti->error = "'journal_dev' is an invalid parameter for this RAID type";
+ return -EINVAL;
+ }
+ r = dm_get_device(rs->ti, arg, dm_table_get_mode(rs->ti->table),
+ &rs->journal_dev.dev);
+ if (r) {
+ rs->ti->error = "raid4/5/6 journal device lookup failure";
+ return r;
+ }
+ jdev = &rs->journal_dev.rdev;
+ md_rdev_init(jdev);
+ jdev->mddev = &rs->md;
+ jdev->bdev = rs->journal_dev.dev->bdev;
+ jdev->sectors = to_sector(i_size_read(jdev->bdev->bd_inode));
+ if (jdev->sectors < MIN_RAID456_JOURNAL_SPACE) {
+ rs->ti->error = "No space for raid4/5/6 journal";
+ return -ENOSPC;
+ }
+ set_bit(Journal, &jdev->flags);
+ continue;
+ }
+
+ /*
+ * Parameters with number values from here on.
+ */
if (kstrtoint(arg, 10, &value) < 0) {
rs->ti->error = "Bad numerical argument given in raid params";
return -EINVAL;
@@ -1425,6 +1490,25 @@ static unsigned int rs_data_stripes(struct raid_set *rs)
return rs->raid_disks - rs->raid_type->parity_devs;
}
+/*
+ * Retrieve rdev->sectors from any valid raid device of @rs
+ * to allow userspace to pass in arbitrary "- -" device tuples.
+ */
+static sector_t __rdev_sectors(struct raid_set *rs)
+{
+ int i;
+
+ for (i = 0; i < rs->md.raid_disks; i++) {
+ struct md_rdev *rdev = &rs->dev[i].rdev;
+
+ if (!test_bit(Journal, &rdev->flags) &&
+ rdev->bdev && rdev->sectors)
+ return rdev->sectors;
+ }
+
+ BUG(); /* Constructor ensures we got some. */
+}
+
/* Calculate the sectors per device and per array used for @rs */
static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
{
@@ -1468,7 +1552,8 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev)
array_sectors = (data_stripes + delta_disks) * dev_sectors;
rdev_for_each(rdev, mddev)
- rdev->sectors = dev_sectors;
+ if (!test_bit(Journal, &rdev->flags))
+ rdev->sectors = dev_sectors;
mddev->array_sectors = array_sectors;
mddev->dev_sectors = dev_sectors;
@@ -1510,9 +1595,9 @@ static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors)
else if (dev_sectors == MaxSector)
/* Prevent recovery */
__rs_setup_recovery(rs, MaxSector);
- else if (rs->dev[0].rdev.sectors < dev_sectors)
+ else if (__rdev_sectors(rs) < dev_sectors)
/* Grown raid set */
- __rs_setup_recovery(rs, rs->dev[0].rdev.sectors);
+ __rs_setup_recovery(rs, __rdev_sectors(rs));
else
__rs_setup_recovery(rs, MaxSector);
}
@@ -1851,18 +1936,21 @@ static int rs_check_reshape(struct raid_set *rs)
return -EPERM;
}
-static int read_disk_sb(struct md_rdev *rdev, int size)
+static int read_disk_sb(struct md_rdev *rdev, int size, bool force_reload)
{
BUG_ON(!rdev->sb_page);
- if (rdev->sb_loaded)
+ if (rdev->sb_loaded && !force_reload)
return 0;
+ rdev->sb_loaded = 0;
+
if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) {
DMERR("Failed to read superblock of device at position %d",
rdev->raid_disk);
md_error(rdev->mddev, rdev);
- return -EINVAL;
+ set_bit(Faulty, &rdev->flags);
+ return -EIO;
}
rdev->sb_loaded = 1;
@@ -1990,7 +2078,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
return -EINVAL;
}
- r = read_disk_sb(rdev, rdev->sb_size);
+ r = read_disk_sb(rdev, rdev->sb_size, false);
if (r)
return r;
@@ -2146,6 +2234,9 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
*/
d = 0;
rdev_for_each(r, mddev) {
+ if (test_bit(Journal, &rdev->flags))
+ continue;
+
if (test_bit(FirstUse, &r->flags))
new_devs++;
@@ -2201,7 +2292,8 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev)
*/
sb_retrieve_failed_devices(sb, failed_devices);
rdev_for_each(r, mddev) {
- if (!r->sb_page)
+ if (test_bit(Journal, &rdev->flags) ||
+ !r->sb_page)
continue;
sb2 = page_address(r->sb_page);
sb2->failed_devices = 0;
@@ -2253,7 +2345,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
struct mddev *mddev = &rs->md;
struct dm_raid_superblock *sb;
- if (rs_is_raid0(rs) || !rdev->sb_page)
+ if (rs_is_raid0(rs) || !rdev->sb_page || rdev->raid_disk < 0)
return 0;
sb = page_address(rdev->sb_page);
@@ -2278,7 +2370,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
/* Enable bitmap creation for RAID levels != 0 */
mddev->bitmap_info.offset = rt_is_raid0(rs->raid_type) ? 0 : to_sector(4096);
- rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
+ mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
if (!test_and_clear_bit(FirstUse, &rdev->flags)) {
/* Retrieve device size stored in superblock to be prepared for shrink */
@@ -2316,21 +2408,22 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
{
int r;
- struct raid_dev *dev;
- struct md_rdev *rdev, *tmp, *freshest;
+ struct md_rdev *rdev, *freshest;
struct mddev *mddev = &rs->md;
freshest = NULL;
- rdev_for_each_safe(rdev, tmp, mddev) {
+ rdev_for_each(rdev, mddev) {
+ if (test_bit(Journal, &rdev->flags))
+ continue;
+
/*
* Skipping super_load due to CTR_FLAG_SYNC will cause
* the array to undergo initialization again as
* though it were new. This is the intended effect
* of the "sync" directive.
*
- * When reshaping capability is added, we must ensure
- * that the "sync" directive is disallowed during the
- * reshape.
+ * With reshaping capability added, we must ensure that
+ * the "sync" directive is disallowed during the reshape.
*/
if (test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags))
continue;
@@ -2347,6 +2440,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
case 0:
break;
default:
+ /* This is a failure to read the superblock from the metadata device. */
/*
* We have to keep any raid0 data/metadata device pairs or
* the MD raid0 personality will fail to start the array.
@@ -2354,33 +2448,16 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
if (rs_is_raid0(rs))
continue;
- dev = container_of(rdev, struct raid_dev, rdev);
- if (dev->meta_dev)
- dm_put_device(ti, dev->meta_dev);
-
- dev->meta_dev = NULL;
- rdev->meta_bdev = NULL;
-
- if (rdev->sb_page)
- put_page(rdev->sb_page);
-
- rdev->sb_page = NULL;
-
- rdev->sb_loaded = 0;
-
/*
- * We might be able to salvage the data device
- * even though the meta device has failed. For
- * now, we behave as though '- -' had been
- * set for this device in the table.
+ * We keep the dm_devs to be able to emit the device tuple
+ * properly on the table line in raid_status() (rather than
+ * mistakenly acting as if '- -' got passed into the constructor).
+ *
+ * The rdev has to stay on the same_set list to allow for
+ * the attempt to restore faulty devices on second resume.
*/
- if (dev->data_dev)
- dm_put_device(ti, dev->data_dev);
-
- dev->data_dev = NULL;
- rdev->bdev = NULL;
-
- list_del(&rdev->same_set);
+ rdev->raid_disk = rdev->saved_raid_disk = -1;
+ break;
}
}
@@ -2401,7 +2478,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
return -EINVAL;
rdev_for_each(rdev, mddev)
- if ((rdev != freshest) && super_validate(rs, rdev))
+ if (!test_bit(Journal, &rdev->flags) &&
+ rdev != freshest &&
+ super_validate(rs, rdev))
return -EINVAL;
return 0;
}
@@ -2488,10 +2567,12 @@ static int rs_adjust_data_offsets(struct raid_set *rs)
return -ENOSPC;
}
out:
- /* Adjust data offsets on all rdevs */
+ /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */
rdev_for_each(rdev, &rs->md) {
- rdev->data_offset = data_offset;
- rdev->new_data_offset = new_data_offset;
+ if (!test_bit(Journal, &rdev->flags)) {
+ rdev->data_offset = data_offset;
+ rdev->new_data_offset = new_data_offset;
+ }
}
return 0;
@@ -2504,8 +2585,10 @@ static void __reorder_raid_disk_indexes(struct raid_set *rs)
struct md_rdev *rdev;
rdev_for_each(rdev, &rs->md) {
- rdev->raid_disk = i++;
- rdev->saved_raid_disk = rdev->new_raid_disk = -1;
+ if (!test_bit(Journal, &rdev->flags)) {
+ rdev->raid_disk = i++;
+ rdev->saved_raid_disk = rdev->new_raid_disk = -1;
+ }
}
}
@@ -2845,7 +2928,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (r)
goto bad;
- calculated_dev_sectors = rs->dev[0].rdev.sectors;
+ calculated_dev_sectors = rs->md.dev_sectors;
/*
* Backup any new raid set level, layout, ...
@@ -2858,7 +2941,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (r)
goto bad;
- resize = calculated_dev_sectors != rs->dev[0].rdev.sectors;
+ resize = calculated_dev_sectors != __rdev_sectors(rs);
INIT_WORK(&rs->md.event_work, do_table_event);
ti->private = rs;
@@ -2902,6 +2985,13 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
+ /* We can't takeover a journaled raid4/5/6 */
+ if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
+ ti->error = "Can't takeover a journaled raid4/5/6 set";
+ r = -EPERM;
+ goto bad;
+ }
+
/*
* If a takeover is needed, userspace sets any additional
* devices to rebuild and we can check for a valid request here.
@@ -2924,6 +3014,18 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
rs_set_new(rs);
} else if (rs_reshape_requested(rs)) {
/*
+ * No need to check for 'ongoing' takeover here, because takeover
+ * is an instant operation as opposed to an ongoing reshape.
+ */
+
+ /* We can't reshape a journaled raid4/5/6 */
+ if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags)) {
+ ti->error = "Can't reshape a journaled raid4/5/6 set";
+ r = -EPERM;
+ goto bad;
+ }
+
+ /*
* We can only prepare for a reshape here, because the
* raid set needs to run to provide the repective reshape
* check functions via its MD personality instance.
@@ -3071,18 +3173,23 @@ static const char *decipher_sync_action(struct mddev *mddev)
}
/*
- * Return status string @rdev
+ * Return status string for @rdev
*
* Status characters:
*
- * 'D' = Dead/Failed device
+ * 'D' = Dead/Failed raid set component or raid4/5/6 journal device
* 'a' = Alive but not in-sync
- * 'A' = Alive and in-sync
+ * 'A' = Alive and in-sync raid set component or alive raid4/5/6 journal device
+ * '-' = Non-existing device (i.e. userspace passed '- -' into the ctr)
*/
static const char *__raid_dev_status(struct md_rdev *rdev, bool array_in_sync)
{
- if (test_bit(Faulty, &rdev->flags))
+ if (!rdev->bdev)
+ return "-";
+ else if (test_bit(Faulty, &rdev->flags))
return "D";
+ else if (test_bit(Journal, &rdev->flags))
+ return "A";
else if (!array_in_sync || !test_bit(In_sync, &rdev->flags))
return "a";
else
@@ -3151,7 +3258,8 @@ static sector_t rs_get_progress(struct raid_set *rs,
* being initialized.
*/
rdev_for_each(rdev, mddev)
- if (!test_bit(In_sync, &rdev->flags))
+ if (!test_bit(Journal, &rdev->flags) &&
+ !test_bit(In_sync, &rdev->flags))
*array_in_sync = true;
#if 0
r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */
@@ -3183,7 +3291,6 @@ static void raid_status(struct dm_target *ti, status_type_t type,
sector_t progress, resync_max_sectors, resync_mismatches;
const char *sync_action;
struct raid_type *rt;
- struct md_rdev *rdev;
switch (type) {
case STATUSTYPE_INFO:
@@ -3204,9 +3311,9 @@ static void raid_status(struct dm_target *ti, status_type_t type,
atomic64_read(&mddev->resync_mismatches) : 0;
sync_action = decipher_sync_action(&rs->md);
- /* HM FIXME: do we want another state char for raid0? It shows 'D' or 'A' now */
- rdev_for_each(rdev, mddev)
- DMEMIT(__raid_dev_status(rdev, array_in_sync));
+ /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */
+ for (i = 0; i < rs->raid_disks; i++)
+ DMEMIT(__raid_dev_status(&rs->dev[i].rdev, array_in_sync));
/*
* In-sync/Reshape ratio:
@@ -3252,6 +3359,12 @@ static void raid_status(struct dm_target *ti, status_type_t type,
* so retrieving it from the first raid disk is sufficient.
*/
DMEMIT(" %llu", (unsigned long long) rs->dev[0].rdev.data_offset);
+
+ /*
+ * v1.10.0+:
+ */
+ DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ?
+ __raid_dev_status(&rs->journal_dev.rdev, 0) : "-");
break;
case STATUSTYPE_TABLE:
@@ -3265,7 +3378,8 @@ static void raid_status(struct dm_target *ti, status_type_t type,
raid_param_cnt += rebuild_disks * 2 +
write_mostly_params +
hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) +
- hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2;
+ hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 +
+ (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0);
/* Emit table line */
DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors);
if (test_bit(__CTR_FLAG_RAID10_FORMAT, &rs->ctr_flags))
@@ -3312,6 +3426,9 @@ static void raid_status(struct dm_target *ti, status_type_t type,
if (test_bit(__CTR_FLAG_MIN_RECOVERY_RATE, &rs->ctr_flags))
DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MIN_RECOVERY_RATE),
mddev->sync_speed_min);
+ if (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags))
+ DMEMIT(" %s %s", dm_raid_arg_name_by_flag(CTR_FLAG_JOURNAL_DEV),
+ __get_dev_name(rs->journal_dev.dev));
DMEMIT(" %d", rs->raid_disks);
for (i = 0; i < rs->raid_disks; i++)
DMEMIT(" %s %s", __get_dev_name(rs->dev[i].meta_dev),
@@ -3347,10 +3464,11 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv)
else {
if (!strcasecmp(argv[0], "check"))
set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
- else if (!!strcasecmp(argv[0], "repair"))
+ else if (!strcasecmp(argv[0], "repair")) {
+ set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ } else
return -EINVAL;
- set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
- set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
}
if (mddev->ro == 2) {
/* A write to sync_action is enough to justify
@@ -3427,11 +3545,14 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices));
- for (i = 0; i < rs->md.raid_disks; i++) {
+ for (i = 0; i < mddev->raid_disks; i++) {
r = &rs->dev[i].rdev;
- if (test_bit(Faulty, &r->flags) && r->sb_page &&
- sync_page_io(r, 0, r->sb_size, r->sb_page,
- REQ_OP_READ, 0, true)) {
+ /* HM FIXME: enhance journal device recovery processing */
+ if (test_bit(Journal, &r->flags))
+ continue;
+
+ if (test_bit(Faulty, &r->flags) &&
+ r->meta_bdev && !read_disk_sb(r, r->sb_size, true)) {
DMINFO("Faulty %s device #%d has readable super block."
" Attempting to revive it.",
rs->raid_type->name, i);
@@ -3445,22 +3566,26 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
* '>= 0' - meaning we must call this function
* ourselves.
*/
- if ((r->raid_disk >= 0) &&
- (mddev->pers->hot_remove_disk(mddev, r) != 0))
- /* Failed to revive this device, try next */
- continue;
-
- r->raid_disk = i;
- r->saved_raid_disk = i;
flags = r->flags;
+ clear_bit(In_sync, &r->flags); /* Mandatory for hot remove. */
+ if (r->raid_disk >= 0) {
+ if (mddev->pers->hot_remove_disk(mddev, r)) {
+ /* Failed to revive this device, try next */
+ r->flags = flags;
+ continue;
+ }
+ } else
+ r->raid_disk = r->saved_raid_disk = i;
+
clear_bit(Faulty, &r->flags);
clear_bit(WriteErrorSeen, &r->flags);
- clear_bit(In_sync, &r->flags);
+
if (mddev->pers->hot_add_disk(mddev, r)) {
- r->raid_disk = -1;
- r->saved_raid_disk = -1;
+ /* Failed to revive this device, try next */
+ r->raid_disk = r->saved_raid_disk = -1;
r->flags = flags;
} else {
+ clear_bit(In_sync, &r->flags);
r->recovery_offset = 0;
set_bit(i, (void *) cleared_failed_devices);
cleared = true;
@@ -3473,6 +3598,9 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
uint64_t failed_devices[DISKS_ARRAY_ELEMS];
rdev_for_each(r, &rs->md) {
+ if (test_bit(Journal, &r->flags))
+ continue;
+
sb = page_address(r->sb_page);
sb_retrieve_failed_devices(sb, failed_devices);
@@ -3651,7 +3779,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = {
.name = "raid",
- .version = {1, 9, 1},
+ .version = {1, 10, 0},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index 6c25213ab38c..bdbb7e6e8212 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -17,8 +17,8 @@
#include <linux/module.h>
#define DM_MSG_PREFIX "multipath round-robin"
-#define RR_MIN_IO 1000
-#define RR_VERSION "1.1.0"
+#define RR_MIN_IO 1
+#define RR_VERSION "1.2.0"
/*-----------------------------------------------------------------
* Path-handling code, paths are held in lists
@@ -47,44 +47,19 @@ struct selector {
struct list_head valid_paths;
struct list_head invalid_paths;
spinlock_t lock;
- struct dm_path * __percpu *current_path;
- struct percpu_counter repeat_count;
};
-static void set_percpu_current_path(struct selector *s, struct dm_path *path)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- *per_cpu_ptr(s->current_path, cpu) = path;
-}
-
static struct selector *alloc_selector(void)
{
struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
- if (!s)
- return NULL;
-
- INIT_LIST_HEAD(&s->valid_paths);
- INIT_LIST_HEAD(&s->invalid_paths);
- spin_lock_init(&s->lock);
-
- s->current_path = alloc_percpu(struct dm_path *);
- if (!s->current_path)
- goto out_current_path;
- set_percpu_current_path(s, NULL);
-
- if (percpu_counter_init(&s->repeat_count, 0, GFP_KERNEL))
- goto out_repeat_count;
+ if (s) {
+ INIT_LIST_HEAD(&s->valid_paths);
+ INIT_LIST_HEAD(&s->invalid_paths);
+ spin_lock_init(&s->lock);
+ }
return s;
-
-out_repeat_count:
- free_percpu(s->current_path);
-out_current_path:
- kfree(s);
- return NULL;;
}
static int rr_create(struct path_selector *ps, unsigned argc, char **argv)
@@ -105,8 +80,6 @@ static void rr_destroy(struct path_selector *ps)
free_paths(&s->valid_paths);
free_paths(&s->invalid_paths);
- free_percpu(s->current_path);
- percpu_counter_destroy(&s->repeat_count);
kfree(s);
ps->context = NULL;
}
@@ -157,6 +130,11 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
return -EINVAL;
}
+ if (repeat_count > 1) {
+ DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead");
+ repeat_count = 1;
+ }
+
/* allocate the path */
pi = kmalloc(sizeof(*pi), GFP_KERNEL);
if (!pi) {
@@ -183,9 +161,6 @@ static void rr_fail_path(struct path_selector *ps, struct dm_path *p)
struct path_info *pi = p->pscontext;
spin_lock_irqsave(&s->lock, flags);
- if (p == *this_cpu_ptr(s->current_path))
- set_percpu_current_path(s, NULL);
-
list_move(&pi->list, &s->invalid_paths);
spin_unlock_irqrestore(&s->lock, flags);
}
@@ -208,29 +183,15 @@ static struct dm_path *rr_select_path(struct path_selector *ps, size_t nr_bytes)
unsigned long flags;
struct selector *s = ps->context;
struct path_info *pi = NULL;
- struct dm_path *current_path = NULL;
-
- local_irq_save(flags);
- current_path = *this_cpu_ptr(s->current_path);
- if (current_path) {
- percpu_counter_dec(&s->repeat_count);
- if (percpu_counter_read_positive(&s->repeat_count) > 0) {
- local_irq_restore(flags);
- return current_path;
- }
- }
- spin_lock(&s->lock);
+ spin_lock_irqsave(&s->lock, flags);
if (!list_empty(&s->valid_paths)) {
pi = list_entry(s->valid_paths.next, struct path_info, list);
list_move_tail(&pi->list, &s->valid_paths);
- percpu_counter_set(&s->repeat_count, pi->repeat_count);
- set_percpu_current_path(s, pi->path);
- current_path = pi->path;
}
spin_unlock_irqrestore(&s->lock, flags);
- return current_path;
+ return pi ? pi->path : NULL;
}
static struct path_selector_type rr_ps = {
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 38b05f23b96c..0250e7e521ab 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -175,6 +175,7 @@ static void dm_stat_free(struct rcu_head *head)
int cpu;
struct dm_stat *s = container_of(head, struct dm_stat, rcu_head);
+ kfree(s->histogram_boundaries);
kfree(s->program_id);
kfree(s->aux_data);
for_each_possible_cpu(cpu) {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5bd9ab06a562..9f37d7fc2786 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -974,10 +974,61 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
}
EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
+/*
+ * Flush current->bio_list when the target map method blocks.
+ * This fixes deadlocks in snapshot and possibly in other targets.
+ */
+struct dm_offload {
+ struct blk_plug plug;
+ struct blk_plug_cb cb;
+};
+
+static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
+{
+ struct dm_offload *o = container_of(cb, struct dm_offload, cb);
+ struct bio_list list;
+ struct bio *bio;
+
+ INIT_LIST_HEAD(&o->cb.list);
+
+ if (unlikely(!current->bio_list))
+ return;
+
+ list = *current->bio_list;
+ bio_list_init(current->bio_list);
+
+ while ((bio = bio_list_pop(&list))) {
+ struct bio_set *bs = bio->bi_pool;
+ if (unlikely(!bs) || bs == fs_bio_set) {
+ bio_list_add(current->bio_list, bio);
+ continue;
+ }
+
+ spin_lock(&bs->rescue_lock);
+ bio_list_add(&bs->rescue_list, bio);
+ queue_work(bs->rescue_workqueue, &bs->rescue_work);
+ spin_unlock(&bs->rescue_lock);
+ }
+}
+
+static void dm_offload_start(struct dm_offload *o)
+{
+ blk_start_plug(&o->plug);
+ o->cb.callback = flush_current_bio_list;
+ list_add(&o->cb.list, &current->plug->cb_list);
+}
+
+static void dm_offload_end(struct dm_offload *o)
+{
+ list_del(&o->cb.list);
+ blk_finish_plug(&o->plug);
+}
+
static void __map_bio(struct dm_target_io *tio)
{
int r;
sector_t sector;
+ struct dm_offload o;
struct bio *clone = &tio->clone;
struct dm_target *ti = tio->ti;
@@ -990,7 +1041,11 @@ static void __map_bio(struct dm_target_io *tio)
*/
atomic_inc(&tio->io->io_count);
sector = clone->bi_iter.bi_sector;
+
+ dm_offload_start(&o);
r = ti->type->map(ti, clone);
+ dm_offload_end(&o);
+
if (r == DM_MAPIO_REMAPPED) {
/* the bio has been remapped so dispatch it */
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index 7938cd21fa4c..185dc60360b5 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -976,6 +976,27 @@ int dm_array_cursor_next(struct dm_array_cursor *c)
}
EXPORT_SYMBOL_GPL(dm_array_cursor_next);
+int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count)
+{
+ int r;
+
+ do {
+ uint32_t remaining = le32_to_cpu(c->ab->nr_entries) - c->index;
+
+ if (count < remaining) {
+ c->index += count;
+ return 0;
+ }
+
+ count -= remaining;
+ r = dm_array_cursor_next(c);
+
+ } while (!r);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_array_cursor_skip);
+
void dm_array_cursor_get_value(struct dm_array_cursor *c, void **value_le)
{
*value_le = element_at(c->info, c->ab, c->index);
diff --git a/drivers/md/persistent-data/dm-array.h b/drivers/md/persistent-data/dm-array.h
index 27ee49a55473..d7d2d579c662 100644
--- a/drivers/md/persistent-data/dm-array.h
+++ b/drivers/md/persistent-data/dm-array.h
@@ -207,6 +207,7 @@ void dm_array_cursor_end(struct dm_array_cursor *c);
uint32_t dm_array_cursor_index(struct dm_array_cursor *c);
int dm_array_cursor_next(struct dm_array_cursor *c);
+int dm_array_cursor_skip(struct dm_array_cursor *c, uint32_t count);
/*
* value_le is only valid while the cursor points at the current value.
diff --git a/drivers/md/persistent-data/dm-bitset.c b/drivers/md/persistent-data/dm-bitset.c
index 36f7cc2c7109..b7208d82e748 100644
--- a/drivers/md/persistent-data/dm-bitset.c
+++ b/drivers/md/persistent-data/dm-bitset.c
@@ -39,6 +39,48 @@ int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *root)
}
EXPORT_SYMBOL_GPL(dm_bitset_empty);
+struct packer_context {
+ bit_value_fn fn;
+ unsigned nr_bits;
+ void *context;
+};
+
+static int pack_bits(uint32_t index, void *value, void *context)
+{
+ int r;
+ struct packer_context *p = context;
+ unsigned bit, nr = min(64u, p->nr_bits - (index * 64));
+ uint64_t word = 0;
+ bool bv;
+
+ for (bit = 0; bit < nr; bit++) {
+ r = p->fn(index * 64 + bit, &bv, p->context);
+ if (r)
+ return r;
+
+ if (bv)
+ set_bit(bit, (unsigned long *) &word);
+ else
+ clear_bit(bit, (unsigned long *) &word);
+ }
+
+ *((__le64 *) value) = cpu_to_le64(word);
+
+ return 0;
+}
+
+int dm_bitset_new(struct dm_disk_bitset *info, dm_block_t *root,
+ uint32_t size, bit_value_fn fn, void *context)
+{
+ struct packer_context p;
+ p.fn = fn;
+ p.nr_bits = size;
+ p.context = context;
+
+ return dm_array_new(&info->array_info, root, dm_div_up(size, 64), pack_bits, &p);
+}
+EXPORT_SYMBOL_GPL(dm_bitset_new);
+
int dm_bitset_resize(struct dm_disk_bitset *info, dm_block_t root,
uint32_t old_nr_entries, uint32_t new_nr_entries,
bool default_value, dm_block_t *new_root)
@@ -168,4 +210,108 @@ int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root,
}
EXPORT_SYMBOL_GPL(dm_bitset_test_bit);
+static int cursor_next_array_entry(struct dm_bitset_cursor *c)
+{
+ int r;
+ __le64 *value;
+
+ r = dm_array_cursor_next(&c->cursor);
+ if (r)
+ return r;
+
+ dm_array_cursor_get_value(&c->cursor, (void **) &value);
+ c->array_index++;
+ c->bit_index = 0;
+ c->current_bits = le64_to_cpu(*value);
+ return 0;
+}
+
+int dm_bitset_cursor_begin(struct dm_disk_bitset *info,
+ dm_block_t root, uint32_t nr_entries,
+ struct dm_bitset_cursor *c)
+{
+ int r;
+ __le64 *value;
+
+ if (!nr_entries)
+ return -ENODATA;
+
+ c->info = info;
+ c->entries_remaining = nr_entries;
+
+ r = dm_array_cursor_begin(&info->array_info, root, &c->cursor);
+ if (r)
+ return r;
+
+ dm_array_cursor_get_value(&c->cursor, (void **) &value);
+ c->array_index = 0;
+ c->bit_index = 0;
+ c->current_bits = le64_to_cpu(*value);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_cursor_begin);
+
+void dm_bitset_cursor_end(struct dm_bitset_cursor *c)
+{
+ return dm_array_cursor_end(&c->cursor);
+}
+EXPORT_SYMBOL_GPL(dm_bitset_cursor_end);
+
+int dm_bitset_cursor_next(struct dm_bitset_cursor *c)
+{
+ int r = 0;
+
+ if (!c->entries_remaining)
+ return -ENODATA;
+
+ c->entries_remaining--;
+ if (++c->bit_index > 63)
+ r = cursor_next_array_entry(c);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_cursor_next);
+
+int dm_bitset_cursor_skip(struct dm_bitset_cursor *c, uint32_t count)
+{
+ int r;
+ __le64 *value;
+ uint32_t nr_array_skip;
+ uint32_t remaining_in_word = 64 - c->bit_index;
+
+ if (c->entries_remaining < count)
+ return -ENODATA;
+
+ if (count < remaining_in_word) {
+ c->bit_index += count;
+ c->entries_remaining -= count;
+ return 0;
+
+ } else {
+ c->entries_remaining -= remaining_in_word;
+ count -= remaining_in_word;
+ }
+
+ nr_array_skip = (count / 64) + 1;
+ r = dm_array_cursor_skip(&c->cursor, nr_array_skip);
+ if (r)
+ return r;
+
+ dm_array_cursor_get_value(&c->cursor, (void **) &value);
+ c->entries_remaining -= count;
+ c->array_index += nr_array_skip;
+ c->bit_index = count & 63;
+ c->current_bits = le64_to_cpu(*value);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dm_bitset_cursor_skip);
+
+bool dm_bitset_cursor_get_value(struct dm_bitset_cursor *c)
+{
+ return test_bit(c->bit_index, (unsigned long *) &c->current_bits);
+}
+EXPORT_SYMBOL_GPL(dm_bitset_cursor_get_value);
+
/*----------------------------------------------------------------*/
diff --git a/drivers/md/persistent-data/dm-bitset.h b/drivers/md/persistent-data/dm-bitset.h
index c2287d672ef5..df888da04ee1 100644
--- a/drivers/md/persistent-data/dm-bitset.h
+++ b/drivers/md/persistent-data/dm-bitset.h
@@ -93,6 +93,22 @@ void dm_disk_bitset_init(struct dm_transaction_manager *tm,
int dm_bitset_empty(struct dm_disk_bitset *info, dm_block_t *new_root);
/*
+ * Creates a new bitset populated with values provided by a callback
+ * function. This is more efficient than creating an empty bitset,
+ * resizing, and then setting values since that process incurs a lot of
+ * copying.
+ *
+ * info - describes the array
+ * root - the root block of the array on disk
+ * size - the number of bits in the bitset
+ * fn - the callback
+ * context - passed to the callback
+ */
+typedef int (*bit_value_fn)(uint32_t index, bool *value, void *context);
+int dm_bitset_new(struct dm_disk_bitset *info, dm_block_t *root,
+ uint32_t size, bit_value_fn fn, void *context);
+
+/*
* Resize the bitset.
*
* info - describes the bitset
@@ -161,6 +177,29 @@ int dm_bitset_test_bit(struct dm_disk_bitset *info, dm_block_t root,
int dm_bitset_flush(struct dm_disk_bitset *info, dm_block_t root,
dm_block_t *new_root);
+struct dm_bitset_cursor {
+ struct dm_disk_bitset *info;
+ struct dm_array_cursor cursor;
+
+ uint32_t entries_remaining;
+ uint32_t array_index;
+ uint32_t bit_index;
+ uint64_t current_bits;
+};
+
+/*
+ * Make sure you've flushed any dm_disk_bitset and updated the root before
+ * using this.
+ */
+int dm_bitset_cursor_begin(struct dm_disk_bitset *info,
+ dm_block_t root, uint32_t nr_entries,
+ struct dm_bitset_cursor *c);
+void dm_bitset_cursor_end(struct dm_bitset_cursor *c);
+
+int dm_bitset_cursor_next(struct dm_bitset_cursor *c);
+int dm_bitset_cursor_skip(struct dm_bitset_cursor *c, uint32_t count);
+bool dm_bitset_cursor_get_value(struct dm_bitset_cursor *c);
+
/*----------------------------------------------------------------*/
#endif /* _LINUX_DM_BITSET_H */
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 758d90cc2733..0863905dee02 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -462,7 +462,7 @@ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
int r;
p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
- if (IS_ERR(p))
+ if (unlikely(IS_ERR(p)))
return PTR_ERR(p);
aux = dm_bufio_get_aux_data(to_buffer(*result));
@@ -498,7 +498,7 @@ int dm_bm_write_lock(struct dm_block_manager *bm,
return -EPERM;
p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result);
- if (IS_ERR(p))
+ if (unlikely(IS_ERR(p)))
return PTR_ERR(p);
aux = dm_bufio_get_aux_data(to_buffer(*result));
@@ -531,7 +531,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm,
int r;
p = dm_bufio_get(bm->bufio, b, (struct dm_buffer **) result);
- if (IS_ERR(p))
+ if (unlikely(IS_ERR(p)))
return PTR_ERR(p);
if (unlikely(!p))
return -EWOULDBLOCK;
@@ -567,7 +567,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm,
return -EPERM;
p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result);
- if (IS_ERR(p))
+ if (unlikely(IS_ERR(p)))
return PTR_ERR(p);
memset(p, 0, dm_bm_block_size(bm));
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 20a40329d84a..02e2ee0d8a00 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -272,7 +272,12 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
int r;
struct del_stack *s;
- s = kmalloc(sizeof(*s), GFP_NOIO);
+ /*
+ * dm_btree_del() is called via an ioctl, as such should be
+ * considered an FS op. We can't recurse back into the FS, so we
+ * allocate GFP_NOFS.
+ */
+ s = kmalloc(sizeof(*s), GFP_NOFS);
if (!s)
return -ENOMEM;
s->info = info;
@@ -1139,6 +1144,17 @@ int dm_btree_cursor_next(struct dm_btree_cursor *c)
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_next);
+int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count)
+{
+ int r = 0;
+
+ while (count-- && !r)
+ r = dm_btree_cursor_next(c);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_btree_cursor_skip);
+
int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
{
if (c->depth) {
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
index db9bd26adf31..3dc5bb1a4748 100644
--- a/drivers/md/persistent-data/dm-btree.h
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -209,6 +209,7 @@ int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
bool prefetch_leaves, struct dm_btree_cursor *c);
void dm_btree_cursor_end(struct dm_btree_cursor *c);
int dm_btree_cursor_next(struct dm_btree_cursor *c);
+int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count);
int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le);
#endif /* _LINUX_DM_BTREE_H */
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index 4c28608a0c94..829b4ce057d8 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -626,13 +626,19 @@ int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
void *root_le, size_t len)
{
int r;
- struct disk_sm_root *smr = root_le;
+ struct disk_sm_root smr;
if (len < sizeof(struct disk_sm_root)) {
DMERR("sm_metadata root too small");
return -ENOMEM;
}
+ /*
+ * We don't know the alignment of the root_le buffer, so need to
+ * copy into a new structure.
+ */
+ memcpy(&smr, root_le, sizeof(smr));
+
r = sm_ll_init(ll, tm);
if (r < 0)
return r;
@@ -644,10 +650,10 @@ int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
ll->max_entries = metadata_ll_max_entries;
ll->commit = metadata_ll_commit;
- ll->nr_blocks = le64_to_cpu(smr->nr_blocks);
- ll->nr_allocated = le64_to_cpu(smr->nr_allocated);
- ll->bitmap_root = le64_to_cpu(smr->bitmap_root);
- ll->ref_count_root = le64_to_cpu(smr->ref_count_root);
+ ll->nr_blocks = le64_to_cpu(smr.nr_blocks);
+ ll->nr_allocated = le64_to_cpu(smr.nr_allocated);
+ ll->bitmap_root = le64_to_cpu(smr.bitmap_root);
+ ll->ref_count_root = le64_to_cpu(smr.ref_count_root);
return ll->open_index(ll);
}
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 20557e2c60c6..4aed69d9dd17 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -544,7 +544,7 @@ static int sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t
static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks);
-static struct dm_space_map ops = {
+static const struct dm_space_map ops = {
.destroy = sm_metadata_destroy,
.extend = sm_metadata_extend,
.get_nr_blocks = sm_metadata_get_nr_blocks,
@@ -671,7 +671,7 @@ static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where,
return -EINVAL;
}
-static struct dm_space_map bootstrap_ops = {
+static const struct dm_space_map bootstrap_ops = {
.destroy = sm_bootstrap_destroy,
.extend = sm_bootstrap_extend,
.get_nr_blocks = sm_bootstrap_get_nr_blocks,