Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bitmap.c                                 |   4
-rw-r--r--  drivers/md/dm-bufio.c                               |  26
-rw-r--r--  drivers/md/dm-cache-metadata.c                      |  68
-rw-r--r--  drivers/md/dm-cache-metadata.h                      |   2
-rw-r--r--  drivers/md/dm-cache-policy-cleaner.c                |   7
-rw-r--r--  drivers/md/dm-cache-policy-internal.h               |   2
-rw-r--r--  drivers/md/dm-cache-policy-mq.c                     |   8
-rw-r--r--  drivers/md/dm-cache-policy.c                        |   8
-rw-r--r--  drivers/md/dm-cache-policy.h                        |   6
-rw-r--r--  drivers/md/dm-cache-target.c                        | 300
-rw-r--r--  drivers/md/dm-crypt.c                               |   3
-rw-r--r--  drivers/md/dm-mpath.c                               |   1
-rw-r--r--  drivers/md/dm-raid.c                                | 111
-rw-r--r--  drivers/md/dm-raid1.c                               |   2
-rw-r--r--  drivers/md/dm-snap.c                                |   1
-rw-r--r--  drivers/md/dm-stripe.c                              |  13
-rw-r--r--  drivers/md/dm-table.c                               |   2
-rw-r--r--  drivers/md/dm-thin-metadata.c                       |  36
-rw-r--r--  drivers/md/dm-thin-metadata.h                       |   7
-rw-r--r--  drivers/md/dm-thin.c                                | 209
-rw-r--r--  drivers/md/dm-verity.c                              |  43
-rw-r--r--  drivers/md/dm.c                                     |   5
-rw-r--r--  drivers/md/faulty.c                                 |   6
-rw-r--r--  drivers/md/linear.c                                 |   3
-rw-r--r--  drivers/md/md.c                                     | 260
-rw-r--r--  drivers/md/md.h                                     |   5
-rw-r--r--  drivers/md/persistent-data/dm-btree-remove.c        |  46
-rw-r--r--  drivers/md/persistent-data/dm-space-map-disk.c      |   3
-rw-r--r--  drivers/md/persistent-data/dm-space-map-metadata.c  | 127
-rw-r--r--  drivers/md/persistent-data/dm-space-map.h           |  23
-rw-r--r--  drivers/md/raid0.c                                  |   9
-rw-r--r--  drivers/md/raid1.c                                  | 141
-rw-r--r--  drivers/md/raid10.c                                 | 102
-rw-r--r--  drivers/md/raid5.c                                  | 192
-rw-r--r--  drivers/md/raid5.h                                  |   5
35 files changed, 1201 insertions, 585 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 4fd9d6aeff6a..5a2c75499824 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -846,7 +846,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
set_bit(bit, kaddr);
else
- test_and_set_bit_le(bit, kaddr);
+ set_bit_le(bit, kaddr);
kunmap_atomic(kaddr);
pr_debug("set file bit %lu page %lu\n", bit, page->index);
/* record page number so it gets flushed to disk when unplug occurs */
@@ -868,7 +868,7 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
clear_bit(bit, paddr);
else
- test_and_clear_bit_le(bit, paddr);
+ clear_bit_le(bit, paddr);
kunmap_atomic(paddr);
if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) {
set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING);
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 3c955e10a618..0387e05cdb98 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -319,6 +319,9 @@ static void __cache_size_refresh(void)
static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
enum data_mode *data_mode)
{
+ unsigned noio_flag;
+ void *ptr;
+
if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
*data_mode = DATA_MODE_SLAB;
return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
@@ -332,7 +335,26 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
}
*data_mode = DATA_MODE_VMALLOC;
- return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
+
+ /*
+ * __vmalloc allocates the data pages and auxiliary structures with
+ * gfp_flags that were specified, but pagetables are always allocated
+ * with GFP_KERNEL, no matter what was specified as gfp_mask.
+ *
+ * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
+ * all allocations done by this process (including pagetables) are done
+ * as if GFP_NOIO was specified.
+ */
+
+ if (gfp_mask & __GFP_NORETRY)
+ noio_flag = memalloc_noio_save();
+
+ ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
+
+ if (gfp_mask & __GFP_NORETRY)
+ memalloc_noio_restore(noio_flag);
+
+ return ptr;
}
/*
@@ -1025,6 +1047,8 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
{
struct blk_plug plug;
+ BUG_ON(dm_bufio_in_request());
+
blk_start_plug(&plug);
dm_bufio_lock(c);
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index fbd3625f2748..1af7255bbffb 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -83,6 +83,8 @@ struct cache_disk_superblock {
__le32 read_misses;
__le32 write_hits;
__le32 write_misses;
+
+ __le32 policy_version[CACHE_POLICY_VERSION_SIZE];
} __packed;
struct dm_cache_metadata {
@@ -109,6 +111,7 @@ struct dm_cache_metadata {
bool clean_when_opened:1;
char policy_name[CACHE_POLICY_NAME_SIZE];
+ unsigned policy_version[CACHE_POLICY_VERSION_SIZE];
size_t policy_hint_size;
struct dm_cache_statistics stats;
};
@@ -268,7 +271,8 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC);
disk_super->version = cpu_to_le32(CACHE_VERSION);
- memset(disk_super->policy_name, 0, CACHE_POLICY_NAME_SIZE);
+ memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
+ memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version));
disk_super->policy_hint_size = 0;
r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root,
@@ -284,7 +288,6 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
disk_super->cache_blocks = cpu_to_le32(0);
- memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
disk_super->read_hits = cpu_to_le32(0);
disk_super->read_misses = cpu_to_le32(0);
@@ -478,6 +481,9 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd,
cmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks));
strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name));
+ cmd->policy_version[0] = le32_to_cpu(disk_super->policy_version[0]);
+ cmd->policy_version[1] = le32_to_cpu(disk_super->policy_version[1]);
+ cmd->policy_version[2] = le32_to_cpu(disk_super->policy_version[2]);
cmd->policy_hint_size = le32_to_cpu(disk_super->policy_hint_size);
cmd->stats.read_hits = le32_to_cpu(disk_super->read_hits);
@@ -572,6 +578,9 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks));
strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name));
+ disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]);
+ disk_super->policy_version[1] = cpu_to_le32(cmd->policy_version[1]);
+ disk_super->policy_version[2] = cpu_to_le32(cmd->policy_version[2]);
disk_super->read_hits = cpu_to_le32(cmd->stats.read_hits);
disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses);
@@ -854,18 +863,43 @@ struct thunk {
bool hints_valid;
};
+static bool policy_unchanged(struct dm_cache_metadata *cmd,
+ struct dm_cache_policy *policy)
+{
+ const char *policy_name = dm_cache_policy_get_name(policy);
+ const unsigned *policy_version = dm_cache_policy_get_version(policy);
+ size_t policy_hint_size = dm_cache_policy_get_hint_size(policy);
+
+ /*
+ * Ensure policy names match.
+ */
+ if (strncmp(cmd->policy_name, policy_name, sizeof(cmd->policy_name)))
+ return false;
+
+ /*
+ * Ensure policy major versions match.
+ */
+ if (cmd->policy_version[0] != policy_version[0])
+ return false;
+
+ /*
+ * Ensure policy hint sizes match.
+ */
+ if (cmd->policy_hint_size != policy_hint_size)
+ return false;
+
+ return true;
+}
+
static bool hints_array_initialized(struct dm_cache_metadata *cmd)
{
return cmd->hint_root && cmd->policy_hint_size;
}
static bool hints_array_available(struct dm_cache_metadata *cmd,
- const char *policy_name)
+ struct dm_cache_policy *policy)
{
- bool policy_names_match = !strncmp(cmd->policy_name, policy_name,
- sizeof(cmd->policy_name));
-
- return cmd->clean_when_opened && policy_names_match &&
+ return cmd->clean_when_opened && policy_unchanged(cmd, policy) &&
hints_array_initialized(cmd);
}
@@ -899,7 +933,8 @@ static int __load_mapping(void *context, uint64_t cblock, void *leaf)
return r;
}
-static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_name,
+static int __load_mappings(struct dm_cache_metadata *cmd,
+ struct dm_cache_policy *policy,
load_mapping_fn fn, void *context)
{
struct thunk thunk;
@@ -909,18 +944,19 @@ static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_nam
thunk.cmd = cmd;
thunk.respect_dirty_flags = cmd->clean_when_opened;
- thunk.hints_valid = hints_array_available(cmd, policy_name);
+ thunk.hints_valid = hints_array_available(cmd, policy);
return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk);
}
-int dm_cache_load_mappings(struct dm_cache_metadata *cmd, const char *policy_name,
+int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
+ struct dm_cache_policy *policy,
load_mapping_fn fn, void *context)
{
int r;
down_read(&cmd->root_lock);
- r = __load_mappings(cmd, policy_name, fn, context);
+ r = __load_mappings(cmd, policy, fn, context);
up_read(&cmd->root_lock);
return r;
@@ -979,7 +1015,7 @@ static int __dirty(struct dm_cache_metadata *cmd, dm_cblock_t cblock, bool dirty
/* nothing to be done */
return 0;
- value = pack_value(oblock, flags | (dirty ? M_DIRTY : 0));
+ value = pack_value(oblock, (flags & ~M_DIRTY) | (dirty ? M_DIRTY : 0));
__dm_bless_for_disk(&value);
r = dm_array_set_value(&cmd->info, cmd->root, from_cblock(cblock),
@@ -1008,7 +1044,7 @@ void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
struct dm_cache_statistics *stats)
{
down_read(&cmd->root_lock);
- memcpy(stats, &cmd->stats, sizeof(*stats));
+ *stats = cmd->stats;
up_read(&cmd->root_lock);
}
@@ -1016,7 +1052,7 @@ void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
struct dm_cache_statistics *stats)
{
down_write(&cmd->root_lock);
- memcpy(&cmd->stats, stats, sizeof(*stats));
+ cmd->stats = *stats;
up_write(&cmd->root_lock);
}
@@ -1070,13 +1106,15 @@ static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *po
__le32 value;
size_t hint_size;
const char *policy_name = dm_cache_policy_get_name(policy);
+ const unsigned *policy_version = dm_cache_policy_get_version(policy);
if (!policy_name[0] ||
(strlen(policy_name) > sizeof(cmd->policy_name) - 1))
return -EINVAL;
- if (strcmp(cmd->policy_name, policy_name)) {
+ if (!policy_unchanged(cmd, policy)) {
strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name));
+ memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version));
hint_size = dm_cache_policy_get_hint_size(policy);
if (!hint_size)
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index 135864ea0eee..f45cef21f3d0 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -89,7 +89,7 @@ typedef int (*load_mapping_fn)(void *context, dm_oblock_t oblock,
dm_cblock_t cblock, bool dirty,
uint32_t hint, bool hint_valid);
int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
- const char *policy_name,
+ struct dm_cache_policy *policy,
load_mapping_fn fn,
void *context);
diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c
index cc05d70b3cb8..b04d1f904d07 100644
--- a/drivers/md/dm-cache-policy-cleaner.c
+++ b/drivers/md/dm-cache-policy-cleaner.c
@@ -17,7 +17,6 @@
/*----------------------------------------------------------------*/
#define DM_MSG_PREFIX "cache cleaner"
-#define CLEANER_VERSION "1.0.0"
/* Cache entry struct. */
struct wb_cache_entry {
@@ -434,6 +433,7 @@ static struct dm_cache_policy *wb_create(dm_cblock_t cache_size,
static struct dm_cache_policy_type wb_policy_type = {
.name = "cleaner",
+ .version = {1, 0, 0},
.hint_size = 0,
.owner = THIS_MODULE,
.create = wb_create
@@ -446,7 +446,10 @@ static int __init wb_init(void)
if (r < 0)
DMERR("register failed %d", r);
else
- DMINFO("version " CLEANER_VERSION " loaded");
+ DMINFO("version %u.%u.%u loaded",
+ wb_policy_type.version[0],
+ wb_policy_type.version[1],
+ wb_policy_type.version[2]);
return r;
}
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index 52a75beeced5..0928abdc49f0 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -117,6 +117,8 @@ void dm_cache_policy_destroy(struct dm_cache_policy *p);
*/
const char *dm_cache_policy_get_name(struct dm_cache_policy *p);
+const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p);
+
size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p);
/*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 964153255076..dc112a7137fe 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -14,7 +14,6 @@
#include <linux/vmalloc.h>
#define DM_MSG_PREFIX "cache-policy-mq"
-#define MQ_VERSION "1.0.0"
static struct kmem_cache *mq_entry_cache;
@@ -1133,6 +1132,7 @@ bad_cache_alloc:
static struct dm_cache_policy_type mq_policy_type = {
.name = "mq",
+ .version = {1, 0, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = mq_create
@@ -1140,6 +1140,7 @@ static struct dm_cache_policy_type mq_policy_type = {
static struct dm_cache_policy_type default_policy_type = {
.name = "default",
+ .version = {1, 0, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = mq_create
@@ -1164,7 +1165,10 @@ static int __init mq_init(void)
r = dm_cache_policy_register(&default_policy_type);
if (!r) {
- DMINFO("version " MQ_VERSION " loaded");
+ DMINFO("version %u.%u.%u loaded",
+ mq_policy_type.version[0],
+ mq_policy_type.version[1],
+ mq_policy_type.version[2]);
return 0;
}
diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c
index 2cbf5fdaac52..21c03c570c06 100644
--- a/drivers/md/dm-cache-policy.c
+++ b/drivers/md/dm-cache-policy.c
@@ -150,6 +150,14 @@ const char *dm_cache_policy_get_name(struct dm_cache_policy *p)
}
EXPORT_SYMBOL_GPL(dm_cache_policy_get_name);
+const unsigned *dm_cache_policy_get_version(struct dm_cache_policy *p)
+{
+ struct dm_cache_policy_type *t = p->private;
+
+ return t->version;
+}
+EXPORT_SYMBOL_GPL(dm_cache_policy_get_version);
+
size_t dm_cache_policy_get_hint_size(struct dm_cache_policy *p)
{
struct dm_cache_policy_type *t = p->private;
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index f0f51b260544..33369ca9614f 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -130,8 +130,8 @@ struct dm_cache_policy {
*
* Must not block.
*
- * Returns 1 iff in cache, 0 iff not, < 0 on error (-EWOULDBLOCK
- * would be typical).
+ * Returns 0 if in cache, -ENOENT if not, < 0 for other errors
+ * (-EWOULDBLOCK would be typical).
*/
int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock);
@@ -196,6 +196,7 @@ struct dm_cache_policy {
* We maintain a little register of the different policy types.
*/
#define CACHE_POLICY_NAME_SIZE 16
+#define CACHE_POLICY_VERSION_SIZE 3
struct dm_cache_policy_type {
/* For use by the register code only. */
@@ -206,6 +207,7 @@ struct dm_cache_policy_type {
* what gets passed on the target line to select your policy.
*/
char name[CACHE_POLICY_NAME_SIZE];
+ unsigned version[CACHE_POLICY_VERSION_SIZE];
/*
* Policies may store a hint for each each cache block.
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 0f4e84b15c30..df44b60e66f2 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -6,6 +6,7 @@
#include "dm.h"
#include "dm-bio-prison.h"
+#include "dm-bio-record.h"
#include "dm-cache-metadata.h"
#include <linux/dm-io.h>
@@ -142,6 +143,7 @@ struct cache {
spinlock_t lock;
struct bio_list deferred_bios;
struct bio_list deferred_flush_bios;
+ struct bio_list deferred_writethrough_bios;
struct list_head quiesced_migrations;
struct list_head completed_migrations;
struct list_head need_commit_migrations;
@@ -158,7 +160,7 @@ struct cache {
/*
* origin_blocks entries, discarded if set.
*/
- sector_t discard_block_size; /* a power of 2 times sectors per block */
+ uint32_t discard_block_size; /* a power of 2 times sectors per block */
dm_dblock_t discard_nr_blocks;
unsigned long *discard_bitset;
@@ -199,6 +201,16 @@ struct per_bio_data {
bool tick:1;
unsigned req_nr:2;
struct dm_deferred_entry *all_io_entry;
+
+ /*
+ * writethrough fields. These MUST remain at the end of this
+ * structure and the 'cache' member must be the first as it
+ * is used to determine the offset of the writethrough fields.
+ */
+ struct cache *cache;
+ dm_cblock_t cblock;
+ bio_end_io_t *saved_bi_end_io;
+ struct dm_bio_details bio_details;
};
struct dm_cache_migration {
@@ -381,7 +393,7 @@ static int get_cell(struct cache *cache,
return r;
}
- /*----------------------------------------------------------------*/
+/*----------------------------------------------------------------*/
static bool is_dirty(struct cache *cache, dm_cblock_t b)
{
@@ -407,22 +419,30 @@ static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cbl
}
/*----------------------------------------------------------------*/
+
static bool block_size_is_power_of_two(struct cache *cache)
{
return cache->sectors_per_block_shift >= 0;
}
+static dm_block_t block_div(dm_block_t b, uint32_t n)
+{
+ do_div(b, n);
+
+ return b;
+}
+
static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
{
- sector_t discard_blocks = cache->discard_block_size;
+ uint32_t discard_blocks = cache->discard_block_size;
dm_block_t b = from_oblock(oblock);
if (!block_size_is_power_of_two(cache))
- (void) sector_div(discard_blocks, cache->sectors_per_block);
+ discard_blocks = discard_blocks / cache->sectors_per_block;
else
discard_blocks >>= cache->sectors_per_block_shift;
- (void) sector_div(b, discard_blocks);
+ b = block_div(b, discard_blocks);
return to_dblock(b);
}
@@ -500,16 +520,28 @@ static void save_stats(struct cache *cache)
/*----------------------------------------------------------------
* Per bio data
*--------------------------------------------------------------*/
-static struct per_bio_data *get_per_bio_data(struct bio *bio)
+
+/*
+ * If using writeback, leave out struct per_bio_data's writethrough fields.
+ */
+#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
+#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
+
+static size_t get_per_bio_data_size(struct cache *cache)
{
- struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
+ return cache->features.write_through ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
+}
+
+static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
+{
+ struct per_bio_data *pb = dm_per_bio_data(bio, data_size);
BUG_ON(!pb);
return pb;
}
-static struct per_bio_data *init_per_bio_data(struct bio *bio)
+static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
{
- struct per_bio_data *pb = get_per_bio_data(bio);
+ struct per_bio_data *pb = get_per_bio_data(bio, data_size);
pb->tick = false;
pb->req_nr = dm_bio_get_target_bio_nr(bio);
@@ -543,7 +575,8 @@ static void remap_to_cache(struct cache *cache, struct bio *bio,
static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
{
unsigned long flags;
- struct per_bio_data *pb = get_per_bio_data(bio);
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
spin_lock_irqsave(&cache->lock, flags);
if (cache->need_tick_bio &&
@@ -609,6 +642,58 @@ static void issue(struct cache *cache, struct bio *bio)
spin_unlock_irqrestore(&cache->lock, flags);
}
+static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ bio_list_add(&cache->deferred_writethrough_bios, bio);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ wake_worker(cache);
+}
+
+static void writethrough_endio(struct bio *bio, int err)
+{
+ struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
+ bio->bi_end_io = pb->saved_bi_end_io;
+
+ if (err) {
+ bio_endio(bio, err);
+ return;
+ }
+
+ dm_bio_restore(&pb->bio_details, bio);
+ remap_to_cache(pb->cache, bio, pb->cblock);
+
+ /*
+ * We can't issue this bio directly, since we're in interrupt
+ * context. So it gets put on a bio list for processing by the
+ * worker thread.
+ */
+ defer_writethrough_bio(pb->cache, bio);
+}
+
+/*
+ * When running in writethrough mode we need to send writes to clean blocks
+ * to both the cache and origin devices. In future we'd like to clone the
+ * bio and send them in parallel, but for now we're doing them in
+ * series as this is easier.
+ */
+static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
+ dm_oblock_t oblock, dm_cblock_t cblock)
+{
+ struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
+
+ pb->cache = cache;
+ pb->cblock = cblock;
+ pb->saved_bi_end_io = bio->bi_end_io;
+ dm_bio_record(&pb->bio_details, bio);
+ bio->bi_end_io = writethrough_endio;
+
+ remap_to_origin_clear_discard(pb->cache, bio, oblock);
+}
+
/*----------------------------------------------------------------
* Migration processing
*
@@ -972,7 +1057,8 @@ static void defer_bio(struct cache *cache, struct bio *bio)
static void process_flush_bio(struct cache *cache, struct bio *bio)
{
- struct per_bio_data *pb = get_per_bio_data(bio);
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
BUG_ON(bio->bi_size);
if (!pb->req_nr)
@@ -1002,7 +1088,7 @@ static void process_discard_bio(struct cache *cache, struct bio *bio)
dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
dm_block_t b;
- (void) sector_div(end_block, cache->discard_block_size);
+ end_block = block_div(end_block, cache->discard_block_size);
for (b = start_block; b < end_block; b++)
set_discard(cache, to_dblock(b));
@@ -1044,7 +1130,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
dm_oblock_t block = get_bio_block(cache, bio);
struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
struct policy_result lookup_result;
- struct per_bio_data *pb = get_per_bio_data(bio);
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
bool discarded_block = is_discarded_oblock(cache, block);
bool can_migrate = discarded_block || spare_migration_bandwidth(cache);
@@ -1070,14 +1157,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
inc_hit_counter(cache, bio);
pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
- if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
- /*
- * No need to mark anything dirty in write through mode.
- */
- pb->req_nr == 0 ?
- remap_to_cache(cache, bio, lookup_result.cblock) :
- remap_to_origin_clear_discard(cache, bio, block);
- } else
+ if (is_writethrough_io(cache, bio, lookup_result.cblock))
+ remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
+ else
remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
issue(cache, bio);
@@ -1086,17 +1168,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
case POLICY_MISS:
inc_miss_counter(cache, bio);
pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
-
- if (pb->req_nr != 0) {
- /*
- * This is a duplicate writethrough io that is no
- * longer needed because the block has been demoted.
- */
- bio_endio(bio, 0);
- } else {
- remap_to_origin_clear_discard(cache, bio, block);
- issue(cache, bio);
- }
+ remap_to_origin_clear_discard(cache, bio, block);
+ issue(cache, bio);
break;
case POLICY_NEW:
@@ -1217,6 +1290,23 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
submit_bios ? generic_make_request(bio) : bio_io_error(bio);
}
+static void process_deferred_writethrough_bios(struct cache *cache)
+{
+ unsigned long flags;
+ struct bio_list bios;
+ struct bio *bio;
+
+ bio_list_init(&bios);
+
+ spin_lock_irqsave(&cache->lock, flags);
+ bio_list_merge(&bios, &cache->deferred_writethrough_bios);
+ bio_list_init(&cache->deferred_writethrough_bios);
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ while ((bio = bio_list_pop(&bios)))
+ generic_make_request(bio);
+}
+
static void writeback_some_dirty_blocks(struct cache *cache)
{
int r = 0;
@@ -1313,6 +1403,7 @@ static int more_work(struct cache *cache)
else
return !bio_list_empty(&cache->deferred_bios) ||
!bio_list_empty(&cache->deferred_flush_bios) ||
+ !bio_list_empty(&cache->deferred_writethrough_bios) ||
!list_empty(&cache->quiesced_migrations) ||
!list_empty(&cache->completed_migrations) ||
!list_empty(&cache->need_commit_migrations);
@@ -1331,6 +1422,8 @@ static void do_worker(struct work_struct *ws)
writeback_some_dirty_blocks(cache);
+ process_deferred_writethrough_bios(cache);
+
if (commit_if_needed(cache)) {
process_deferred_flush_bios(cache, false);
@@ -1353,6 +1446,7 @@ static void do_worker(struct work_struct *ws)
static void do_waker(struct work_struct *ws)
{
struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
+ policy_tick(cache->policy);
wake_worker(cache);
queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
}
@@ -1717,7 +1811,37 @@ static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
static struct kmem_cache *migration_cache;
-static int set_config_values(struct dm_cache_policy *p, int argc, const char **argv)
+#define NOT_CORE_OPTION 1
+
+static int process_config_option(struct cache *cache, const char *key, const char *value)
+{
+ unsigned long tmp;
+
+ if (!strcasecmp(key, "migration_threshold")) {
+ if (kstrtoul(value, 10, &tmp))
+ return -EINVAL;
+
+ cache->migration_threshold = tmp;
+ return 0;
+ }
+
+ return NOT_CORE_OPTION;
+}
+
+static int set_config_value(struct cache *cache, const char *key, const char *value)
+{
+ int r = process_config_option(cache, key, value);
+
+ if (r == NOT_CORE_OPTION)
+ r = policy_set_config_value(cache->policy, key, value);
+
+ if (r)
+ DMWARN("bad config value for %s: %s", key, value);
+
+ return r;
+}
+
+static int set_config_values(struct cache *cache, int argc, const char **argv)
{
int r = 0;
@@ -1727,12 +1851,9 @@ static int set_config_values(struct dm_cache_policy *p, int argc, const char **a
}
while (argc) {
- r = policy_set_config_value(p, argv[0], argv[1]);
- if (r) {
- DMWARN("policy_set_config_value failed: key = '%s', value = '%s'",
- argv[0], argv[1]);
- return r;
- }
+ r = set_config_value(cache, argv[0], argv[1]);
+ if (r)
+ break;
argc -= 2;
argv += 2;
@@ -1744,8 +1865,6 @@ static int set_config_values(struct dm_cache_policy *p, int argc, const char **a
static int create_cache_policy(struct cache *cache, struct cache_args *ca,
char **error)
{
- int r;
-
cache->policy = dm_cache_policy_create(ca->policy_name,
cache->cache_size,
cache->origin_sectors,
@@ -1755,11 +1874,7 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca,
return -ENOMEM;
}
- r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv);
- if (r)
- dm_cache_policy_destroy(cache->policy);
-
- return r;
+ return 0;
}
/*
@@ -1791,9 +1906,7 @@ static sector_t calculate_discard_block_size(sector_t cache_block_size,
return discard_block_size;
}
-#define DEFAULT_MIGRATION_THRESHOLD (2048 * 100)
-
-static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio);
+#define DEFAULT_MIGRATION_THRESHOLD 2048
static int cache_create(struct cache_args *ca, struct cache **result)
{
@@ -1811,7 +1924,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
cache->ti = ca->ti;
ti->private = cache;
- ti->per_bio_data_size = sizeof(struct per_bio_data);
ti->num_flush_bios = 2;
ti->flush_supported = true;
@@ -1819,10 +1931,8 @@ static int cache_create(struct cache_args *ca, struct cache **result)
ti->discards_supported = true;
ti->discard_zeroes_data_unsupported = true;
- memcpy(&cache->features, &ca->features, sizeof(cache->features));
-
- if (cache->features.write_through)
- ti->num_write_bios = cache_num_write_bios;
+ cache->features = ca->features;
+ ti->per_bio_data_size = get_per_bio_data_size(cache);
cache->callbacks.congested_fn = cache_is_congested;
dm_table_add_target_callbacks(ti->table, &cache->callbacks);
@@ -1835,7 +1945,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
/* FIXME: factor out this whole section */
origin_blocks = cache->origin_sectors = ca->origin_sectors;
- (void) sector_div(origin_blocks, ca->block_size);
+ origin_blocks = block_div(origin_blocks, ca->block_size);
cache->origin_blocks = to_oblock(origin_blocks);
cache->sectors_per_block = ca->block_size;
@@ -1848,7 +1958,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
dm_block_t cache_size = ca->cache_sectors;
cache->sectors_per_block_shift = -1;
- (void) sector_div(cache_size, ca->block_size);
+ cache_size = block_div(cache_size, ca->block_size);
cache->cache_size = to_cblock(cache_size);
} else {
cache->sectors_per_block_shift = __ffs(ca->block_size);
@@ -1858,7 +1968,15 @@ static int cache_create(struct cache_args *ca, struct cache **result)
r = create_cache_policy(cache, ca, error);
if (r)
goto bad;
+
cache->policy_nr_args = ca->policy_argc;
+ cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
+
+ r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
+ if (r) {
+ *error = "Error setting cache policy's config values";
+ goto bad;
+ }
cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
ca->block_size, may_format,
@@ -1873,13 +1991,14 @@ static int cache_create(struct cache_args *ca, struct cache **result)
spin_lock_init(&cache->lock);
bio_list_init(&cache->deferred_bios);
bio_list_init(&cache->deferred_flush_bios);
+ bio_list_init(&cache->deferred_writethrough_bios);
INIT_LIST_HEAD(&cache->quiesced_migrations);
INIT_LIST_HEAD(&cache->completed_migrations);
INIT_LIST_HEAD(&cache->need_commit_migrations);
- cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
atomic_set(&cache->nr_migrations, 0);
init_waitqueue_head(&cache->migration_wait);
+ r = -ENOMEM;
cache->nr_dirty = 0;
cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
if (!cache->dirty_bitset) {
@@ -2002,6 +2121,8 @@ static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto out;
r = cache_create(ca, &cache);
+ if (r)
+ goto out;
r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
if (r) {
@@ -2016,26 +2137,13 @@ out:
return r;
}
-static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio)
-{
- int r;
- struct cache *cache = ti->private;
- dm_oblock_t block = get_bio_block(cache, bio);
- dm_cblock_t cblock;
-
- r = policy_lookup(cache->policy, block, &cblock);
- if (r < 0)
- return 2; /* assume the worst */
-
- return (!r && !is_dirty(cache, cblock)) ? 2 : 1;
-}
-
static int cache_map(struct dm_target *ti, struct bio *bio)
{
struct cache *cache = ti->private;
int r;
dm_oblock_t block = get_bio_block(cache, bio);
+ size_t pb_data_size = get_per_bio_data_size(cache);
bool can_migrate = false;
bool discarded_block;
struct dm_bio_prison_cell *cell;
@@ -2052,7 +2160,7 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
}
- pb = init_per_bio_data(bio);
+ pb = init_per_bio_data(bio, pb_data_size);
if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
defer_bio(cache, bio);
@@ -2097,18 +2205,12 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
inc_hit_counter(cache, bio);
pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
- if (is_writethrough_io(cache, bio, lookup_result.cblock)) {
- /*
- * No need to mark anything dirty in write through mode.
- */
- pb->req_nr == 0 ?
- remap_to_cache(cache, bio, lookup_result.cblock) :
- remap_to_origin_clear_discard(cache, bio, block);
- cell_defer(cache, cell, false);
- } else {
+ if (is_writethrough_io(cache, bio, lookup_result.cblock))
+ remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
+ else
remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
- cell_defer(cache, cell, false);
- }
+
+ cell_defer(cache, cell, false);
break;
case POLICY_MISS:
@@ -2143,7 +2245,8 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
{
struct cache *cache = ti->private;
unsigned long flags;
- struct per_bio_data *pb = get_per_bio_data(bio);
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
if (pb->tick) {
policy_tick(cache->policy);
@@ -2319,8 +2422,7 @@ static int cache_preresume(struct dm_target *ti)
}
if (!cache->loaded_mappings) {
- r = dm_cache_load_mappings(cache->cmd,
- dm_cache_policy_get_name(cache->policy),
+ r = dm_cache_load_mappings(cache->cmd, cache->policy,
load_mapping, cache);
if (r) {
DMERR("could not load cache mappings");
@@ -2443,23 +2545,6 @@ err:
DMEMIT("Error");
}
-#define NOT_CORE_OPTION 1
-
-static int process_config_option(struct cache *cache, char **argv)
-{
- unsigned long tmp;
-
- if (!strcasecmp(argv[0], "migration_threshold")) {
- if (kstrtoul(argv[1], 10, &tmp))
- return -EINVAL;
-
- cache->migration_threshold = tmp;
- return 0;
- }
-
- return NOT_CORE_OPTION;
-}
-
/*
* Supports <key> <value>.
*
@@ -2467,17 +2552,12 @@ static int process_config_option(struct cache *cache, char **argv)
*/
static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
{
- int r;
struct cache *cache = ti->private;
if (argc != 2)
return -EINVAL;
- r = process_config_option(cache, argv);
- if (r == NOT_CORE_OPTION)
- return policy_set_config_value(cache->policy, argv[0], argv[1]);
-
- return r;
+ return set_config_value(cache, argv[0], argv[1]);
}
static int cache_iterate_devices(struct dm_target *ti,
@@ -2535,7 +2615,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
- .version = {1, 0, 0},
+ .version = {1, 1, 1},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 13c15480d940..6d2d41ae9e32 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -858,8 +858,7 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
unsigned int i;
struct bio_vec *bv;
- for (i = 0; i < clone->bi_vcnt; i++) {
- bv = bio_iovec_idx(clone, i);
+ bio_for_each_segment_all(bv, clone, i) {
BUG_ON(!bv->bv_page);
mempool_free(bv->bv_page, cc->page_pool);
bv->bv_page = NULL;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 51bb81676be3..bdf26f5bd326 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -907,6 +907,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
+ ti->num_write_same_bios = 1;
return 0;
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 311e3d35b272..1d3fe1a40a9b 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1279,6 +1279,31 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_SUBMITTED;
}
+static const char *decipher_sync_action(struct mddev *mddev)
+{
+ if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+ return "frozen";
+
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+ (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ return "reshape";
+
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+ if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ return "resync";
+ else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+ return "check";
+ return "repair";
+ }
+
+ if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
+ return "recover";
+ }
+
+ return "idle";
+}
+
static void raid_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
{
@@ -1298,8 +1323,18 @@ static void raid_status(struct dm_target *ti, status_type_t type,
sync = rs->md.recovery_cp;
if (sync >= rs->md.resync_max_sectors) {
+ /*
+ * Sync complete.
+ */
array_in_sync = 1;
sync = rs->md.resync_max_sectors;
+ } else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) {
+ /*
+ * If "check" or "repair" is occurring, the array has
+ * undergone an initial sync and the health characters
+ * should not be 'a' anymore.
+ */
+ array_in_sync = 1;
} else {
/*
* The array may be doing an initial sync, or it may
@@ -1311,6 +1346,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
array_in_sync = 1;
}
+
/*
* Status characters:
* 'D' = Dead/Failed device
@@ -1339,6 +1375,21 @@ static void raid_status(struct dm_target *ti, status_type_t type,
(unsigned long long) sync,
(unsigned long long) rs->md.resync_max_sectors);
+ /*
+ * Sync action:
+ * See Documentation/device-mapper/dm-raid.txt for
+ * information on each of these states.
+ */
+ DMEMIT(" %s", decipher_sync_action(&rs->md));
+
+ /*
+ * resync_mismatches/mismatch_cnt
+ * This field shows the number of discrepancies found when
+ * performing a "check" of the array.
+ */
+ DMEMIT(" %llu",
+ (unsigned long long)
+ atomic64_read(&rs->md.resync_mismatches));
break;
case STATUSTYPE_TABLE:
/* The string you would use to construct this array */
@@ -1425,7 +1476,62 @@ static void raid_status(struct dm_target *ti, status_type_t type,
}
}
-static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
+static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+ struct raid_set *rs = ti->private;
+ struct mddev *mddev = &rs->md;
+
+ if (!strcasecmp(argv[0], "reshape")) {
+ DMERR("Reshape not supported.");
+ return -EINVAL;
+ }
+
+ if (!mddev->pers || !mddev->pers->sync_request)
+ return -EINVAL;
+
+ if (!strcasecmp(argv[0], "frozen"))
+ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ else
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+
+ if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) {
+ if (mddev->sync_thread) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_reap_sync_thread(mddev);
+ }
+ } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
+ return -EBUSY;
+ else if (!strcasecmp(argv[0], "resync"))
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ else if (!strcasecmp(argv[0], "recover")) {
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ } else {
+ if (!strcasecmp(argv[0], "check"))
+ set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ else if (!!strcasecmp(argv[0], "repair"))
+ return -EINVAL;
+ set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ }
+ if (mddev->ro == 2) {
+ /* A write to sync_action is enough to justify
+ * canceling read-auto mode
+ */
+ mddev->ro = 0;
+ if (!mddev->suspended)
+ md_wakeup_thread(mddev->sync_thread);
+ }
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ if (!mddev->suspended)
+ md_wakeup_thread(mddev->thread);
+
+ return 0;
+}
+
+static int raid_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
{
struct raid_set *rs = ti->private;
unsigned i;
@@ -1482,12 +1588,13 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = {
.name = "raid",
- .version = {1, 4, 2},
+ .version = {1, 5, 0},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
.map = raid_map,
.status = raid_status,
+ .message = raid_message,
.iterate_devices = raid_iterate_devices,
.io_hints = raid_io_hints,
.presuspend = raid_presuspend,
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index d053098c6a91..699b5be68d31 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -458,7 +458,7 @@ static void map_region(struct dm_io_region *io, struct mirror *m,
{
io->bdev = m->dev->bdev;
io->sector = map_sector(m, bio);
- io->count = bio->bi_size >> 9;
+ io->count = bio_sectors(bio);
}
static void hold_bio(struct mirror_set *ms, struct bio *bio)
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index c0e07026a8d1..c434e5aab2df 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1121,6 +1121,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
s->pending_pool = mempool_create_slab_pool(MIN_IOS, pending_cache);
if (!s->pending_pool) {
ti->error = "Could not allocate mempool for pending exceptions";
+ r = -ENOMEM;
goto bad_pending_pool;
}
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index d8837d313f54..d907ca6227ce 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -94,7 +94,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct stripe_c *sc;
- sector_t width;
+ sector_t width, tmp_len;
uint32_t stripes;
uint32_t chunk_size;
int r;
@@ -116,15 +116,16 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
width = ti->len;
- if (sector_div(width, chunk_size)) {
+ if (sector_div(width, stripes)) {
ti->error = "Target length not divisible by "
- "chunk size";
+ "number of stripes";
return -EINVAL;
}
- if (sector_div(width, stripes)) {
+ tmp_len = width;
+ if (sector_div(tmp_len, chunk_size)) {
ti->error = "Target length not divisible by "
- "number of stripes";
+ "chunk size";
return -EINVAL;
}
@@ -258,7 +259,7 @@ static int stripe_map_range(struct stripe_c *sc, struct bio *bio,
sector_t begin, end;
stripe_map_range_sector(sc, bio->bi_sector, target_stripe, &begin);
- stripe_map_range_sector(sc, bio->bi_sector + bio_sectors(bio),
+ stripe_map_range_sector(sc, bio_end_sector(bio),
target_stripe, &end);
if (begin < end) {
bio->bi_bdev = sc->stripe[target_stripe].dev->bdev;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index e50dad0c65f4..1ff252ab7d46 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1442,7 +1442,7 @@ static bool dm_table_supports_write_same(struct dm_table *t)
return false;
if (!ti->type->iterate_devices ||
- !ti->type->iterate_devices(ti, device_not_write_same_capable, NULL))
+ ti->type->iterate_devices(ti, device_not_write_same_capable, NULL))
return false;
}
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 00cee02f8fc9..60bce435f4fa 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1645,12 +1645,12 @@ int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
return r;
}
-static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
+static int __resize_space_map(struct dm_space_map *sm, dm_block_t new_count)
{
int r;
dm_block_t old_count;
- r = dm_sm_get_nr_blocks(pmd->data_sm, &old_count);
+ r = dm_sm_get_nr_blocks(sm, &old_count);
if (r)
return r;
@@ -1658,11 +1658,11 @@ static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
return 0;
if (new_count < old_count) {
- DMERR("cannot reduce size of data device");
+ DMERR("cannot reduce size of space map");
return -EINVAL;
}
- return dm_sm_extend(pmd->data_sm, new_count - old_count);
+ return dm_sm_extend(sm, new_count - old_count);
}
int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
@@ -1671,7 +1671,19 @@ int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
down_write(&pmd->root_lock);
if (!pmd->fail_io)
- r = __resize_data_dev(pmd, new_count);
+ r = __resize_space_map(pmd->data_sm, new_count);
+ up_write(&pmd->root_lock);
+
+ return r;
+}
+
+int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
+{
+ int r = -EINVAL;
+
+ down_write(&pmd->root_lock);
+ if (!pmd->fail_io)
+ r = __resize_space_map(pmd->metadata_sm, new_count);
up_write(&pmd->root_lock);
return r;
@@ -1684,3 +1696,17 @@ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
dm_bm_set_read_only(pmd->bm);
up_write(&pmd->root_lock);
}
+
+int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
+ dm_block_t threshold,
+ dm_sm_threshold_fn fn,
+ void *context)
+{
+ int r;
+
+ down_write(&pmd->root_lock);
+ r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context);
+ up_write(&pmd->root_lock);
+
+ return r;
+}
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 0cecc3702885..845ebbe589a9 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -8,6 +8,7 @@
#define DM_THIN_METADATA_H
#include "persistent-data/dm-block-manager.h"
+#include "persistent-data/dm-space-map.h"
#define THIN_METADATA_BLOCK_SIZE 4096
@@ -185,6 +186,7 @@ int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result);
* blocks would be lost.
*/
int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size);
+int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_size);
/*
* Flicks the underlying block manager into read only mode, so you know
@@ -192,6 +194,11 @@ int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_size);
*/
void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd);
+int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
+ dm_block_t threshold,
+ dm_sm_threshold_fn fn,
+ void *context);
+
/*----------------------------------------------------------------*/
#endif
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 009339d62828..759cffc45cab 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -922,7 +922,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
return r;
if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
- DMWARN("%s: reached low water mark, sending event.",
+ DMWARN("%s: reached low water mark for data device: sending event.",
dm_device_name(pool->pool_md));
spin_lock_irqsave(&pool->lock, flags);
pool->low_water_triggered = 1;
@@ -1281,6 +1281,10 @@ static void process_bio_fail(struct thin_c *tc, struct bio *bio)
bio_io_error(bio);
}
+/*
+ * FIXME: should we also commit due to size of transaction, measured in
+ * metadata blocks?
+ */
static int need_commit_due_to_time(struct pool *pool)
{
return jiffies < pool->last_commit_jiffies ||
@@ -1577,6 +1581,11 @@ static bool data_dev_supports_discard(struct pool_c *pt)
return q && blk_queue_discard(q);
}
+static bool is_factor(sector_t block_size, uint32_t n)
+{
+ return !sector_div(block_size, n);
+}
+
/*
* If discard_passdown was enabled verify that the data device
* supports discards. Disable discard_passdown if not.
@@ -1602,7 +1611,7 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
else if (data_limits->discard_granularity > block_size)
reason = "discard granularity larger than a block";
- else if (block_size & (data_limits->discard_granularity - 1))
+ else if (!is_factor(block_size, data_limits->discard_granularity))
reason = "discard granularity not a factor of block size";
if (reason) {
@@ -1904,6 +1913,56 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
return r;
}
+static void metadata_low_callback(void *context)
+{
+ struct pool *pool = context;
+
+ DMWARN("%s: reached low water mark for metadata device: sending event.",
+ dm_device_name(pool->pool_md));
+
+ dm_table_event(pool->ti->table);
+}
+
+static sector_t get_metadata_dev_size(struct block_device *bdev)
+{
+ sector_t metadata_dev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+ char buffer[BDEVNAME_SIZE];
+
+ if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) {
+ DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
+ bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS);
+ metadata_dev_size = THIN_METADATA_MAX_SECTORS_WARNING;
+ }
+
+ return metadata_dev_size;
+}
+
+static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
+{
+ sector_t metadata_dev_size = get_metadata_dev_size(bdev);
+
+ sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
+
+ return metadata_dev_size;
+}
+
+/*
+ * When a metadata threshold is crossed a dm event is triggered, and
+ * userland should respond by growing the metadata device. We could let
+ * userland set the threshold, like we do with the data threshold, but I'm
+ * not sure they know enough to do this well.
+ */
+static dm_block_t calc_metadata_threshold(struct pool_c *pt)
+{
+ /*
+ * 4M is ample for all ops with the possible exception of thin
+ * device deletion which is harmless if it fails (just retry the
+ * delete after you've grown the device).
+ */
+ dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4;
+ return min((dm_block_t)1024ULL /* 4M */, quarter);
+}
+
/*
* thin-pool <metadata dev> <data dev>
* <data block size (sectors)>
@@ -1926,8 +1985,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
unsigned long block_size;
dm_block_t low_water_blocks;
struct dm_dev *metadata_dev;
- sector_t metadata_dev_size;
- char b[BDEVNAME_SIZE];
+ fmode_t metadata_mode;
/*
* FIXME Remove validation from scope of lock.
@@ -1939,19 +1997,32 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
r = -EINVAL;
goto out_unlock;
}
+
as.argc = argc;
as.argv = argv;
- r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
+ /*
+ * Set default pool features.
+ */
+ pool_features_init(&pf);
+
+ dm_consume_args(&as, 4);
+ r = parse_pool_features(&as, &pf, ti);
+ if (r)
+ goto out_unlock;
+
+ metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE);
+ r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev);
if (r) {
ti->error = "Error opening metadata block device";
goto out_unlock;
}
- metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
- if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
- DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
- bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
+ /*
+ * Run for the side-effect of possibly issuing a warning if the
+ * device is too big.
+ */
+ (void) get_metadata_dev_size(metadata_dev->bdev);
r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
if (r) {
@@ -1974,16 +2045,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto out;
}
- /*
- * Set default pool features.
- */
- pool_features_init(&pf);
-
- dm_consume_args(&as, 4);
- r = parse_pool_features(&as, &pf, ti);
- if (r)
- goto out;
-
pt = kzalloc(sizeof(*pt), GFP_KERNEL);
if (!pt) {
r = -ENOMEM;
@@ -2035,6 +2096,13 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
}
ti->private = pt;
+ r = dm_pool_register_metadata_threshold(pt->pool->pmd,
+ calc_metadata_threshold(pt),
+ metadata_low_callback,
+ pool);
+ if (r)
+ goto out_free_pt;
+
pt->callbacks.congested_fn = pool_is_congested;
dm_table_add_target_callbacks(ti->table, &pt->callbacks);
@@ -2074,18 +2142,7 @@ static int pool_map(struct dm_target *ti, struct bio *bio)
return r;
}
-/*
- * Retrieves the number of blocks of the data device from
- * the superblock and compares it to the actual device size,
- * thus resizing the data device in case it has grown.
- *
- * This both copes with opening preallocated data devices in the ctr
- * being followed by a resume
- * -and-
- * calling the resume method individually after userspace has
- * grown the data device in reaction to a table event.
- */
-static int pool_preresume(struct dm_target *ti)
+static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
{
int r;
struct pool_c *pt = ti->private;
@@ -2093,12 +2150,7 @@ static int pool_preresume(struct dm_target *ti)
sector_t data_size = ti->len;
dm_block_t sb_data_size;
- /*
- * Take control of the pool object.
- */
- r = bind_control_target(pool, ti);
- if (r)
- return r;
+ *need_commit = false;
(void) sector_div(data_size, pool->sectors_per_block);
@@ -2109,7 +2161,7 @@ static int pool_preresume(struct dm_target *ti)
}
if (data_size < sb_data_size) {
- DMERR("pool target too small, is %llu blocks (expected %llu)",
+ DMERR("pool target (%llu blocks) too small: expected %llu",
(unsigned long long)data_size, sb_data_size);
return -EINVAL;
@@ -2117,17 +2169,90 @@ static int pool_preresume(struct dm_target *ti)
r = dm_pool_resize_data_dev(pool->pmd, data_size);
if (r) {
DMERR("failed to resize data device");
- /* FIXME Stricter than necessary: Rollback transaction instead here */
set_pool_mode(pool, PM_READ_ONLY);
return r;
}
- (void) commit_or_fallback(pool);
+ *need_commit = true;
}
return 0;
}
+static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
+{
+ int r;
+ struct pool_c *pt = ti->private;
+ struct pool *pool = pt->pool;
+ dm_block_t metadata_dev_size, sb_metadata_dev_size;
+
+ *need_commit = false;
+
+ metadata_dev_size = get_metadata_dev_size(pool->md_dev);
+
+ r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size);
+ if (r) {
+ DMERR("failed to retrieve data device size");
+ return r;
+ }
+
+ if (metadata_dev_size < sb_metadata_dev_size) {
+ DMERR("metadata device (%llu sectors) too small: expected %llu",
+ metadata_dev_size, sb_metadata_dev_size);
+ return -EINVAL;
+
+ } else if (metadata_dev_size > sb_metadata_dev_size) {
+ r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
+ if (r) {
+ DMERR("failed to resize metadata device");
+ return r;
+ }
+
+ *need_commit = true;
+ }
+
+ return 0;
+}
+
+/*
+ * Retrieves the number of blocks of the data device from
+ * the superblock and compares it to the actual device size,
+ * thus resizing the data device in case it has grown.
+ *
+ * This both copes with opening preallocated data devices in the ctr
+ * being followed by a resume
+ * -and-
+ * calling the resume method individually after userspace has
+ * grown the data device in reaction to a table event.
+ */
+static int pool_preresume(struct dm_target *ti)
+{
+ int r;
+ bool need_commit1, need_commit2;
+ struct pool_c *pt = ti->private;
+ struct pool *pool = pt->pool;
+
+ /*
+ * Take control of the pool object.
+ */
+ r = bind_control_target(pool, ti);
+ if (r)
+ return r;
+
+ r = maybe_resize_data_dev(ti, &need_commit1);
+ if (r)
+ return r;
+
+ r = maybe_resize_metadata_dev(ti, &need_commit2);
+ if (r)
+ return r;
+
+ if (need_commit1 || need_commit2)
+ (void) commit_or_fallback(pool);
+
+ return 0;
+}
+
static void pool_resume(struct dm_target *ti)
{
struct pool_c *pt = ti->private;
@@ -2544,7 +2669,7 @@ static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 6, 1},
+ .version = {1, 8, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -2831,7 +2956,7 @@ static int thin_iterate_devices(struct dm_target *ti,
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 7, 1},
+ .version = {1, 8, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 6ad538375c3c..b948fd864d45 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -93,6 +93,13 @@ struct dm_verity_io {
*/
};
+struct dm_verity_prefetch_work {
+ struct work_struct work;
+ struct dm_verity *v;
+ sector_t block;
+ unsigned n_blocks;
+};
+
static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io)
{
return (struct shash_desc *)(io + 1);
@@ -424,15 +431,18 @@ static void verity_end_io(struct bio *bio, int error)
* The root buffer is not prefetched, it is assumed that it will be cached
* all the time.
*/
-static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io)
+static void verity_prefetch_io(struct work_struct *work)
{
+ struct dm_verity_prefetch_work *pw =
+ container_of(work, struct dm_verity_prefetch_work, work);
+ struct dm_verity *v = pw->v;
int i;
for (i = v->levels - 2; i >= 0; i--) {
sector_t hash_block_start;
sector_t hash_block_end;
- verity_hash_at_level(v, io->block, i, &hash_block_start, NULL);
- verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL);
+ verity_hash_at_level(v, pw->block, i, &hash_block_start, NULL);
+ verity_hash_at_level(v, pw->block + pw->n_blocks - 1, i, &hash_block_end, NULL);
if (!i) {
unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster);
@@ -452,6 +462,25 @@ no_prefetch_cluster:
dm_bufio_prefetch(v->bufio, hash_block_start,
hash_block_end - hash_block_start + 1);
}
+
+ kfree(pw);
+}
+
+static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
+{
+ struct dm_verity_prefetch_work *pw;
+
+ pw = kmalloc(sizeof(struct dm_verity_prefetch_work),
+ GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+
+ if (!pw)
+ return;
+
+ INIT_WORK(&pw->work, verity_prefetch_io);
+ pw->v = v;
+ pw->block = io->block;
+ pw->n_blocks = io->n_blocks;
+ queue_work(v->verify_wq, &pw->work);
}
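Prefetching the hash tree now happens off the map path: verity_submit_prefetch() allocates a small work item with GFP_NOIO | __GFP_NORETRY (so an allocation failure simply skips the optimisation rather than blocking I/O) and queues it on the existing verify workqueue, and the worker recovers its context with container_of(). The embed-and-recover pattern that makes this work, shown as a self-contained program rather than kernel code:

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct work_item {                  /* stand-in for struct work_struct */
        void (*fn)(struct work_item *w);
    };

    struct prefetch_work {              /* mirrors dm_verity_prefetch_work */
        struct work_item work;          /* embedded, recovered by the worker */
        unsigned long long block;
        unsigned n_blocks;
    };

    static void prefetch_fn(struct work_item *w)
    {
        struct prefetch_work *pw = container_of(w, struct prefetch_work, work);

        printf("prefetch %u blocks from %llu\n", pw->n_blocks, pw->block);
    }

    int main(void)
    {
        struct prefetch_work pw = { { prefetch_fn }, 100, 8 };

        pw.work.fn(&pw.work);           /* what queue_work()'s worker would do */
        return 0;
    }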
/*
@@ -472,7 +501,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
return -EIO;
}
- if ((bio->bi_sector + bio_sectors(bio)) >>
+ if (bio_end_sector(bio) >>
(v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) {
DMERR_LIMIT("io out of range");
return -EIO;
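bio_end_sector() here, and bio_sectors()/bio_segments() in the hunks that follow, are block-layer accessors that replace the open-coded bi_size and bi_idx arithmetic this series removes throughout md and dm. Assuming the 3.x bio layout, they amount to the following; the struct is a cut-down illustration, not the real struct bio:

    #include <stdio.h>

    struct fake_bio {                   /* only the fields the helpers touch */
        unsigned long long bi_sector;   /* start, in 512-byte sectors */
        unsigned int bi_size;           /* remaining I/O, in bytes */
        unsigned short bi_idx;          /* current index into bi_io_vec */
        unsigned short bi_vcnt;         /* number of bio_vecs */
    };

    #define bio_sectors(bio)    ((bio)->bi_size >> 9)
    #define bio_end_sector(bio) ((bio)->bi_sector + bio_sectors(bio))
    #define bio_segments(bio)   ((bio)->bi_vcnt - (bio)->bi_idx)

    int main(void)
    {
        struct fake_bio b = { .bi_sector = 2048, .bi_size = 4096,
                              .bi_idx = 0, .bi_vcnt = 1 };

        printf("sectors=%llu end=%llu segs=%u\n",
               (unsigned long long)bio_sectors(&b),
               (unsigned long long)bio_end_sector(&b),
               (unsigned)bio_segments(&b));
        return 0;
    }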
@@ -490,7 +519,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
bio->bi_end_io = verity_end_io;
bio->bi_private = io;
- io->io_vec_size = bio->bi_vcnt - bio->bi_idx;
+ io->io_vec_size = bio_segments(bio);
if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE)
io->io_vec = io->io_vec_inline;
else
@@ -498,7 +527,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
memcpy(io->io_vec, bio_iovec(bio),
io->io_vec_size * sizeof(struct bio_vec));
- verity_prefetch_io(v, io);
+ verity_submit_prefetch(v, io);
generic_make_request(bio);
@@ -858,7 +887,7 @@ bad:
static struct target_type verity_target = {
.name = "verity",
- .version = {1, 1, 1},
+ .version = {1, 2, 0},
.module = THIS_MODULE,
.ctr = verity_ctr,
.dtr = verity_dtr,
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7e469260fe5e..d5370a94b2c1 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -339,7 +339,7 @@ out:
return md ? 0 : -ENXIO;
}
-static int dm_blk_close(struct gendisk *disk, fmode_t mode)
+static void dm_blk_close(struct gendisk *disk, fmode_t mode)
{
struct mapped_device *md = disk->private_data;
@@ -349,8 +349,6 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode)
dm_put(md);
spin_unlock(&_minor_lock);
-
- return 0;
}
int dm_open_count(struct mapped_device *md)
@@ -611,6 +609,7 @@ static void dec_pending(struct dm_io *io, int error)
queue_io(md, bio);
} else {
/* done with normal IO or empty flush */
+ trace_block_bio_complete(md->queue, bio, io_error);
bio_endio(bio, io_error);
}
}
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 5e7dc772f5de..3193aefe982b 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -185,8 +185,7 @@ static void make_request(struct mddev *mddev, struct bio *bio)
return;
}
- if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9),
- WRITE))
+ if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), WRITE))
failit = 1;
if (check_mode(conf, WritePersistent)) {
add_sector(conf, bio->bi_sector, WritePersistent);
@@ -196,8 +195,7 @@ static void make_request(struct mddev *mddev, struct bio *bio)
failit = 1;
} else {
/* read request */
- if (check_sector(conf, bio->bi_sector, bio->bi_sector + (bio->bi_size>>9),
- READ))
+ if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), READ))
failit = 1;
if (check_mode(conf, ReadTransient))
failit = 1;
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 21014836bdbf..f03fabd2b37b 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -317,8 +317,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
bio_io_error(bio);
return;
}
- if (unlikely(bio->bi_sector + (bio->bi_size >> 9) >
- tmp_dev->end_sector)) {
+ if (unlikely(bio_end_sector(bio) > tmp_dev->end_sector)) {
/* This bio crosses a device boundary, so we have to
* split it.
*/
diff --git a/drivers/md/md.c b/drivers/md/md.c
index fcb878f88796..681d1099a2d5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -72,6 +72,9 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;
+static int remove_and_add_spares(struct mddev *mddev,
+ struct md_rdev *this);
+
#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
/*
@@ -194,21 +197,12 @@ void md_trim_bio(struct bio *bio, int offset, int size)
if (offset == 0 && size == bio->bi_size)
return;
- bio->bi_sector += offset;
- bio->bi_size = size;
- offset <<= 9;
clear_bit(BIO_SEG_VALID, &bio->bi_flags);
- while (bio->bi_idx < bio->bi_vcnt &&
- bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
- /* remove this whole bio_vec */
- offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
- bio->bi_idx++;
- }
- if (bio->bi_idx < bio->bi_vcnt) {
- bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
- bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
- }
+ bio_advance(bio, offset << 9);
+
+ bio->bi_size = size;
+
/* avoid any complications with bi_idx being non-zero*/
if (bio->bi_idx) {
memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
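bio_advance(bio, bytes) replaces the removed loop: it moves bi_sector forward, shrinks bi_size, steps bi_idx past fully consumed segments and trims a partially consumed one via bv_offset/bv_len. A rough userspace analogue of that bookkeeping (field names only mimic the bio; this is not the kernel helper):

    #include <stdio.h>

    struct vec { unsigned offset, len; };

    struct mini_bio {
        unsigned long long sector;      /* 512-byte units */
        unsigned size;                  /* bytes left */
        unsigned idx;                   /* first live vec */
        struct vec vecs[4];
    };

    /* Rough analogue of bio_advance(bio, bytes); assumes bytes <= b->size. */
    static void advance(struct mini_bio *b, unsigned bytes)
    {
        b->sector += bytes >> 9;
        b->size -= bytes;

        while (bytes && bytes >= b->vecs[b->idx].len) {
            bytes -= b->vecs[b->idx].len;
            b->idx++;                   /* whole segment consumed */
        }
        if (bytes) {                    /* partial segment */
            b->vecs[b->idx].offset += bytes;
            b->vecs[b->idx].len -= bytes;
        }
    }

    int main(void)
    {
        struct mini_bio b = { 0, 8192, 0, { { 0, 4096 }, { 0, 4096 } } };

        advance(&b, 4608);              /* one full page plus 512 bytes */
        printf("sector=%llu size=%u idx=%u off=%u len=%u\n",
               b.sector, b.size, b.idx,
               b.vecs[b.idx].offset, b.vecs[b.idx].len);
        return 0;
    }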
@@ -1564,8 +1558,8 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
sector, count, 1) == 0)
return -EINVAL;
}
- } else if (sb->bblog_offset == 0)
- rdev->badblocks.shift = -1;
+ } else if (sb->bblog_offset != 0)
+ rdev->badblocks.shift = 0;
if (!refdev) {
ret = 1;
@@ -2411,6 +2405,11 @@ static void md_update_sb(struct mddev * mddev, int force_change)
int nospares = 0;
int any_badblocks_changed = 0;
+ if (mddev->ro) {
+ if (force_change)
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ return;
+ }
repeat:
/* First make sure individual recovery_offsets are correct */
rdev_for_each(rdev, mddev) {
@@ -2800,12 +2799,10 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
/* personality does all needed checks */
if (rdev->mddev->pers->hot_remove_disk == NULL)
return -EINVAL;
- err = rdev->mddev->pers->
- hot_remove_disk(rdev->mddev, rdev);
- if (err)
- return err;
- sysfs_unlink_rdev(rdev->mddev, rdev);
- rdev->raid_disk = -1;
+ clear_bit(Blocked, &rdev->flags);
+ remove_and_add_spares(rdev->mddev, rdev);
+ if (rdev->raid_disk >= 0)
+ return -EBUSY;
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread);
} else if (rdev->mddev->pers) {
@@ -3221,7 +3218,7 @@ int md_rdev_init(struct md_rdev *rdev)
* be used - I wonder if that matters
*/
rdev->badblocks.count = 0;
- rdev->badblocks.shift = 0;
+ rdev->badblocks.shift = -1; /* disabled until explicitly enabled */
rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
seqlock_init(&rdev->badblocks.lock);
if (rdev->badblocks.page == NULL)
@@ -3293,9 +3290,6 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
goto abort_free;
}
}
- if (super_format == -1)
- /* hot-add for 0.90, or non-persistent: so no badblocks */
- rdev->badblocks.shift = -1;
return rdev;
@@ -4225,8 +4219,6 @@ action_show(struct mddev *mddev, char *page)
return sprintf(page, "%s\n", type);
}
-static void reap_sync_thread(struct mddev *mddev);
-
static ssize_t
action_store(struct mddev *mddev, const char *page, size_t len)
{
@@ -4241,7 +4233,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- reap_sync_thread(mddev);
+ md_reap_sync_thread(mddev);
}
} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
@@ -5279,7 +5271,7 @@ static void __md_stop_writes(struct mddev *mddev)
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- reap_sync_thread(mddev);
+ md_reap_sync_thread(mddev);
}
del_timer_sync(&mddev->safemode_timer);
@@ -5287,7 +5279,8 @@ static void __md_stop_writes(struct mddev *mddev)
bitmap_flush(mddev);
md_super_wait(mddev);
- if (!mddev->in_sync || mddev->flags) {
+ if (mddev->ro == 0 &&
+ (!mddev->in_sync || mddev->flags)) {
/* mark array as shutdown cleanly */
mddev->in_sync = 1;
md_update_sb(mddev, 1);
@@ -5810,7 +5803,7 @@ static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
else
sysfs_notify_dirent_safe(rdev->sysfs_state);
- md_update_sb(mddev, 1);
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
if (mddev->degraded)
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -5877,6 +5870,9 @@ static int hot_remove_disk(struct mddev * mddev, dev_t dev)
if (!rdev)
return -ENXIO;
+ clear_bit(Blocked, &rdev->flags);
+ remove_and_add_spares(mddev, rdev);
+
if (rdev->raid_disk >= 0)
goto busy;
@@ -6490,6 +6486,28 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
err = md_set_readonly(mddev, bdev);
goto done_unlock;
+ case HOT_REMOVE_DISK:
+ err = hot_remove_disk(mddev, new_decode_dev(arg));
+ goto done_unlock;
+
+ case ADD_NEW_DISK:
+ /* We can support ADD_NEW_DISK on read-only arrays
+	 * only if we are re-adding a preexisting device.
+ * So require mddev->pers and MD_DISK_SYNC.
+ */
+ if (mddev->pers) {
+ mdu_disk_info_t info;
+ if (copy_from_user(&info, argp, sizeof(info)))
+ err = -EFAULT;
+ else if (!(info.state & (1<<MD_DISK_SYNC)))
+ /* Need to clear read-only for this */
+ break;
+ else
+ err = add_new_disk(mddev, &info);
+ goto done_unlock;
+ }
+ break;
+
case BLKROSET:
if (get_user(ro, (int __user *)(arg))) {
err = -EFAULT;
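The new ADD_NEW_DISK case means a device can be re-added while the array stays read-only, but only when userspace marks it MD_DISK_SYNC; anything else still falls through to the "need to clear read-only" path. A hedged sketch of how a userspace tool might drive this (mdadm's --re-add does something similar); the device paths and slot number are placeholders and error handling is minimal:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sys/stat.h>
    #include <sys/sysmacros.h>
    #include <linux/raid/md_u.h>
    #include <linux/raid/md_p.h>

    int main(void)
    {
        mdu_disk_info_t info = { 0 };
        struct stat st;
        int fd = open("/dev/md0", O_RDONLY);   /* array may be read-only */

        if (fd < 0 || stat("/dev/sdb1", &st) != 0)
            return 1;

        info.major = major(st.st_rdev);
        info.minor = minor(st.st_rdev);
        info.raid_disk = 1;                    /* slot recorded in the member's superblock */
        info.state = (1 << MD_DISK_ACTIVE) | (1 << MD_DISK_SYNC);

        if (ioctl(fd, ADD_NEW_DISK, &info) != 0)   /* rejected unless MD_DISK_SYNC is set */
            perror("ADD_NEW_DISK");
        return 0;
    }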
@@ -6560,10 +6578,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
goto done_unlock;
}
- case HOT_REMOVE_DISK:
- err = hot_remove_disk(mddev, new_decode_dev(arg));
- goto done_unlock;
-
case HOT_ADD_DISK:
err = hot_add_disk(mddev, new_decode_dev(arg));
goto done_unlock;
@@ -6651,15 +6665,13 @@ static int md_open(struct block_device *bdev, fmode_t mode)
return err;
}
-static int md_release(struct gendisk *disk, fmode_t mode)
+static void md_release(struct gendisk *disk, fmode_t mode)
{
struct mddev *mddev = disk->private_data;
BUG_ON(!mddev);
atomic_dec(&mddev->openers);
mddev_put(mddev);
-
- return 0;
}
static int md_media_changed(struct gendisk *disk)
@@ -7644,14 +7656,16 @@ void md_do_sync(struct md_thread *thread)
}
EXPORT_SYMBOL_GPL(md_do_sync);
-static int remove_and_add_spares(struct mddev *mddev)
+static int remove_and_add_spares(struct mddev *mddev,
+ struct md_rdev *this)
{
struct md_rdev *rdev;
int spares = 0;
int removed = 0;
rdev_for_each(rdev, mddev)
- if (rdev->raid_disk >= 0 &&
+ if ((this == NULL || rdev == this) &&
+ rdev->raid_disk >= 0 &&
!test_bit(Blocked, &rdev->flags) &&
(test_bit(Faulty, &rdev->flags) ||
! test_bit(In_sync, &rdev->flags)) &&
@@ -7663,79 +7677,55 @@ static int remove_and_add_spares(struct mddev *mddev)
removed++;
}
}
- if (removed)
- sysfs_notify(&mddev->kobj, NULL,
- "degraded");
+ if (removed && mddev->kobj.sd)
+ sysfs_notify(&mddev->kobj, NULL, "degraded");
+ if (this)
+ goto no_add;
rdev_for_each(rdev, mddev) {
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
spares++;
- if (rdev->raid_disk < 0
- && !test_bit(Faulty, &rdev->flags)) {
- rdev->recovery_offset = 0;
- if (mddev->pers->
- hot_add_disk(mddev, rdev) == 0) {
- if (sysfs_link_rdev(mddev, rdev))
- /* failure here is OK */;
- spares++;
- md_new_event(mddev);
- set_bit(MD_CHANGE_DEVS, &mddev->flags);
- }
+ if (rdev->raid_disk >= 0)
+ continue;
+ if (test_bit(Faulty, &rdev->flags))
+ continue;
+ if (mddev->ro &&
+ rdev->saved_raid_disk < 0)
+ continue;
+
+ rdev->recovery_offset = 0;
+ if (rdev->saved_raid_disk >= 0 && mddev->in_sync) {
+ spin_lock_irq(&mddev->write_lock);
+ if (mddev->in_sync)
+ /* OK, this device, which is in_sync,
+ * will definitely be noticed before
+ * the next write, so recovery isn't
+ * needed.
+ */
+ rdev->recovery_offset = mddev->recovery_cp;
+ spin_unlock_irq(&mddev->write_lock);
+ }
+ if (mddev->ro && rdev->recovery_offset != MaxSector)
+ /* not safe to add this disk now */
+ continue;
+ if (mddev->pers->
+ hot_add_disk(mddev, rdev) == 0) {
+ if (sysfs_link_rdev(mddev, rdev))
+ /* failure here is OK */;
+ spares++;
+ md_new_event(mddev);
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
}
}
+no_add:
if (removed)
set_bit(MD_CHANGE_DEVS, &mddev->flags);
return spares;
}
-static void reap_sync_thread(struct mddev *mddev)
-{
- struct md_rdev *rdev;
-
- /* resync has finished, collect result */
- md_unregister_thread(&mddev->sync_thread);
- if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
- /* success...*/
- /* activate any spares */
- if (mddev->pers->spare_active(mddev)) {
- sysfs_notify(&mddev->kobj, NULL,
- "degraded");
- set_bit(MD_CHANGE_DEVS, &mddev->flags);
- }
- }
- if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
- mddev->pers->finish_reshape)
- mddev->pers->finish_reshape(mddev);
-
- /* If array is no-longer degraded, then any saved_raid_disk
- * information must be scrapped. Also if any device is now
- * In_sync we must scrape the saved_raid_disk for that device
- * do the superblock for an incrementally recovered device
- * written out.
- */
- rdev_for_each(rdev, mddev)
- if (!mddev->degraded ||
- test_bit(In_sync, &rdev->flags))
- rdev->saved_raid_disk = -1;
-
- md_update_sb(mddev, 1);
- clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
- clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
- clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
- clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
- clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
- /* flag recovery needed just to double check */
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- sysfs_notify_dirent_safe(mddev->sysfs_action);
- md_new_event(mddev);
- if (mddev->event_work.func)
- queue_work(md_misc_wq, &mddev->event_work);
-}
-
/*
* This routine is regularly called by all per-raid-array threads to
* deal with generic issues like resync and super-block update.
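Two refactorings meet in this hunk: remove_and_add_spares() gains an optional rdev argument (pass a specific device and it removes only that device and skips the spare-adding pass, which is what the new slot_store() and hot_remove_disk() callers rely on; pass NULL and the whole-array behaviour is unchanged), and reap_sync_thread() is moved later in the file and renamed md_reap_sync_thread() so it can be exported to callers outside md.c. The optional-filter shape on its own, with made-up names:

    #include <stddef.h>
    #include <stdio.h>

    struct dev { int id; int failed; int removed; };

    /* If 'this' is non-NULL, consider only that device and skip the
     * spare-adding pass, mirroring the new remove_and_add_spares(). */
    static int remove_and_maybe_add(struct dev *devs, int n, struct dev *this)
    {
        int i, candidates = 0;

        for (i = 0; i < n; i++)
            if ((this == NULL || &devs[i] == this) && devs[i].failed)
                devs[i].removed = 1;

        if (this)
            return 0;                   /* targeted call: removal only */

        for (i = 0; i < n; i++)
            if (!devs[i].failed && !devs[i].removed)
                candidates++;           /* stand-in for the hot_add_disk() pass */

        return candidates;
    }

    int main(void)
    {
        struct dev d[2] = { { 0, 1, 0 }, { 1, 0, 0 } };

        printf("targeted: %d\n", remove_and_maybe_add(d, 2, &d[0]));
        printf("full scan: %d\n", remove_and_maybe_add(d, 2, NULL));
        return 0;
    }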
@@ -7791,22 +7781,16 @@ void md_check_recovery(struct mddev *mddev)
int spares = 0;
if (mddev->ro) {
- /* Only thing we do on a ro array is remove
- * failed devices.
+ /* On a read-only array we can:
+ * - remove failed devices
+ * - add already-in_sync devices if the array itself
+ * is in-sync.
+ * As we only add devices that are already in-sync,
+ * we can activate the spares immediately.
*/
- struct md_rdev *rdev;
- rdev_for_each(rdev, mddev)
- if (rdev->raid_disk >= 0 &&
- !test_bit(Blocked, &rdev->flags) &&
- test_bit(Faulty, &rdev->flags) &&
- atomic_read(&rdev->nr_pending)==0) {
- if (mddev->pers->hot_remove_disk(
- mddev, rdev) == 0) {
- sysfs_unlink_rdev(mddev, rdev);
- rdev->raid_disk = -1;
- }
- }
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ remove_and_add_spares(mddev, NULL);
+ mddev->pers->spare_active(mddev);
goto unlock;
}
@@ -7838,7 +7822,7 @@ void md_check_recovery(struct mddev *mddev)
goto unlock;
}
if (mddev->sync_thread) {
- reap_sync_thread(mddev);
+ md_reap_sync_thread(mddev);
goto unlock;
}
/* Set RUNNING before clearing NEEDED to avoid
@@ -7869,7 +7853,7 @@ void md_check_recovery(struct mddev *mddev)
goto unlock;
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
- } else if ((spares = remove_and_add_spares(mddev))) {
+ } else if ((spares = remove_and_add_spares(mddev, NULL))) {
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
@@ -7919,6 +7903,51 @@ void md_check_recovery(struct mddev *mddev)
}
}
+void md_reap_sync_thread(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+
+ /* resync has finished, collect result */
+ md_unregister_thread(&mddev->sync_thread);
+ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ /* success...*/
+ /* activate any spares */
+ if (mddev->pers->spare_active(mddev)) {
+ sysfs_notify(&mddev->kobj, NULL,
+ "degraded");
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ }
+ }
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ mddev->pers->finish_reshape)
+ mddev->pers->finish_reshape(mddev);
+
+ /* If array is no-longer degraded, then any saved_raid_disk
+ * information must be scrapped. Also if any device is now
+ * In_sync we must scrape the saved_raid_disk for that device
+ * do the superblock for an incrementally recovered device
+ * written out.
+ */
+ rdev_for_each(rdev, mddev)
+ if (!mddev->degraded ||
+ test_bit(In_sync, &rdev->flags))
+ rdev->saved_raid_disk = -1;
+
+ md_update_sb(mddev, 1);
+ clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ /* flag recovery needed just to double check */
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
+ md_new_event(mddev);
+ if (mddev->event_work.func)
+ queue_work(md_misc_wq, &mddev->event_work);
+}
+
void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -8644,6 +8673,7 @@ EXPORT_SYMBOL(md_register_thread);
EXPORT_SYMBOL(md_unregister_thread);
EXPORT_SYMBOL(md_wakeup_thread);
EXPORT_SYMBOL(md_check_recovery);
+EXPORT_SYMBOL(md_reap_sync_thread);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
diff --git a/drivers/md/md.h b/drivers/md/md.h
index eca59c3074ef..653f992b687a 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -506,7 +506,7 @@ static inline char * mdname (struct mddev * mddev)
static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
char nm[20];
- if (!test_bit(Replacement, &rdev->flags)) {
+ if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) {
sprintf(nm, "rd%d", rdev->raid_disk);
return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
} else
@@ -516,7 +516,7 @@ static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
char nm[20];
- if (!test_bit(Replacement, &rdev->flags)) {
+ if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) {
sprintf(nm, "rd%d", rdev->raid_disk);
sysfs_remove_link(&mddev->kobj, nm);
}
@@ -567,6 +567,7 @@ extern struct md_thread *md_register_thread(
extern void md_unregister_thread(struct md_thread **threadp);
extern void md_wakeup_thread(struct md_thread *thread);
extern void md_check_recovery(struct mddev *mddev);
+extern void md_reap_sync_thread(struct mddev *mddev);
extern void md_write_start(struct mddev *mddev, struct bio *bi);
extern void md_write_end(struct mddev *mddev);
extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index c4f28133ef82..b88757cd0d1d 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -139,15 +139,8 @@ struct child {
struct btree_node *n;
};
-static struct dm_btree_value_type le64_type = {
- .context = NULL,
- .size = sizeof(__le64),
- .inc = NULL,
- .dec = NULL,
- .equal = NULL
-};
-
-static int init_child(struct dm_btree_info *info, struct btree_node *parent,
+static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
+ struct btree_node *parent,
unsigned index, struct child *result)
{
int r, inc;
@@ -164,7 +157,7 @@ static int init_child(struct dm_btree_info *info, struct btree_node *parent,
result->n = dm_block_data(result->block);
if (inc)
- inc_children(info->tm, result->n, &le64_type);
+ inc_children(info->tm, result->n, vt);
*((__le64 *) value_ptr(parent, index)) =
cpu_to_le64(dm_block_location(result->block));
@@ -236,7 +229,7 @@ static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
}
static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
- unsigned left_index)
+ struct dm_btree_value_type *vt, unsigned left_index)
{
int r;
struct btree_node *parent;
@@ -244,11 +237,11 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
parent = dm_block_data(shadow_current(s));
- r = init_child(info, parent, left_index, &left);
+ r = init_child(info, vt, parent, left_index, &left);
if (r)
return r;
- r = init_child(info, parent, left_index + 1, &right);
+ r = init_child(info, vt, parent, left_index + 1, &right);
if (r) {
exit_child(info, &left);
return r;
@@ -368,7 +361,7 @@ static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
}
static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
- unsigned left_index)
+ struct dm_btree_value_type *vt, unsigned left_index)
{
int r;
struct btree_node *parent = dm_block_data(shadow_current(s));
@@ -377,17 +370,17 @@ static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
/*
* FIXME: fill out an array?
*/
- r = init_child(info, parent, left_index, &left);
+ r = init_child(info, vt, parent, left_index, &left);
if (r)
return r;
- r = init_child(info, parent, left_index + 1, &center);
+ r = init_child(info, vt, parent, left_index + 1, &center);
if (r) {
exit_child(info, &left);
return r;
}
- r = init_child(info, parent, left_index + 2, &right);
+ r = init_child(info, vt, parent, left_index + 2, &right);
if (r) {
exit_child(info, &left);
exit_child(info, &center);
@@ -434,7 +427,8 @@ static int get_nr_entries(struct dm_transaction_manager *tm,
}
static int rebalance_children(struct shadow_spine *s,
- struct dm_btree_info *info, uint64_t key)
+ struct dm_btree_info *info,
+ struct dm_btree_value_type *vt, uint64_t key)
{
int i, r, has_left_sibling, has_right_sibling;
uint32_t child_entries;
@@ -472,13 +466,13 @@ static int rebalance_children(struct shadow_spine *s,
has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
if (!has_left_sibling)
- r = rebalance2(s, info, i);
+ r = rebalance2(s, info, vt, i);
else if (!has_right_sibling)
- r = rebalance2(s, info, i - 1);
+ r = rebalance2(s, info, vt, i - 1);
else
- r = rebalance3(s, info, i - 1);
+ r = rebalance3(s, info, vt, i - 1);
return r;
}
@@ -529,7 +523,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
if (le32_to_cpu(n->header.flags) & LEAF_NODE)
return do_leaf(n, key, index);
- r = rebalance_children(s, info, key);
+ r = rebalance_children(s, info, vt, key);
if (r)
break;
@@ -550,6 +544,14 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
return r;
}
+static struct dm_btree_value_type le64_type = {
+ .context = NULL,
+ .size = sizeof(__le64),
+ .inc = NULL,
+ .dec = NULL,
+ .equal = NULL
+};
+
int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
uint64_t *keys, dm_block_t *new_root)
{
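The rebalancing path used to increment child references with a hard-coded 64-bit value type; it now threads the caller's dm_btree_value_type down through rebalance2/rebalance3 and init_child, so values with real inc/dec callbacks (reference-counted blocks, for instance) are accounted correctly when shared nodes are copied. The idea of carrying a value type with callbacks through a generic routine, reduced to plain C:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct value_type {                 /* cut-down dm_btree_value_type */
        size_t size;
        void (*inc)(void *context, const void *value);
        void *context;
    };

    static void inc_refcount(void *context, const void *value)
    {
        (void)value;
        (*(int *)context)++;            /* e.g. bump a block refcount */
    }

    /* A generic "share this node's values" helper that no longer
     * assumes the values are plain __le64 block numbers. */
    static void share_values(const struct value_type *vt,
                             const void *values, unsigned nr)
    {
        unsigned i;

        for (i = 0; vt->inc && i < nr; i++)
            vt->inc(vt->context, (const char *)values + i * vt->size);
    }

    int main(void)
    {
        uint64_t vals[3] = { 1, 2, 3 };
        int refs = 0;
        struct value_type vt = { sizeof(uint64_t), inc_refcount, &refs };

        share_values(&vt, vals, 3);
        printf("refs bumped: %d\n", refs);
        return 0;
    }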
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
index f6d29e614ab7..e735a6d5a793 100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -248,7 +248,8 @@ static struct dm_space_map ops = {
.new_block = sm_disk_new_block,
.commit = sm_disk_commit,
.root_size = sm_disk_root_size,
- .copy_root = sm_disk_copy_root
+ .copy_root = sm_disk_copy_root,
+ .register_threshold_callback = NULL
};
struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index 906cf3df71af..1c959684caef 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -17,6 +17,55 @@
/*----------------------------------------------------------------*/
/*
+ * An edge triggered threshold.
+ */
+struct threshold {
+ bool threshold_set;
+ bool value_set;
+ dm_block_t threshold;
+ dm_block_t current_value;
+ dm_sm_threshold_fn fn;
+ void *context;
+};
+
+static void threshold_init(struct threshold *t)
+{
+ t->threshold_set = false;
+ t->value_set = false;
+}
+
+static void set_threshold(struct threshold *t, dm_block_t value,
+ dm_sm_threshold_fn fn, void *context)
+{
+ t->threshold_set = true;
+ t->threshold = value;
+ t->fn = fn;
+ t->context = context;
+}
+
+static bool below_threshold(struct threshold *t, dm_block_t value)
+{
+ return t->threshold_set && value <= t->threshold;
+}
+
+static bool threshold_already_triggered(struct threshold *t)
+{
+ return t->value_set && below_threshold(t, t->current_value);
+}
+
+static void check_threshold(struct threshold *t, dm_block_t value)
+{
+ if (below_threshold(t, value) &&
+ !threshold_already_triggered(t))
+ t->fn(t->context);
+
+ t->value_set = true;
+ t->current_value = value;
+}
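The helper is deliberately edge-triggered: the callback fires only when the free-block count crosses from above the threshold to at-or-below it, not on every allocation while space stays low. The same logic as a standalone program (names mirror the patch; the main() is only a demonstration):

    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned long long block_t;

    struct threshold {
        bool threshold_set, value_set;
        block_t threshold, current_value;
        void (*fn)(void *context);
        void *context;
    };

    static bool below(struct threshold *t, block_t v)
    {
        return t->threshold_set && v <= t->threshold;
    }

    static void check_threshold(struct threshold *t, block_t v)
    {
        /* fire only on the transition into the low-space region */
        if (below(t, v) && !(t->value_set && below(t, t->current_value)))
            t->fn(t->context);
        t->value_set = true;
        t->current_value = v;
    }

    static void warn(void *context) { puts((const char *)context); }

    int main(void)
    {
        struct threshold t = { true, false, 10, 0, warn, "low on metadata space" };

        check_threshold(&t, 12);        /* above: silent */
        check_threshold(&t, 9);         /* crosses: fires once */
        check_threshold(&t, 5);         /* still below: silent */
        check_threshold(&t, 20);        /* recovered */
        check_threshold(&t, 8);         /* crosses again: fires */
        return 0;
    }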
+
+/*----------------------------------------------------------------*/
+
+/*
* Space map interface.
*
* The low level disk format is written using the standard btree and
@@ -54,6 +103,8 @@ struct sm_metadata {
unsigned allocated_this_transaction;
unsigned nr_uncommitted;
struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS];
+
+ struct threshold threshold;
};
static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
@@ -144,12 +195,6 @@ static void sm_metadata_destroy(struct dm_space_map *sm)
kfree(smm);
}
-static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
-{
- DMERR("doesn't support extend");
- return -EINVAL;
-}
-
static int sm_metadata_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
{
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
@@ -335,9 +380,19 @@ static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b)
static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
{
+ dm_block_t count;
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
int r = sm_metadata_new_block_(sm, b);
if (r)
DMERR("unable to allocate new metadata block");
+
+ r = sm_metadata_get_nr_free(sm, &count);
+ if (r)
+ DMERR("couldn't get free block count");
+
+ check_threshold(&smm->threshold, count);
+
return r;
}
@@ -357,6 +412,18 @@ static int sm_metadata_commit(struct dm_space_map *sm)
return 0;
}
+static int sm_metadata_register_threshold_callback(struct dm_space_map *sm,
+ dm_block_t threshold,
+ dm_sm_threshold_fn fn,
+ void *context)
+{
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+
+ set_threshold(&smm->threshold, threshold, fn, context);
+
+ return 0;
+}
+
static int sm_metadata_root_size(struct dm_space_map *sm, size_t *result)
{
*result = sizeof(struct disk_sm_root);
@@ -382,6 +449,8 @@ static int sm_metadata_copy_root(struct dm_space_map *sm, void *where_le, size_t
return 0;
}
+static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks);
+
static struct dm_space_map ops = {
.destroy = sm_metadata_destroy,
.extend = sm_metadata_extend,
@@ -395,7 +464,8 @@ static struct dm_space_map ops = {
.new_block = sm_metadata_new_block,
.commit = sm_metadata_commit,
.root_size = sm_metadata_root_size,
- .copy_root = sm_metadata_copy_root
+ .copy_root = sm_metadata_copy_root,
+ .register_threshold_callback = sm_metadata_register_threshold_callback
};
/*----------------------------------------------------------------*/
@@ -410,7 +480,7 @@ static void sm_bootstrap_destroy(struct dm_space_map *sm)
static int sm_bootstrap_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
{
- DMERR("boostrap doesn't support extend");
+ DMERR("bootstrap doesn't support extend");
return -EINVAL;
}
@@ -450,7 +520,7 @@ static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm,
static int sm_bootstrap_set_count(struct dm_space_map *sm, dm_block_t b,
uint32_t count)
{
- DMERR("boostrap doesn't support set_count");
+ DMERR("bootstrap doesn't support set_count");
return -EINVAL;
}
@@ -491,7 +561,7 @@ static int sm_bootstrap_commit(struct dm_space_map *sm)
static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result)
{
- DMERR("boostrap doesn't support root_size");
+ DMERR("bootstrap doesn't support root_size");
return -EINVAL;
}
@@ -499,7 +569,7 @@ static int sm_bootstrap_root_size(struct dm_space_map *sm, size_t *result)
static int sm_bootstrap_copy_root(struct dm_space_map *sm, void *where,
size_t max)
{
- DMERR("boostrap doesn't support copy_root");
+ DMERR("bootstrap doesn't support copy_root");
return -EINVAL;
}
@@ -517,11 +587,42 @@ static struct dm_space_map bootstrap_ops = {
.new_block = sm_bootstrap_new_block,
.commit = sm_bootstrap_commit,
.root_size = sm_bootstrap_root_size,
- .copy_root = sm_bootstrap_copy_root
+ .copy_root = sm_bootstrap_copy_root,
+ .register_threshold_callback = NULL
};
/*----------------------------------------------------------------*/
+static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
+{
+ int r, i;
+ enum allocation_event ev;
+ struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
+ dm_block_t old_len = smm->ll.nr_blocks;
+
+ /*
+ * Flick into a mode where all blocks get allocated in the new area.
+ */
+ smm->begin = old_len;
+ memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
+
+ /*
+ * Extend.
+ */
+ r = sm_ll_extend(&smm->ll, extra_blocks);
+
+ /*
+ * Switch back to normal behaviour.
+ */
+ memcpy(&smm->sm, &ops, sizeof(smm->sm));
+ for (i = old_len; !r && i < smm->begin; i++)
+ r = sm_ll_inc(&smm->ll, i, &ev);
+
+ return r;
+}
+
+/*----------------------------------------------------------------*/
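sm_metadata_extend() has to allocate new index blocks while the space map is itself being grown, which would otherwise recurse. The patch sidesteps that by temporarily memcpy()ing the bootstrap_ops vtable over smm->sm (so allocations come linearly from the new area via smm->begin), calling sm_ll_extend(), restoring the normal ops, and then incrementing the reference count of every block the bootstrap phase consumed. The "swap the ops table for the duration of one operation" idiom looks like this in isolation (illustrative names only):

    #include <stdio.h>
    #include <string.h>

    struct ops { const char *(*mode)(void); };

    static const char *normal_mode(void)    { return "normal"; }
    static const char *bootstrap_mode(void) { return "bootstrap"; }

    static const struct ops normal_ops    = { normal_mode };
    static const struct ops bootstrap_ops = { bootstrap_mode };

    struct object {
        struct ops ops;                 /* embedded vtable, like smm->sm */
    };

    static void extend(struct object *o)
    {
        /* flick into bootstrap behaviour for the duration of the resize */
        memcpy(&o->ops, &bootstrap_ops, sizeof(o->ops));
        printf("extending in %s mode\n", o->ops.mode());

        /* ... do the work that must not recurse into itself ... */

        memcpy(&o->ops, &normal_ops, sizeof(o->ops));   /* switch back */
    }

    int main(void)
    {
        struct object o;

        memcpy(&o.ops, &normal_ops, sizeof(o.ops));
        extend(&o);
        printf("back in %s mode\n", o.ops.mode());
        return 0;
    }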
+
struct dm_space_map *dm_sm_metadata_init(void)
{
struct sm_metadata *smm;
@@ -549,6 +650,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
smm->recursion_count = 0;
smm->allocated_this_transaction = 0;
smm->nr_uncommitted = 0;
+ threshold_init(&smm->threshold);
memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
@@ -590,6 +692,7 @@ int dm_sm_metadata_open(struct dm_space_map *sm,
smm->recursion_count = 0;
smm->allocated_this_transaction = 0;
smm->nr_uncommitted = 0;
+ threshold_init(&smm->threshold);
memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
return 0;
diff --git a/drivers/md/persistent-data/dm-space-map.h b/drivers/md/persistent-data/dm-space-map.h
index 1cbfc6b1638a..3e6d1153b7c4 100644
--- a/drivers/md/persistent-data/dm-space-map.h
+++ b/drivers/md/persistent-data/dm-space-map.h
@@ -9,6 +9,8 @@
#include "dm-block-manager.h"
+typedef void (*dm_sm_threshold_fn)(void *context);
+
/*
* struct dm_space_map keeps a record of how many times each block in a device
* is referenced. It needs to be fixed on disk as part of the transaction.
@@ -59,6 +61,15 @@ struct dm_space_map {
*/
int (*root_size)(struct dm_space_map *sm, size_t *result);
int (*copy_root)(struct dm_space_map *sm, void *copy_to_here_le, size_t len);
+
+ /*
+ * You can register one threshold callback which is edge-triggered
+ * when the free space in the space map drops below the threshold.
+ */
+ int (*register_threshold_callback)(struct dm_space_map *sm,
+ dm_block_t threshold,
+ dm_sm_threshold_fn fn,
+ void *context);
};
/*----------------------------------------------------------------*/
@@ -131,4 +142,16 @@ static inline int dm_sm_copy_root(struct dm_space_map *sm, void *copy_to_here_le
return sm->copy_root(sm, copy_to_here_le, len);
}
+static inline int dm_sm_register_threshold_callback(struct dm_space_map *sm,
+ dm_block_t threshold,
+ dm_sm_threshold_fn fn,
+ void *context)
+{
+ if (sm->register_threshold_callback)
+ return sm->register_threshold_callback(sm, threshold, fn, context);
+
+ return -EINVAL;
+}
+
+
#endif /* _LINUX_DM_SPACE_MAP_H */
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 0505452de8d6..fcf65e512cf5 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -502,11 +502,11 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
{
if (likely(is_power_of_2(chunk_sects))) {
return chunk_sects >= ((bio->bi_sector & (chunk_sects-1))
- + (bio->bi_size >> 9));
+ + bio_sectors(bio));
} else{
sector_t sector = bio->bi_sector;
return chunk_sects >= (sector_div(sector, chunk_sects)
- + (bio->bi_size >> 9));
+ + bio_sectors(bio));
}
}
@@ -527,8 +527,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
sector_t sector = bio->bi_sector;
struct bio_pair *bp;
/* Sanity check -- queue functions should prevent this happening */
- if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
- bio->bi_idx != 0)
+ if (bio_segments(bio) > 1)
goto bad_map;
/* This is a one page bio that upper layers
* refuse to split for us, so we need to split it.
@@ -567,7 +566,7 @@ bad_map:
printk("md/raid0:%s: make_request bug: can't convert block across chunks"
" or bigger than %dk %llu %d\n",
mdname(mddev), chunk_sects / 2,
- (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
+ (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);
bio_io_error(bio);
return;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index fd86b372692d..55951182af73 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -92,7 +92,6 @@ static void r1bio_pool_free(void *r1_bio, void *data)
static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
{
struct pool_info *pi = data;
- struct page *page;
struct r1bio *r1_bio;
struct bio *bio;
int i, j;
@@ -122,14 +121,10 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
j = 1;
while(j--) {
bio = r1_bio->bios[j];
- for (i = 0; i < RESYNC_PAGES; i++) {
- page = alloc_page(gfp_flags);
- if (unlikely(!page))
- goto out_free_pages;
+ bio->bi_vcnt = RESYNC_PAGES;
- bio->bi_io_vec[i].bv_page = page;
- bio->bi_vcnt = i+1;
- }
+ if (bio_alloc_pages(bio, gfp_flags))
+ goto out_free_bio;
}
/* If not user-requests, copy the page pointers to all bios */
if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
@@ -143,11 +138,6 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
return r1_bio;
-out_free_pages:
- for (j=0 ; j < pi->raid_disks; j++)
- for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++)
- put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
- j = -1;
out_free_bio:
while (++j < pi->raid_disks)
bio_put(r1_bio->bios[j]);
@@ -267,7 +257,7 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
(bio_data_dir(bio) == WRITE) ? "write" : "read",
(unsigned long long) bio->bi_sector,
(unsigned long long) bio->bi_sector +
- (bio->bi_size >> 9) - 1);
+ bio_sectors(bio) - 1);
call_bio_endio(r1_bio);
}
@@ -458,7 +448,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
" %llu-%llu\n",
(unsigned long long) mbio->bi_sector,
(unsigned long long) mbio->bi_sector +
- (mbio->bi_size >> 9) - 1);
+ bio_sectors(mbio) - 1);
call_bio_endio(r1_bio);
}
}
@@ -925,7 +915,7 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
if (unlikely(!bvecs))
return;
- bio_for_each_segment(bvec, bio, i) {
+ bio_for_each_segment_all(bvec, bio, i) {
bvecs[i] = *bvec;
bvecs[i].bv_page = alloc_page(GFP_NOIO);
if (unlikely(!bvecs[i].bv_page))
@@ -981,7 +971,12 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
bio->bi_next = NULL;
- generic_make_request(bio);
+ if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
+ /* Just ignore it */
+ bio_endio(bio, 0);
+ else
+ generic_make_request(bio);
bio = next;
}
kfree(plug);
@@ -1018,7 +1013,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
md_write_start(mddev, bio); /* wait on superblock update early */
if (bio_data_dir(bio) == WRITE &&
- bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo &&
+ bio_end_sector(bio) > mddev->suspend_lo &&
bio->bi_sector < mddev->suspend_hi) {
/* As the suspend_* range is controlled by
* userspace, we want an interruptible
@@ -1029,7 +1024,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
flush_signals(current);
prepare_to_wait(&conf->wait_barrier,
&w, TASK_INTERRUPTIBLE);
- if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo ||
+ if (bio_end_sector(bio) <= mddev->suspend_lo ||
bio->bi_sector >= mddev->suspend_hi)
break;
schedule();
@@ -1049,7 +1044,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
r1_bio->master_bio = bio;
- r1_bio->sectors = bio->bi_size >> 9;
+ r1_bio->sectors = bio_sectors(bio);
r1_bio->state = 0;
r1_bio->mddev = mddev;
r1_bio->sector = bio->bi_sector;
@@ -1127,7 +1122,7 @@ read_again:
r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
r1_bio->master_bio = bio;
- r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+ r1_bio->sectors = bio_sectors(bio) - sectors_handled;
r1_bio->state = 0;
r1_bio->mddev = mddev;
r1_bio->sector = bio->bi_sector + sectors_handled;
@@ -1284,14 +1279,10 @@ read_again:
struct bio_vec *bvec;
int j;
- /* Yes, I really want the '__' version so that
- * we clear any unused pointer in the io_vec, rather
- * than leave them unchanged. This is important
- * because when we come to free the pages, we won't
- * know the original bi_idx, so we just free
- * them all
+ /*
+ * We trimmed the bio, so _all is legit
*/
- __bio_for_each_segment(bvec, mbio, j, 0)
+ bio_for_each_segment_all(bvec, mbio, j)
bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
atomic_inc(&r1_bio->behind_remaining);
@@ -1329,14 +1320,14 @@ read_again:
/* Mustn't call r1_bio_write_done before this next test,
* as it could result in the bio being freed.
*/
- if (sectors_handled < (bio->bi_size >> 9)) {
+ if (sectors_handled < bio_sectors(bio)) {
r1_bio_write_done(r1_bio);
/* We need another r1_bio. It has already been counted
* in bio->bi_phys_segments
*/
r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
r1_bio->master_bio = bio;
- r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+ r1_bio->sectors = bio_sectors(bio) - sectors_handled;
r1_bio->state = 0;
r1_bio->mddev = mddev;
r1_bio->sector = bio->bi_sector + sectors_handled;
@@ -1862,7 +1853,7 @@ static int process_checks(struct r1bio *r1_bio)
struct bio *sbio = r1_bio->bios[i];
int size;
- if (r1_bio->bios[i]->bi_end_io != end_sync_read)
+ if (sbio->bi_end_io != end_sync_read)
continue;
if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
@@ -1887,16 +1878,15 @@ static int process_checks(struct r1bio *r1_bio)
continue;
}
/* fixup the bio for reuse */
+ bio_reset(sbio);
sbio->bi_vcnt = vcnt;
sbio->bi_size = r1_bio->sectors << 9;
- sbio->bi_idx = 0;
- sbio->bi_phys_segments = 0;
- sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
- sbio->bi_flags |= 1 << BIO_UPTODATE;
- sbio->bi_next = NULL;
sbio->bi_sector = r1_bio->sector +
conf->mirrors[i].rdev->data_offset;
sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+ sbio->bi_end_io = end_sync_read;
+ sbio->bi_private = r1_bio;
+
size = sbio->bi_size;
for (j = 0; j < vcnt ; j++) {
struct bio_vec *bi;
@@ -1907,10 +1897,9 @@ static int process_checks(struct r1bio *r1_bio)
else
bi->bv_len = size;
size -= PAGE_SIZE;
- memcpy(page_address(bi->bv_page),
- page_address(pbio->bi_io_vec[j].bv_page),
- PAGE_SIZE);
}
+
+ bio_copy_data(sbio, pbio);
}
return 0;
}
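Instead of resetting a handful of bio fields by hand and memcpy()ing page contents one PAGE_SIZE chunk at a time, the resync check path now uses bio_reset() (re-initialise a bio while keeping its bio_vec array) and bio_copy_data() (copy data segment by segment from a source bio into a destination bio). A userspace analogue of segment-wise copying between two buffer vectors, just to show the shape of what bio_copy_data() does with bio_vec pages:

    #include <stdio.h>
    #include <string.h>

    struct seg { char *buf; unsigned len; };

    /* Copy segment by segment, coping with mismatched segment sizes. */
    static void copy_segs(struct seg *dst, int nd, const struct seg *src, int ns)
    {
        int di = 0, si = 0;
        unsigned doff = 0, soff = 0;

        while (di < nd && si < ns) {
            unsigned n = dst[di].len - doff;

            if (src[si].len - soff < n)
                n = src[si].len - soff;
            memcpy(dst[di].buf + doff, src[si].buf + soff, n);
            doff += n;
            soff += n;
            if (doff == dst[di].len) { di++; doff = 0; }
            if (soff == src[si].len) { si++; soff = 0; }
        }
    }

    int main(void)
    {
        char a[] = "abcd", b[5] = { 0 };
        struct seg src = { a, 4 }, dst = { b, 4 };

        copy_segs(&dst, 1, &src, 1);
        printf("%.4s\n", b);
        return 0;
    }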
@@ -1947,7 +1936,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
wbio->bi_rw = WRITE;
wbio->bi_end_io = end_sync_write;
atomic_inc(&r1_bio->remaining);
- md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
+ md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
generic_make_request(wbio);
}
@@ -2059,32 +2048,11 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
}
}
-static void bi_complete(struct bio *bio, int error)
-{
- complete((struct completion *)bio->bi_private);
-}
-
-static int submit_bio_wait(int rw, struct bio *bio)
-{
- struct completion event;
- rw |= REQ_SYNC;
-
- init_completion(&event);
- bio->bi_private = &event;
- bio->bi_end_io = bi_complete;
- submit_bio(rw, bio);
- wait_for_completion(&event);
-
- return test_bit(BIO_UPTODATE, &bio->bi_flags);
-}
-
static int narrow_write_error(struct r1bio *r1_bio, int i)
{
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
struct md_rdev *rdev = conf->mirrors[i].rdev;
- int vcnt, idx;
- struct bio_vec *vec;
/* bio has the data to be written to device 'i' where
* we just recently had a write error.
@@ -2112,30 +2080,32 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
& ~(sector_t)(block_sectors - 1))
- sector;
- if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
- vcnt = r1_bio->behind_page_count;
- vec = r1_bio->behind_bvecs;
- idx = 0;
- while (vec[idx].bv_page == NULL)
- idx++;
- } else {
- vcnt = r1_bio->master_bio->bi_vcnt;
- vec = r1_bio->master_bio->bi_io_vec;
- idx = r1_bio->master_bio->bi_idx;
- }
while (sect_to_write) {
struct bio *wbio;
if (sectors > sect_to_write)
sectors = sect_to_write;
/* Write at 'sector' for 'sectors'*/
- wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
- memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
- wbio->bi_sector = r1_bio->sector;
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+ unsigned vcnt = r1_bio->behind_page_count;
+ struct bio_vec *vec = r1_bio->behind_bvecs;
+
+ while (!vec->bv_page) {
+ vec++;
+ vcnt--;
+ }
+
+ wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
+ memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
+
+ wbio->bi_vcnt = vcnt;
+ } else {
+ wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
+ }
+
wbio->bi_rw = WRITE;
- wbio->bi_vcnt = vcnt;
+ wbio->bi_sector = r1_bio->sector;
wbio->bi_size = r1_bio->sectors << 9;
- wbio->bi_idx = idx;
md_trim_bio(wbio, sector - r1_bio->sector, sectors);
wbio->bi_sector += rdev->data_offset;
@@ -2284,8 +2254,7 @@ read_more:
r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
r1_bio->master_bio = mbio;
- r1_bio->sectors = (mbio->bi_size >> 9)
- - sectors_handled;
+ r1_bio->sectors = bio_sectors(mbio) - sectors_handled;
r1_bio->state = 0;
set_bit(R1BIO_ReadError, &r1_bio->state);
r1_bio->mddev = mddev;
@@ -2459,18 +2428,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
for (i = 0; i < conf->raid_disks * 2; i++) {
struct md_rdev *rdev;
bio = r1_bio->bios[i];
-
- /* take from bio_init */
- bio->bi_next = NULL;
- bio->bi_flags &= ~(BIO_POOL_MASK-1);
- bio->bi_flags |= 1 << BIO_UPTODATE;
- bio->bi_rw = READ;
- bio->bi_vcnt = 0;
- bio->bi_idx = 0;
- bio->bi_phys_segments = 0;
- bio->bi_size = 0;
- bio->bi_end_io = NULL;
- bio->bi_private = NULL;
+ bio_reset(bio);
rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev == NULL ||
@@ -2901,6 +2859,7 @@ static int stop(struct mddev *mddev)
if (conf->r1bio_pool)
mempool_destroy(conf->r1bio_pool);
kfree(conf->mirrors);
+ safe_put_page(conf->tmppage);
kfree(conf->poolinfo);
kfree(conf);
mddev->private = NULL;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 77b562d18a90..59d4daa5f4c7 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1133,7 +1133,12 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
bio->bi_next = NULL;
- generic_make_request(bio);
+ if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
+ /* Just ignore it */
+ bio_endio(bio, 0);
+ else
+ generic_make_request(bio);
bio = next;
}
kfree(plug);
@@ -1169,14 +1174,13 @@ static void make_request(struct mddev *mddev, struct bio * bio)
/* If this request crosses a chunk boundary, we need to
* split it. This will only happen for 1 PAGE (or less) requests.
*/
- if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9)
+ if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio)
> chunk_sects
&& (conf->geo.near_copies < conf->geo.raid_disks
|| conf->prev.near_copies < conf->prev.raid_disks))) {
struct bio_pair *bp;
/* Sanity check -- queue functions should prevent this happening */
- if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
- bio->bi_idx != 0)
+ if (bio_segments(bio) > 1)
goto bad_map;
/* This is a one page bio that upper layers
* refuse to split for us, so we need to split it.
@@ -1209,7 +1213,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
bad_map:
printk("md/raid10:%s: make_request bug: can't convert block across chunks"
" or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
- (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
+ (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);
bio_io_error(bio);
return;
@@ -1224,7 +1228,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
*/
wait_barrier(conf);
- sectors = bio->bi_size >> 9;
+ sectors = bio_sectors(bio);
while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio->bi_sector < conf->reshape_progress &&
bio->bi_sector + sectors > conf->reshape_progress) {
@@ -1326,8 +1330,7 @@ read_again:
r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
r10_bio->master_bio = bio;
- r10_bio->sectors = ((bio->bi_size >> 9)
- - sectors_handled);
+ r10_bio->sectors = bio_sectors(bio) - sectors_handled;
r10_bio->state = 0;
r10_bio->mddev = mddev;
r10_bio->sector = bio->bi_sector + sectors_handled;
@@ -1569,7 +1572,7 @@ retry_write:
* after checking if we need to go around again.
*/
- if (sectors_handled < (bio->bi_size >> 9)) {
+ if (sectors_handled < bio_sectors(bio)) {
one_write_done(r10_bio);
/* We need another r10_bio. It has already been counted
* in bio->bi_phys_segments.
@@ -1577,7 +1580,7 @@ retry_write:
r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
r10_bio->master_bio = bio;
- r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+ r10_bio->sectors = bio_sectors(bio) - sectors_handled;
r10_bio->mddev = mddev;
r10_bio->sector = bio->bi_sector + sectors_handled;
@@ -2079,13 +2082,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
* First we need to fixup bv_offset, bv_len and
* bi_vecs, as the read request might have corrupted these
*/
+ bio_reset(tbio);
+
tbio->bi_vcnt = vcnt;
tbio->bi_size = r10_bio->sectors << 9;
- tbio->bi_idx = 0;
- tbio->bi_phys_segments = 0;
- tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
- tbio->bi_flags |= 1 << BIO_UPTODATE;
- tbio->bi_next = NULL;
tbio->bi_rw = WRITE;
tbio->bi_private = r10_bio;
tbio->bi_sector = r10_bio->devs[i].addr;
@@ -2103,7 +2103,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
d = r10_bio->devs[i].devnum;
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
atomic_inc(&r10_bio->remaining);
- md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
+ md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
@@ -2128,7 +2128,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
d = r10_bio->devs[i].devnum;
atomic_inc(&r10_bio->remaining);
md_sync_acct(conf->mirrors[d].replacement->bdev,
- tbio->bi_size >> 9);
+ bio_sectors(tbio));
generic_make_request(tbio);
}
@@ -2254,13 +2254,13 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
wbio2 = r10_bio->devs[1].repl_bio;
if (wbio->bi_end_io) {
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
- md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
+ md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
generic_make_request(wbio);
}
if (wbio2 && wbio2->bi_end_io) {
atomic_inc(&conf->mirrors[d].replacement->nr_pending);
md_sync_acct(conf->mirrors[d].replacement->bdev,
- wbio2->bi_size >> 9);
+ bio_sectors(wbio2));
generic_make_request(wbio2);
}
}
@@ -2531,25 +2531,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
}
}
-static void bi_complete(struct bio *bio, int error)
-{
- complete((struct completion *)bio->bi_private);
-}
-
-static int submit_bio_wait(int rw, struct bio *bio)
-{
- struct completion event;
- rw |= REQ_SYNC;
-
- init_completion(&event);
- bio->bi_private = &event;
- bio->bi_end_io = bi_complete;
- submit_bio(rw, bio);
- wait_for_completion(&event);
-
- return test_bit(BIO_UPTODATE, &bio->bi_flags);
-}
-
static int narrow_write_error(struct r10bio *r10_bio, int i)
{
struct bio *bio = r10_bio->master_bio;
@@ -2690,8 +2671,7 @@ read_more:
r10_bio = mempool_alloc(conf->r10bio_pool,
GFP_NOIO);
r10_bio->master_bio = mbio;
- r10_bio->sectors = (mbio->bi_size >> 9)
- - sectors_handled;
+ r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
r10_bio->state = 0;
set_bit(R10BIO_ReadError,
&r10_bio->state);
@@ -2913,6 +2893,22 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
if (init_resync(conf))
return 0;
+ /*
+ * Allow skipping a full rebuild for incremental assembly
+ * of a clean array, like RAID1 does.
+ */
+ if (mddev->bitmap == NULL &&
+ mddev->recovery_cp == MaxSector &&
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+ conf->fullsync == 0) {
+ *skipped = 1;
+ max_sector = mddev->dev_sectors;
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ max_sector = mddev->resync_max_sectors;
+ return max_sector - sector_nr;
+ }
+
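This mirrors what RAID1 already does for incremental assembly: if the array has no write-intent bitmap, recovery_cp says the whole array is clean (MaxSector), the user did not explicitly request a check or repair, and nothing forced a fullsync, the resync request is reported as entirely skipped. The early-exit test condensed into a predicate (illustrative only, not the kernel function):

    #include <stdbool.h>
    #include <stdio.h>

    #define MaxSector (~0ULL)

    static bool can_skip_resync(bool have_bitmap, unsigned long long recovery_cp,
                                bool user_requested, bool fullsync)
    {
        return !have_bitmap && recovery_cp == MaxSector &&
               !user_requested && !fullsync;
    }

    int main(void)
    {
        printf("%d\n", can_skip_resync(false, MaxSector, false, false)); /* 1 */
        printf("%d\n", can_skip_resync(true, MaxSector, false, false));  /* 0 */
        return 0;
    }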
skipped:
max_sector = mddev->dev_sectors;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
@@ -3112,6 +3108,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
}
}
bio = r10_bio->devs[0].bio;
+ bio_reset(bio);
bio->bi_next = biolist;
biolist = bio;
bio->bi_private = r10_bio;
@@ -3136,6 +3133,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
rdev = mirror->rdev;
if (!test_bit(In_sync, &rdev->flags)) {
bio = r10_bio->devs[1].bio;
+ bio_reset(bio);
bio->bi_next = biolist;
biolist = bio;
bio->bi_private = r10_bio;
@@ -3164,6 +3162,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
if (rdev == NULL || bio == NULL ||
test_bit(Faulty, &rdev->flags))
break;
+ bio_reset(bio);
bio->bi_next = biolist;
biolist = bio;
bio->bi_private = r10_bio;
@@ -3262,7 +3261,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
r10_bio->devs[i].repl_bio->bi_end_io = NULL;
bio = r10_bio->devs[i].bio;
- bio->bi_end_io = NULL;
+ bio_reset(bio);
clear_bit(BIO_UPTODATE, &bio->bi_flags);
if (conf->mirrors[d].rdev == NULL ||
test_bit(Faulty, &conf->mirrors[d].rdev->flags))
@@ -3299,6 +3298,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
/* Need to set up for writing to the replacement */
bio = r10_bio->devs[i].repl_bio;
+ bio_reset(bio);
clear_bit(BIO_UPTODATE, &bio->bi_flags);
sector = r10_bio->devs[i].addr;
@@ -3332,17 +3332,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
}
}
- for (bio = biolist; bio ; bio=bio->bi_next) {
-
- bio->bi_flags &= ~(BIO_POOL_MASK - 1);
- if (bio->bi_end_io)
- bio->bi_flags |= 1 << BIO_UPTODATE;
- bio->bi_vcnt = 0;
- bio->bi_idx = 0;
- bio->bi_phys_segments = 0;
- bio->bi_size = 0;
- }
-
nr_sectors = 0;
if (sector_nr + max_sync < max_sector)
max_sector = sector_nr + max_sync;
@@ -3810,6 +3799,7 @@ static int stop(struct mddev *mddev)
if (conf->r10bio_pool)
mempool_destroy(conf->r10bio_pool);
+ safe_put_page(conf->tmppage);
kfree(conf->mirrors);
kfree(conf);
mddev->private = NULL;
@@ -4389,7 +4379,6 @@ read_more:
read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
read_bio->bi_flags |= 1 << BIO_UPTODATE;
read_bio->bi_vcnt = 0;
- read_bio->bi_idx = 0;
read_bio->bi_size = 0;
r10_bio->master_bio = read_bio;
r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
@@ -4413,17 +4402,14 @@ read_more:
}
if (!rdev2 || test_bit(Faulty, &rdev2->flags))
continue;
+
+ bio_reset(b);
b->bi_bdev = rdev2->bdev;
b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
b->bi_private = r10_bio;
b->bi_end_io = end_reshape_write;
b->bi_rw = WRITE;
- b->bi_flags &= ~(BIO_POOL_MASK - 1);
- b->bi_flags |= 1 << BIO_UPTODATE;
b->bi_next = blist;
- b->bi_vcnt = 0;
- b->bi_idx = 0;
- b->bi_size = 0;
blist = b;
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3ee2912889e7..9359828ffe26 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -90,7 +90,7 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
*/
static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
{
- int sectors = bio->bi_size >> 9;
+ int sectors = bio_sectors(bio);
if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
return bio->bi_next;
else
@@ -184,6 +184,8 @@ static void return_io(struct bio *return_bi)
return_bi = bi->bi_next;
bi->bi_next = NULL;
bi->bi_size = 0;
+ trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
+ bi, 0);
bio_endio(bi, 0);
bi = return_bi;
}
@@ -567,14 +569,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
bi = &sh->dev[i].req;
rbi = &sh->dev[i].rreq; /* For writing to replacement */
- bi->bi_rw = rw;
- rbi->bi_rw = rw;
- if (rw & WRITE) {
- bi->bi_end_io = raid5_end_write_request;
- rbi->bi_end_io = raid5_end_write_request;
- } else
- bi->bi_end_io = raid5_end_read_request;
-
rcu_read_lock();
rrdev = rcu_dereference(conf->disks[i].replacement);
smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
@@ -649,7 +643,14 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
set_bit(STRIPE_IO_STARTED, &sh->state);
+ bio_reset(bi);
bi->bi_bdev = rdev->bdev;
+ bi->bi_rw = rw;
+ bi->bi_end_io = (rw & WRITE)
+ ? raid5_end_write_request
+ : raid5_end_read_request;
+ bi->bi_private = sh;
+
pr_debug("%s: for %llu schedule op %ld on disc %d\n",
__func__, (unsigned long long)sh->sector,
bi->bi_rw, i);
@@ -663,17 +664,16 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
bi->bi_rw |= REQ_FLUSH;
- bi->bi_flags = 1 << BIO_UPTODATE;
- bi->bi_idx = 0;
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
bi->bi_io_vec[0].bv_offset = 0;
bi->bi_size = STRIPE_SIZE;
- bi->bi_next = NULL;
if (rrdev)
set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
- trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
- bi, disk_devt(conf->mddev->gendisk),
- sh->dev[i].sector);
+
+ if (conf->mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
+ bi, disk_devt(conf->mddev->gendisk),
+ sh->dev[i].sector);
generic_make_request(bi);
}
if (rrdev) {
@@ -683,7 +683,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
set_bit(STRIPE_IO_STARTED, &sh->state);
+ bio_reset(rbi);
rbi->bi_bdev = rrdev->bdev;
+ rbi->bi_rw = rw;
+ BUG_ON(!(rw & WRITE));
+ rbi->bi_end_io = raid5_end_write_request;
+ rbi->bi_private = sh;
+
pr_debug("%s: for %llu schedule op %ld on "
"replacement disc %d\n",
__func__, (unsigned long long)sh->sector,
@@ -695,15 +701,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
else
rbi->bi_sector = (sh->sector
+ rrdev->data_offset);
- rbi->bi_flags = 1 << BIO_UPTODATE;
- rbi->bi_idx = 0;
rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
rbi->bi_io_vec[0].bv_offset = 0;
rbi->bi_size = STRIPE_SIZE;
- rbi->bi_next = NULL;
- trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
- rbi, disk_devt(conf->mddev->gendisk),
- sh->dev[i].sector);
+ if (conf->mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
+ rbi, disk_devt(conf->mddev->gendisk),
+ sh->dev[i].sector);
generic_make_request(rbi);
}
if (!rdev && !rrdev) {
@@ -1882,8 +1886,15 @@ static void raid5_end_write_request(struct bio *bi, int error)
&rdev->mddev->recovery);
} else if (is_badblock(rdev, sh->sector,
STRIPE_SECTORS,
- &first_bad, &bad_sectors))
+ &first_bad, &bad_sectors)) {
set_bit(R5_MadeGood, &sh->dev[i].flags);
+ if (test_bit(R5_ReadError, &sh->dev[i].flags))
+ /* That was a successful write so make
+ * sure it looks like we already did
+ * a re-write.
+ */
+ set_bit(R5_ReWrite, &sh->dev[i].flags);
+ }
}
rdev_dec_pending(rdev, conf->mddev);
@@ -2280,17 +2291,6 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
int level = conf->level;
if (rcw) {
- /* if we are not expanding this is a proper write request, and
- * there will be bios with new data to be drained into the
- * stripe cache
- */
- if (!expand) {
- sh->reconstruct_state = reconstruct_state_drain_run;
- set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
- } else
- sh->reconstruct_state = reconstruct_state_run;
-
- set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
@@ -2303,6 +2303,21 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
s->locked++;
}
}
+ /* if we are not expanding this is a proper write request, and
+ * there will be bios with new data to be drained into the
+ * stripe cache
+ */
+ if (!expand) {
+ if (!s->locked)
+ /* False alarm, nothing to do */
+ return;
+ sh->reconstruct_state = reconstruct_state_drain_run;
+ set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+ } else
+ sh->reconstruct_state = reconstruct_state_run;
+
+ set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
+
if (s->locked + conf->max_degraded == disks)
if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
atomic_inc(&conf->pending_full_writes);
@@ -2311,11 +2326,6 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
- sh->reconstruct_state = reconstruct_state_prexor_drain_run;
- set_bit(STRIPE_OP_PREXOR, &s->ops_request);
- set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
- set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
-
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (i == pd_idx)
@@ -2330,6 +2340,13 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
s->locked++;
}
}
+ if (!s->locked)
+ /* False alarm - nothing to do */
+ return;
+ sh->reconstruct_state = reconstruct_state_prexor_drain_run;
+ set_bit(STRIPE_OP_PREXOR, &s->ops_request);
+ set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
+ set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
}
/* keep the parity disk(s) locked while asynchronous operations
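The reordering in the two hunks above moves the reconstruct-state and ops-bit setup after the device walk, so the function can return early when nothing was actually locked ("false alarm"). A small self-contained sketch of that count-then-commit shape, using toy names only:

/* Minimal standalone model: first walk the devices and count what would be
 * locked, and only if the count is non-zero publish the new state and the
 * op request bits.  Purely illustrative. */
#include <stdbool.h>
#include <stdio.h>

enum toy_state { TOY_IDLE, TOY_DRAIN_RUN };

struct toy_dev { bool towrite; bool locked; };

static bool toy_schedule(struct toy_dev *dev, int ndisks,
			 enum toy_state *state, unsigned *ops)
{
	int locked = 0;

	for (int i = 0; i < ndisks; i++)
		if (dev[i].towrite) {
			dev[i].locked = true;
			locked++;
		}

	if (!locked)		/* false alarm - nothing to do */
		return false;

	*state = TOY_DRAIN_RUN;	/* commit only after the count is known */
	*ops |= 1u << 0;	/* "BIODRAIN" */
	*ops |= 1u << 1;	/* "RECONSTRUCT" */
	return true;
}

int main(void)
{
	struct toy_dev devs[3] = { {0, 0}, {0, 0}, {0, 0} };
	enum toy_state st = TOY_IDLE;
	unsigned ops = 0;

	printf("scheduled=%d state=%d ops=%#x\n",
	       toy_schedule(devs, 3, &st, &ops), st, ops);
	devs[1].towrite = true;
	printf("scheduled=%d state=%d ops=%#x\n",
	       toy_schedule(devs, 3, &st, &ops), st, ops);
	return 0;
}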
@@ -2384,11 +2401,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
} else
bip = &sh->dev[dd_idx].toread;
while (*bip && (*bip)->bi_sector < bi->bi_sector) {
- if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
+ if (bio_end_sector(*bip) > bi->bi_sector)
goto overlap;
bip = & (*bip)->bi_next;
}
- if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
+ if (*bip && (*bip)->bi_sector < bio_end_sector(bi))
goto overlap;
BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
@@ -2404,8 +2421,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
bi && bi->bi_sector <= sector;
bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
- if (bi->bi_sector + (bi->bi_size>>9) >= sector)
- sector = bi->bi_sector + (bi->bi_size>>9);
+ if (bio_end_sector(bi) >= sector)
+ sector = bio_end_sector(bi);
}
if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
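bio_end_sector() used above is a helper for the open-coded "start sector + (size >> 9)" arithmetic that the hunks remove throughout this file. A tiny standalone check of that equivalence, using a toy struct rather than the real struct bio:

/* Standalone sketch: models how an end-sector helper replaces the open-coded
 * "bi_sector + (bi_size >> 9)" expression.  Names are illustrative only. */
#include <assert.h>
#include <stdint.h>

struct toy_bio {
	uint64_t bi_sector;	/* start, in 512-byte sectors */
	uint32_t bi_size;	/* length, in bytes */
};

static uint32_t toy_bio_sectors(const struct toy_bio *b)
{
	return b->bi_size >> 9;
}

static uint64_t toy_bio_end_sector(const struct toy_bio *b)
{
	return b->bi_sector + toy_bio_sectors(b);
}

int main(void)
{
	struct toy_bio b = { .bi_sector = 1024, .bi_size = 8 * 512 };

	/* identical to the open-coded form the patch removes */
	assert(toy_bio_end_sector(&b) == b.bi_sector + (b.bi_size >> 9));
	return 0;
}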
@@ -2564,6 +2581,8 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
int i;
clear_bit(STRIPE_SYNCING, &sh->state);
+ if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
+ wake_up(&conf->wait_for_overlap);
s->syncing = 0;
s->replacing = 0;
/* There is nothing more to do for sync/check/repair.
@@ -2737,6 +2756,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
{
int i;
struct r5dev *dev;
+ int discard_pending = 0;
for (i = disks; i--; )
if (sh->dev[i].written) {
@@ -2765,9 +2785,23 @@ static void handle_stripe_clean_event(struct r5conf *conf,
STRIPE_SECTORS,
!test_bit(STRIPE_DEGRADED, &sh->state),
0);
- }
- } else if (test_bit(R5_Discard, &sh->dev[i].flags))
- clear_bit(R5_Discard, &sh->dev[i].flags);
+ } else if (test_bit(R5_Discard, &dev->flags))
+ discard_pending = 1;
+ }
+ if (!discard_pending &&
+ test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
+ clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+ clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
+ if (sh->qd_idx >= 0) {
+ clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+ clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
+ }
+ /* now that discard is done we can proceed with any sync */
+ clear_bit(STRIPE_DISCARD, &sh->state);
+ if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
+ set_bit(STRIPE_HANDLE, &sh->state);
+
+ }
if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
if (atomic_dec_and_test(&conf->pending_full_writes))
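The discard_pending logic above keeps the parity slots marked R5_Discard until every data slot has finished its discard, and only then clears STRIPE_DISCARD and re-arms a pending sync. A compact standalone model of that gating, with illustrative names:

/* Standalone model: the parity slot keeps its "discard issued" marker until
 * no data slot still has one outstanding, and only then is a deferred sync
 * request re-armed.  Purely illustrative. */
#include <stdbool.h>
#include <assert.h>

#define NDISKS 4
#define PD_IDX 3

struct toy_stripe {
	bool discard[NDISKS];	/* discard still outstanding per slot */
	bool sync_requested;
	bool handle;		/* "go back through handle_stripe" */
};

static void toy_clean_event(struct toy_stripe *sh)
{
	bool discard_pending = false;

	for (int i = 0; i < NDISKS; i++)
		if (i != PD_IDX && sh->discard[i])
			discard_pending = true;

	if (!discard_pending && sh->discard[PD_IDX]) {
		sh->discard[PD_IDX] = false;	/* discard fully done */
		if (sh->sync_requested)
			sh->handle = true;	/* let the sync proceed now */
	}
}

int main(void)
{
	struct toy_stripe sh = { .discard = { true, false, false, true },
				 .sync_requested = true };

	toy_clean_event(&sh);
	assert(!sh.handle && sh.discard[PD_IDX]);	/* data discard pending */
	sh.discard[0] = false;
	toy_clean_event(&sh);
	assert(sh.handle && !sh.discard[PD_IDX]);	/* gate released */
	return 0;
}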
@@ -2826,8 +2860,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
set_bit(STRIPE_HANDLE, &sh->state);
if (rmw < rcw && rmw > 0) {
/* prefer read-modify-write, but need to get some data */
- blk_add_trace_msg(conf->mddev->queue, "raid5 rmw %llu %d",
- (unsigned long long)sh->sector, rmw);
+ if (conf->mddev->queue)
+ blk_add_trace_msg(conf->mddev->queue,
+ "raid5 rmw %llu %d",
+ (unsigned long long)sh->sector, rmw);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if ((dev->towrite || i == sh->pd_idx) &&
@@ -2877,7 +2913,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
}
}
}
- if (rcw)
+ if (rcw && conf->mddev->queue)
blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
(unsigned long long)sh->sector,
rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
@@ -3417,9 +3453,15 @@ static void handle_stripe(struct stripe_head *sh)
return;
}
- if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
- set_bit(STRIPE_SYNCING, &sh->state);
- clear_bit(STRIPE_INSYNC, &sh->state);
+ if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+ spin_lock(&sh->stripe_lock);
+ /* Cannot process 'sync' concurrently with 'discard' */
+ if (!test_bit(STRIPE_DISCARD, &sh->state) &&
+ test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+ set_bit(STRIPE_SYNCING, &sh->state);
+ clear_bit(STRIPE_INSYNC, &sh->state);
+ }
+ spin_unlock(&sh->stripe_lock);
}
clear_bit(STRIPE_DELAYED, &sh->state);
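The STRIPE_SYNC_REQUESTED handling above only promotes the request to STRIPE_SYNCING when STRIPE_DISCARD is clear, with the whole test-and-transition done under sh->stripe_lock. A plain userspace sketch of that state rule (the lock itself is only noted in a comment, not modelled):

/* Sketch: a sync request is converted to "syncing" only when no discard is
 * marked on the stripe; otherwise it stays pending and is retried later.
 * Illustrative names, no kernel locking primitives. */
#include <assert.h>
#include <stdbool.h>

struct toy_stripe {
	bool sync_requested;
	bool discarding;
	bool syncing;
};

/* caller is assumed to hold the per-stripe lock for the whole test+set */
static void toy_try_start_sync(struct toy_stripe *sh)
{
	if (!sh->discarding && sh->sync_requested) {
		sh->sync_requested = false;
		sh->syncing = true;
	}
	/* otherwise leave the request pending; it is picked up again once
	 * the discard completes (see the clean-event path above) */
}

int main(void)
{
	struct toy_stripe sh = { .sync_requested = true, .discarding = true };

	toy_try_start_sync(&sh);
	assert(!sh.syncing && sh.sync_requested);	/* deferred */
	sh.discarding = false;
	toy_try_start_sync(&sh);
	assert(sh.syncing && !sh.sync_requested);	/* now allowed */
	return 0;
}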
@@ -3579,6 +3621,8 @@ static void handle_stripe(struct stripe_head *sh)
test_bit(STRIPE_INSYNC, &sh->state)) {
md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
clear_bit(STRIPE_SYNCING, &sh->state);
+ if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
+ wake_up(&conf->wait_for_overlap);
}
/* If the failed drives are just a ReadError, then we might need
@@ -3804,7 +3848,7 @@ static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
{
sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
unsigned int chunk_sectors = mddev->chunk_sectors;
- unsigned int bio_sectors = bio->bi_size >> 9;
+ unsigned int bio_sectors = bio_sectors(bio);
if (mddev->new_chunk_sectors < mddev->chunk_sectors)
chunk_sectors = mddev->new_chunk_sectors;
@@ -3878,6 +3922,8 @@ static void raid5_align_endio(struct bio *bi, int error)
rdev_dec_pending(rdev, conf->mddev);
if (!error && uptodate) {
+ trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
+ raid_bi, 0);
bio_endio(raid_bi, 0);
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_stripe);
@@ -3894,7 +3940,7 @@ static int bio_fits_rdev(struct bio *bi)
{
struct request_queue *q = bdev_get_queue(bi->bi_bdev);
- if ((bi->bi_size>>9) > queue_max_sectors(q))
+ if (bio_sectors(bi) > queue_max_sectors(q))
return 0;
blk_recount_segments(q, bi);
if (bi->bi_phys_segments > queue_max_segments(q))
@@ -3941,7 +3987,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
0,
&dd_idx, NULL);
- end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9);
+ end_sector = bio_end_sector(align_bi);
rcu_read_lock();
rdev = rcu_dereference(conf->disks[dd_idx].replacement);
if (!rdev || test_bit(Faulty, &rdev->flags) ||
@@ -3964,7 +4010,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
if (!bio_fits_rdev(align_bi) ||
- is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
+ is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi),
&first_bad, &bad_sectors)) {
/* too big in some way, or has a known bad block */
bio_put(align_bi);
@@ -3982,9 +4028,10 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
atomic_inc(&conf->active_aligned_reads);
spin_unlock_irq(&conf->device_lock);
- trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
- align_bi, disk_devt(mddev->gendisk),
- raid_bio->bi_sector);
+ if (mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
+ align_bi, disk_devt(mddev->gendisk),
+ raid_bio->bi_sector);
generic_make_request(align_bi);
return 1;
} else {
@@ -4078,7 +4125,8 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
}
spin_unlock_irq(&conf->device_lock);
}
- trace_block_unplug(mddev->queue, cnt, !from_schedule);
+ if (mddev->queue)
+ trace_block_unplug(mddev->queue, cnt, !from_schedule);
kfree(cb);
}
@@ -4141,6 +4189,13 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
prepare_to_wait(&conf->wait_for_overlap, &w,
TASK_UNINTERRUPTIBLE);
+ set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
+ if (test_bit(STRIPE_SYNCING, &sh->state)) {
+ release_stripe(sh);
+ schedule();
+ goto again;
+ }
+ clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
spin_lock_irq(&sh->stripe_lock);
for (d = 0; d < conf->raid_disks; d++) {
if (d == sh->pd_idx || d == sh->qd_idx)
@@ -4153,6 +4208,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
goto again;
}
}
+ set_bit(STRIPE_DISCARD, &sh->state);
finish_wait(&conf->wait_for_overlap, &w);
for (d = 0; d < conf->raid_disks; d++) {
if (d == sh->pd_idx || d == sh->qd_idx)
@@ -4216,7 +4272,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
}
logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
- last_sector = bi->bi_sector + (bi->bi_size>>9);
+ last_sector = bio_end_sector(bi);
bi->bi_next = NULL;
bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
@@ -4336,6 +4392,8 @@ static void make_request(struct mddev *mddev, struct bio * bi)
if ( rw == WRITE )
md_write_end(mddev);
+ trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
+ bi, 0);
bio_endio(bi, 0);
}
}
@@ -4620,9 +4678,10 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
*skipped = 1;
return rv;
}
- if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
- !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
- !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
+ if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+ !conf->fullsync &&
+ !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+ sync_blocks >= STRIPE_SECTORS) {
/* we can skip this block, and probably more */
sync_blocks /= STRIPE_SECTORS;
*skipped = 1;
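The reordering above appears intended to keep the bitmap_start_sync() call last in the && chain, presumably so it only runs (and only touches bitmap state) once the cheaper tests have already passed. The short-circuit behaviour it relies on can be shown in a few lines of standalone C with made-up names:

/* Demonstration: C's && stops at the first false operand, so a call with
 * side effects placed last only runs when every earlier test has passed. */
#include <stdio.h>

static int calls;

static int expensive_with_side_effects(void)
{
	calls++;		/* e.g. updates bookkeeping counters */
	return 1;
}

static int check(int requested, int fullsync)
{
	return !requested && !fullsync && expensive_with_side_effects();
}

int main(void)
{
	check(1, 0);	/* short-circuits: side effect skipped */
	check(0, 1);	/* short-circuits: side effect skipped */
	check(0, 0);	/* all guards pass: side effect runs once */
	printf("side-effecting call ran %d time(s)\n", calls);
	return 0;
}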
@@ -4679,7 +4738,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
sector = raid5_compute_sector(conf, logical_sector,
0, &dd_idx, NULL);
- last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
+ last_sector = bio_end_sector(raid_bio);
for (; logical_sector < last_sector;
logical_sector += STRIPE_SECTORS,
@@ -4712,8 +4771,11 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
handled++;
}
remaining = raid5_dec_bi_active_stripes(raid_bio);
- if (remaining == 0)
+ if (remaining == 0) {
+ trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
+ raid_bio, 0);
bio_endio(raid_bio, 0);
+ }
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_stripe);
return handled;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 18b2c4a8a1fd..b0b663b119a8 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -221,10 +221,6 @@ struct stripe_head {
struct stripe_operations {
int target, target2;
enum sum_check_flags zero_sum_result;
- #ifdef CONFIG_MULTICORE_RAID456
- unsigned long request;
- wait_queue_head_t wait_for_ops;
- #endif
} ops;
struct r5dev {
/* rreq and rvec are used for the replacement device when
@@ -323,6 +319,7 @@ enum {
STRIPE_COMPUTE_RUN,
STRIPE_OPS_REQ_PENDING,
STRIPE_ON_UNPLUG_LIST,
+ STRIPE_DISCARD,
};
/*