summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig22
-rw-r--r--drivers/md/bitmap.c4
-rw-r--r--drivers/md/dm-cache-metadata.c104
-rw-r--r--drivers/md/dm-cache-metadata.h5
-rw-r--r--drivers/md/dm-cache-policy-internal.h7
-rw-r--r--drivers/md/dm-cache-policy-mq.c681
-rw-r--r--drivers/md/dm-cache-policy.c4
-rw-r--r--drivers/md/dm-cache-policy.h21
-rw-r--r--drivers/md/dm-cache-target.c687
-rw-r--r--drivers/md/dm-crypt.c216
-rw-r--r--drivers/md/dm-ioctl.c36
-rw-r--r--drivers/md/dm-mpath.c34
-rw-r--r--drivers/md/dm-table.c23
-rw-r--r--drivers/md/dm.c47
-rw-r--r--drivers/md/dm.h13
-rw-r--r--drivers/md/md.c149
-rw-r--r--drivers/md/md.h2
-rw-r--r--drivers/md/persistent-data/dm-array.c5
-rw-r--r--drivers/md/persistent-data/dm-space-map-disk.c18
-rw-r--r--drivers/md/raid1.c162
-rw-r--r--drivers/md/raid1.h15
-rw-r--r--drivers/md/raid10.c6
-rw-r--r--drivers/md/raid5.c427
-rw-r--r--drivers/md/raid5.h18
24 files changed, 2035 insertions, 671 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 30b426ed744b..f2ccbc3b9fe4 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -297,6 +297,17 @@ config DM_MIRROR
Allow volume managers to mirror logical volumes, also
needed for live data migration tools such as 'pvmove'.
+config DM_LOG_USERSPACE
+ tristate "Mirror userspace logging"
+ depends on DM_MIRROR && NET
+ select CONNECTOR
+ ---help---
+ The userspace logging module provides a mechanism for
+ relaying the dm-dirty-log API to userspace. Log designs
+ which are more suited to userspace implementation (e.g.
+ shared storage logs) or experimental logs can be implemented
+ by leveraging this framework.
+
config DM_RAID
tristate "RAID 1/4/5/6/10 target"
depends on BLK_DEV_DM
@@ -323,17 +334,6 @@ config DM_RAID
RAID-5, RAID-6 distributes the syndromes across the drives
in one of the available parity distribution methods.
-config DM_LOG_USERSPACE
- tristate "Mirror userspace logging"
- depends on DM_MIRROR && NET
- select CONNECTOR
- ---help---
- The userspace logging module provides a mechanism for
- relaying the dm-dirty-log API to userspace. Log designs
- which are more suited to userspace implementation (e.g.
- shared storage logs) or experimental logs can be implemented
- by leveraging this framework.
-
config DM_ZERO
tristate "Zero target"
depends on BLK_DEV_DM
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index a7fd82133b12..12dc29ba7399 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1654,9 +1654,9 @@ int bitmap_create(struct mddev *mddev)
bitmap->mddev = mddev;
if (mddev->kobj.sd)
- bm = sysfs_get_dirent(mddev->kobj.sd, NULL, "bitmap");
+ bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap");
if (bm) {
- bitmap->sysfs_can_clear = sysfs_get_dirent(bm, NULL, "can_clear");
+ bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear");
sysfs_put(bm);
} else
bitmap->sysfs_can_clear = NULL;
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 1af7255bbffb..9ef0752e8a08 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -20,7 +20,13 @@
#define CACHE_SUPERBLOCK_MAGIC 06142003
#define CACHE_SUPERBLOCK_LOCATION 0
-#define CACHE_VERSION 1
+
+/*
+ * defines a range of metadata versions that this module can handle.
+ */
+#define MIN_CACHE_VERSION 1
+#define MAX_CACHE_VERSION 1
+
#define CACHE_METADATA_CACHE_SIZE 64
/*
@@ -134,6 +140,18 @@ static void sb_prepare_for_write(struct dm_block_validator *v,
SUPERBLOCK_CSUM_XOR));
}
+static int check_metadata_version(struct cache_disk_superblock *disk_super)
+{
+ uint32_t metadata_version = le32_to_cpu(disk_super->version);
+ if (metadata_version < MIN_CACHE_VERSION || metadata_version > MAX_CACHE_VERSION) {
+ DMERR("Cache metadata version %u found, but only versions between %u and %u supported.",
+ metadata_version, MIN_CACHE_VERSION, MAX_CACHE_VERSION);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int sb_check(struct dm_block_validator *v,
struct dm_block *b,
size_t sb_block_size)
@@ -164,7 +182,7 @@ static int sb_check(struct dm_block_validator *v,
return -EILSEQ;
}
- return 0;
+ return check_metadata_version(disk_super);
}
static struct dm_block_validator sb_validator = {
@@ -198,7 +216,7 @@ static int superblock_lock(struct dm_cache_metadata *cmd,
/*----------------------------------------------------------------*/
-static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
+static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *result)
{
int r;
unsigned i;
@@ -214,10 +232,10 @@ static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
return r;
data_le = dm_block_data(b);
- *result = 1;
+ *result = true;
for (i = 0; i < sb_block_size; i++) {
if (data_le[i] != zero) {
- *result = 0;
+ *result = false;
break;
}
}
@@ -270,7 +288,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
disk_super->flags = 0;
memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC);
- disk_super->version = cpu_to_le32(CACHE_VERSION);
+ disk_super->version = cpu_to_le32(MAX_CACHE_VERSION);
memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version));
disk_super->policy_hint_size = 0;
@@ -411,7 +429,8 @@ bad:
static int __open_or_format_metadata(struct dm_cache_metadata *cmd,
bool format_device)
{
- int r, unformatted;
+ int r;
+ bool unformatted = false;
r = __superblock_all_zeroes(cmd->bm, &unformatted);
if (r)
@@ -666,19 +685,85 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
kfree(cmd);
}
+/*
+ * Checks that the given cache block is either unmapped or clean.
+ */
+static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b,
+ bool *result)
+{
+ int r;
+ __le64 value;
+ dm_oblock_t ob;
+ unsigned flags;
+
+ r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(b), &value);
+ if (r) {
+ DMERR("block_unmapped_or_clean failed");
+ return r;
+ }
+
+ unpack_value(value, &ob, &flags);
+ *result = !((flags & M_VALID) && (flags & M_DIRTY));
+
+ return 0;
+}
+
+static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
+ dm_cblock_t begin, dm_cblock_t end,
+ bool *result)
+{
+ int r;
+ *result = true;
+
+ while (begin != end) {
+ r = block_unmapped_or_clean(cmd, begin, result);
+ if (r)
+ return r;
+
+ if (!*result) {
+ DMERR("cache block %llu is dirty",
+ (unsigned long long) from_cblock(begin));
+ return 0;
+ }
+
+ begin = to_cblock(from_cblock(begin) + 1);
+ }
+
+ return 0;
+}
+
int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
{
int r;
+ bool clean;
__le64 null_mapping = pack_value(0, 0);
down_write(&cmd->root_lock);
__dm_bless_for_disk(&null_mapping);
+
+ if (from_cblock(new_cache_size) < from_cblock(cmd->cache_blocks)) {
+ r = blocks_are_unmapped_or_clean(cmd, new_cache_size, cmd->cache_blocks, &clean);
+ if (r) {
+ __dm_unbless_for_disk(&null_mapping);
+ goto out;
+ }
+
+ if (!clean) {
+ DMERR("unable to shrink cache due to dirty blocks");
+ r = -EINVAL;
+ __dm_unbless_for_disk(&null_mapping);
+ goto out;
+ }
+ }
+
r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks),
from_cblock(new_cache_size),
&null_mapping, &cmd->root);
if (!r)
cmd->cache_blocks = new_cache_size;
cmd->changed = true;
+
+out:
up_write(&cmd->root_lock);
return r;
@@ -1182,3 +1267,8 @@ int dm_cache_save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock,
return r;
}
+
+int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result)
+{
+ return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result);
+}
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index f45cef21f3d0..cd906f14f98d 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -137,6 +137,11 @@ int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *
int dm_cache_save_hint(struct dm_cache_metadata *cmd,
dm_cblock_t cblock, uint32_t hint);
+/*
+ * Query method. Are all the blocks in the cache clean?
+ */
+int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result);
+
/*----------------------------------------------------------------*/
#endif /* DM_CACHE_METADATA_H */
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index 0928abdc49f0..2256a1f24f73 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -61,7 +61,12 @@ static inline int policy_writeback_work(struct dm_cache_policy *p,
static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
{
- return p->remove_mapping(p, oblock);
+ p->remove_mapping(p, oblock);
+}
+
+static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
+{
+ return p->remove_cblock(p, cblock);
}
static inline void policy_force_mapping(struct dm_cache_policy *p,
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 4296155090b2..416b7b752a6e 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -26,19 +26,6 @@ static unsigned next_power(unsigned n, unsigned min)
/*----------------------------------------------------------------*/
-static unsigned long *alloc_bitset(unsigned nr_entries)
-{
- size_t s = sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
- return vzalloc(s);
-}
-
-static void free_bitset(unsigned long *bits)
-{
- vfree(bits);
-}
-
-/*----------------------------------------------------------------*/
-
/*
* Large, sequential ios are probably better left on the origin device since
* spindles tend to have good bandwidth.
@@ -151,6 +138,21 @@ static void queue_init(struct queue *q)
}
/*
+ * Checks to see if the queue is empty.
+ * FIXME: reduce cpu usage.
+ */
+static bool queue_empty(struct queue *q)
+{
+ unsigned i;
+
+ for (i = 0; i < NR_QUEUE_LEVELS; i++)
+ if (!list_empty(q->qs + i))
+ return false;
+
+ return true;
+}
+
+/*
* Insert an entry to the back of the given level.
*/
static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
@@ -218,17 +220,116 @@ struct entry {
struct hlist_node hlist;
struct list_head list;
dm_oblock_t oblock;
- dm_cblock_t cblock; /* valid iff in_cache */
/*
* FIXME: pack these better
*/
- bool in_cache:1;
+ bool dirty:1;
unsigned hit_count;
unsigned generation;
unsigned tick;
};
+/*
+ * Rather than storing the cblock in an entry, we allocate all entries in
+ * an array, and infer the cblock from the entry position.
+ *
+ * Free entries are linked together into a list.
+ */
+struct entry_pool {
+ struct entry *entries, *entries_end;
+ struct list_head free;
+ unsigned nr_allocated;
+};
+
+static int epool_init(struct entry_pool *ep, unsigned nr_entries)
+{
+ unsigned i;
+
+ ep->entries = vzalloc(sizeof(struct entry) * nr_entries);
+ if (!ep->entries)
+ return -ENOMEM;
+
+ ep->entries_end = ep->entries + nr_entries;
+
+ INIT_LIST_HEAD(&ep->free);
+ for (i = 0; i < nr_entries; i++)
+ list_add(&ep->entries[i].list, &ep->free);
+
+ ep->nr_allocated = 0;
+
+ return 0;
+}
+
+static void epool_exit(struct entry_pool *ep)
+{
+ vfree(ep->entries);
+}
+
+static struct entry *alloc_entry(struct entry_pool *ep)
+{
+ struct entry *e;
+
+ if (list_empty(&ep->free))
+ return NULL;
+
+ e = list_entry(list_pop(&ep->free), struct entry, list);
+ INIT_LIST_HEAD(&e->list);
+ INIT_HLIST_NODE(&e->hlist);
+ ep->nr_allocated++;
+
+ return e;
+}
+
+/*
+ * This assumes the cblock hasn't already been allocated.
+ */
+static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock)
+{
+ struct entry *e = ep->entries + from_cblock(cblock);
+ list_del(&e->list);
+
+ INIT_LIST_HEAD(&e->list);
+ INIT_HLIST_NODE(&e->hlist);
+ ep->nr_allocated++;
+
+ return e;
+}
+
+static void free_entry(struct entry_pool *ep, struct entry *e)
+{
+ BUG_ON(!ep->nr_allocated);
+ ep->nr_allocated--;
+ INIT_HLIST_NODE(&e->hlist);
+ list_add(&e->list, &ep->free);
+}
+
+/*
+ * Returns NULL if the entry is free.
+ */
+static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock)
+{
+ struct entry *e = ep->entries + from_cblock(cblock);
+ return !hlist_unhashed(&e->hlist) ? e : NULL;
+}
+
+static bool epool_empty(struct entry_pool *ep)
+{
+ return list_empty(&ep->free);
+}
+
+static bool in_pool(struct entry_pool *ep, struct entry *e)
+{
+ return e >= ep->entries && e < ep->entries_end;
+}
+
+static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e)
+{
+ return to_cblock(e - ep->entries);
+}
+
+/*----------------------------------------------------------------*/
+
struct mq_policy {
struct dm_cache_policy policy;
@@ -238,13 +339,22 @@ struct mq_policy {
struct io_tracker tracker;
/*
- * We maintain two queues of entries. The cache proper contains
- * the currently active mappings. Whereas the pre_cache tracks
- * blocks that are being hit frequently and potential candidates
- * for promotion to the cache.
+ * Entries come from two pools, one of pre-cache entries, and one
+ * for the cache proper.
+ */
+ struct entry_pool pre_cache_pool;
+ struct entry_pool cache_pool;
+
+ /*
+ * We maintain three queues of entries. The cache proper,
+ * consisting of a clean and dirty queue, contains the currently
+ * active mappings. Whereas the pre_cache tracks blocks that
+ * are being hit frequently and potential candidates for promotion
+ * to the cache.
*/
struct queue pre_cache;
- struct queue cache;
+ struct queue cache_clean;
+ struct queue cache_dirty;
/*
* Keeps track of time, incremented by the core. We use this to
@@ -282,25 +392,6 @@ struct mq_policy {
unsigned promote_threshold;
/*
- * We need cache_size entries for the cache, and choose to have
- * cache_size entries for the pre_cache too. One motivation for
- * using the same size is to make the hit counts directly
- * comparable between pre_cache and cache.
- */
- unsigned nr_entries;
- unsigned nr_entries_allocated;
- struct list_head free;
-
- /*
- * Cache blocks may be unallocated. We store this info in a
- * bitset.
- */
- unsigned long *allocation_bitset;
- unsigned nr_cblocks_allocated;
- unsigned find_free_nr_words;
- unsigned find_free_last_word;
-
- /*
* The hash table allows us to quickly find an entry by origin
* block. Both pre_cache and cache entries are in here.
*/
@@ -310,49 +401,6 @@ struct mq_policy {
};
/*----------------------------------------------------------------*/
-/* Free/alloc mq cache entry structures. */
-static void takeout_queue(struct list_head *lh, struct queue *q)
-{
- unsigned level;
-
- for (level = 0; level < NR_QUEUE_LEVELS; level++)
- list_splice(q->qs + level, lh);
-}
-
-static void free_entries(struct mq_policy *mq)
-{
- struct entry *e, *tmp;
-
- takeout_queue(&mq->free, &mq->pre_cache);
- takeout_queue(&mq->free, &mq->cache);
-
- list_for_each_entry_safe(e, tmp, &mq->free, list)
- kmem_cache_free(mq_entry_cache, e);
-}
-
-static int alloc_entries(struct mq_policy *mq, unsigned elts)
-{
- unsigned u = mq->nr_entries;
-
- INIT_LIST_HEAD(&mq->free);
- mq->nr_entries_allocated = 0;
-
- while (u--) {
- struct entry *e = kmem_cache_zalloc(mq_entry_cache, GFP_KERNEL);
-
- if (!e) {
- free_entries(mq);
- return -ENOMEM;
- }
-
-
- list_add(&e->list, &mq->free);
- }
-
- return 0;
-}
-
-/*----------------------------------------------------------------*/
/*
* Simple hash table implementation. Should replace with the standard hash
@@ -388,96 +436,14 @@ static void hash_remove(struct entry *e)
/*----------------------------------------------------------------*/
-/*
- * Allocates a new entry structure. The memory is allocated in one lump,
- * so we just handing it out here. Returns NULL if all entries have
- * already been allocated. Cannot fail otherwise.
- */
-static struct entry *alloc_entry(struct mq_policy *mq)
-{
- struct entry *e;
-
- if (mq->nr_entries_allocated >= mq->nr_entries) {
- BUG_ON(!list_empty(&mq->free));
- return NULL;
- }
-
- e = list_entry(list_pop(&mq->free), struct entry, list);
- INIT_LIST_HEAD(&e->list);
- INIT_HLIST_NODE(&e->hlist);
-
- mq->nr_entries_allocated++;
- return e;
-}
-
-/*----------------------------------------------------------------*/
-
-/*
- * Mark cache blocks allocated or not in the bitset.
- */
-static void alloc_cblock(struct mq_policy *mq, dm_cblock_t cblock)
-{
- BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size));
- BUG_ON(test_bit(from_cblock(cblock), mq->allocation_bitset));
-
- set_bit(from_cblock(cblock), mq->allocation_bitset);
- mq->nr_cblocks_allocated++;
-}
-
-static void free_cblock(struct mq_policy *mq, dm_cblock_t cblock)
-{
- BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size));
- BUG_ON(!test_bit(from_cblock(cblock), mq->allocation_bitset));
-
- clear_bit(from_cblock(cblock), mq->allocation_bitset);
- mq->nr_cblocks_allocated--;
-}
-
static bool any_free_cblocks(struct mq_policy *mq)
{
- return mq->nr_cblocks_allocated < from_cblock(mq->cache_size);
+ return !epool_empty(&mq->cache_pool);
}
-/*
- * Fills result out with a cache block that isn't in use, or return
- * -ENOSPC. This does _not_ mark the cblock as allocated, the caller is
- * reponsible for that.
- */
-static int __find_free_cblock(struct mq_policy *mq, unsigned begin, unsigned end,
- dm_cblock_t *result, unsigned *last_word)
+static bool any_clean_cblocks(struct mq_policy *mq)
{
- int r = -ENOSPC;
- unsigned w;
-
- for (w = begin; w < end; w++) {
- /*
- * ffz is undefined if no zero exists
- */
- if (mq->allocation_bitset[w] != ~0UL) {
- *last_word = w;
- *result = to_cblock((w * BITS_PER_LONG) + ffz(mq->allocation_bitset[w]));
- if (from_cblock(*result) < from_cblock(mq->cache_size))
- r = 0;
-
- break;
- }
- }
-
- return r;
-}
-
-static int find_free_cblock(struct mq_policy *mq, dm_cblock_t *result)
-{
- int r;
-
- if (!any_free_cblocks(mq))
- return -ENOSPC;
-
- r = __find_free_cblock(mq, mq->find_free_last_word, mq->find_free_nr_words, result, &mq->find_free_last_word);
- if (r == -ENOSPC && mq->find_free_last_word)
- r = __find_free_cblock(mq, 0, mq->find_free_last_word, result, &mq->find_free_last_word);
-
- return r;
+ return !queue_empty(&mq->cache_clean);
}
/*----------------------------------------------------------------*/
@@ -496,33 +462,35 @@ static unsigned queue_level(struct entry *e)
return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u);
}
+static bool in_cache(struct mq_policy *mq, struct entry *e)
+{
+ return in_pool(&mq->cache_pool, e);
+}
+
/*
* Inserts the entry into the pre_cache or the cache. Ensures the cache
- * block is marked as allocated if necc. Inserts into the hash table. Sets the
- * tick which records when the entry was last moved about.
+ * block is marked as allocated if necc. Inserts into the hash table.
+ * Sets the tick which records when the entry was last moved about.
*/
static void push(struct mq_policy *mq, struct entry *e)
{
e->tick = mq->tick;
hash_insert(mq, e);
- if (e->in_cache) {
- alloc_cblock(mq, e->cblock);
- queue_push(&mq->cache, queue_level(e), &e->list);
- } else
+ if (in_cache(mq, e))
+ queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean,
+ queue_level(e), &e->list);
+ else
queue_push(&mq->pre_cache, queue_level(e), &e->list);
}
/*
* Removes an entry from pre_cache or cache. Removes from the hash table.
- * Frees off the cache block if necc.
*/
static void del(struct mq_policy *mq, struct entry *e)
{
queue_remove(&e->list);
hash_remove(e);
- if (e->in_cache)
- free_cblock(mq, e->cblock);
}
/*
@@ -531,14 +499,14 @@ static void del(struct mq_policy *mq, struct entry *e)
*/
static struct entry *pop(struct mq_policy *mq, struct queue *q)
{
- struct entry *e = container_of(queue_pop(q), struct entry, list);
+ struct entry *e;
+ struct list_head *h = queue_pop(q);
- if (e) {
- hash_remove(e);
+ if (!h)
+ return NULL;
- if (e->in_cache)
- free_cblock(mq, e->cblock);
- }
+ e = container_of(h, struct entry, list);
+ hash_remove(e);
return e;
}
@@ -556,7 +524,8 @@ static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
* of the entries.
*
* At the moment the threshold is taken by averaging the hit counts of some
- * of the entries in the cache (the first 20 entries of the first level).
+ * of the entries in the cache (the first 20 entries across all levels in
+ * ascending order, giving preference to the clean entries at each level).
*
* We can be much cleverer than this though. For example, each promotion
* could bump up the threshold helping to prevent churn. Much more to do
@@ -571,14 +540,21 @@ static void check_generation(struct mq_policy *mq)
struct list_head *head;
struct entry *e;
- if ((mq->hit_count >= mq->generation_period) &&
- (mq->nr_cblocks_allocated == from_cblock(mq->cache_size))) {
-
+ if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) {
mq->hit_count = 0;
mq->generation++;
for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) {
- head = mq->cache.qs + level;
+ head = mq->cache_clean.qs + level;
+ list_for_each_entry(e, head, list) {
+ nr++;
+ total += e->hit_count;
+
+ if (++count >= MAX_TO_AVERAGE)
+ break;
+ }
+
+ head = mq->cache_dirty.qs + level;
list_for_each_entry(e, head, list) {
nr++;
total += e->hit_count;
@@ -631,19 +607,30 @@ static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
* - set the hit count to a hard coded value other than 1, eg, is it better
* if it goes in at level 2?
*/
-static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
+static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
{
- dm_cblock_t result;
- struct entry *demoted = pop(mq, &mq->cache);
+ struct entry *demoted = pop(mq, &mq->cache_clean);
+
+ if (!demoted)
+ /*
+ * We could get a block from mq->cache_dirty, but that
+ * would add extra latency to the triggering bio as it
+ * waits for the writeback. Better to not promote this
+ * time and hope there's a clean block next time this block
+ * is hit.
+ */
+ return -ENOSPC;
- BUG_ON(!demoted);
- result = demoted->cblock;
*oblock = demoted->oblock;
- demoted->in_cache = false;
- demoted->hit_count = 1;
- push(mq, demoted);
+ free_entry(&mq->cache_pool, demoted);
+
+ /*
+ * We used to put the demoted block into the pre-cache, but I think
+ * it's simpler to just let it work it's way up from zero again.
+ * Stops blocks flickering in and out of the cache.
+ */
- return result;
+ return 0;
}
/*
@@ -662,17 +649,18 @@ static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
static unsigned adjusted_promote_threshold(struct mq_policy *mq,
bool discarded_oblock, int data_dir)
{
- if (discarded_oblock && any_free_cblocks(mq) && data_dir == WRITE)
+ if (data_dir == READ)
+ return mq->promote_threshold + READ_PROMOTE_THRESHOLD;
+
+ if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
/*
* We don't need to do any copying at all, so give this a
- * very low threshold. In practice this only triggers
- * during initial population after a format.
+ * very low threshold.
*/
return DISCARDED_PROMOTE_THRESHOLD;
+ }
- return data_dir == READ ?
- (mq->promote_threshold + READ_PROMOTE_THRESHOLD) :
- (mq->promote_threshold + WRITE_PROMOTE_THRESHOLD);
+ return mq->promote_threshold + WRITE_PROMOTE_THRESHOLD;
}
static bool should_promote(struct mq_policy *mq, struct entry *e,
@@ -688,34 +676,49 @@ static int cache_entry_found(struct mq_policy *mq,
{
requeue_and_update_tick(mq, e);
- if (e->in_cache) {
+ if (in_cache(mq, e)) {
result->op = POLICY_HIT;
- result->cblock = e->cblock;
+ result->cblock = infer_cblock(&mq->cache_pool, e);
}
return 0;
}
/*
- * Moves and entry from the pre_cache to the cache. The main work is
+ * Moves an entry from the pre_cache to the cache. The main work is
* finding which cache block to use.
*/
static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
struct policy_result *result)
{
- dm_cblock_t cblock;
+ int r;
+ struct entry *new_e;
- if (find_free_cblock(mq, &cblock) == -ENOSPC) {
+ /* Ensure there's a free cblock in the cache */
+ if (epool_empty(&mq->cache_pool)) {
result->op = POLICY_REPLACE;
- cblock = demote_cblock(mq, &result->old_oblock);
+ r = demote_cblock(mq, &result->old_oblock);
+ if (r) {
+ result->op = POLICY_MISS;
+ return 0;
+ }
} else
result->op = POLICY_NEW;
- result->cblock = e->cblock = cblock;
+ new_e = alloc_entry(&mq->cache_pool);
+ BUG_ON(!new_e);
+
+ new_e->oblock = e->oblock;
+ new_e->dirty = false;
+ new_e->hit_count = e->hit_count;
+ new_e->generation = e->generation;
+ new_e->tick = e->tick;
del(mq, e);
- e->in_cache = true;
- push(mq, e);
+ free_entry(&mq->pre_cache_pool, e);
+ push(mq, new_e);
+
+ result->cblock = infer_cblock(&mq->cache_pool, new_e);
return 0;
}
@@ -743,7 +746,7 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
static void insert_in_pre_cache(struct mq_policy *mq,
dm_oblock_t oblock)
{
- struct entry *e = alloc_entry(mq);
+ struct entry *e = alloc_entry(&mq->pre_cache_pool);
if (!e)
/*
@@ -757,7 +760,7 @@ static void insert_in_pre_cache(struct mq_policy *mq,
return;
}
- e->in_cache = false;
+ e->dirty = false;
e->oblock = oblock;
e->hit_count = 1;
e->generation = mq->generation;
@@ -767,30 +770,36 @@ static void insert_in_pre_cache(struct mq_policy *mq,
static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
struct policy_result *result)
{
+ int r;
struct entry *e;
- dm_cblock_t cblock;
- if (find_free_cblock(mq, &cblock) == -ENOSPC) {
- result->op = POLICY_MISS;
- insert_in_pre_cache(mq, oblock);
- return;
- }
+ if (epool_empty(&mq->cache_pool)) {
+ result->op = POLICY_REPLACE;
+ r = demote_cblock(mq, &result->old_oblock);
+ if (unlikely(r)) {
+ result->op = POLICY_MISS;
+ insert_in_pre_cache(mq, oblock);
+ return;
+ }
- e = alloc_entry(mq);
- if (unlikely(!e)) {
- result->op = POLICY_MISS;
- return;
+ /*
+ * This will always succeed, since we've just demoted.
+ */
+ e = alloc_entry(&mq->cache_pool);
+ BUG_ON(!e);
+
+ } else {
+ e = alloc_entry(&mq->cache_pool);
+ result->op = POLICY_NEW;
}
e->oblock = oblock;
- e->cblock = cblock;
- e->in_cache = true;
+ e->dirty = false;
e->hit_count = 1;
e->generation = mq->generation;
push(mq, e);
- result->op = POLICY_NEW;
- result->cblock = e->cblock;
+ result->cblock = infer_cblock(&mq->cache_pool, e);
}
static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
@@ -821,13 +830,16 @@ static int map(struct mq_policy *mq, dm_oblock_t oblock,
int r = 0;
struct entry *e = hash_lookup(mq, oblock);
- if (e && e->in_cache)
+ if (e && in_cache(mq, e))
r = cache_entry_found(mq, e, result);
+
else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
result->op = POLICY_MISS;
+
else if (e)
r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
data_dir, result);
+
else
r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
data_dir, result);
@@ -854,9 +866,9 @@ static void mq_destroy(struct dm_cache_policy *p)
{
struct mq_policy *mq = to_mq_policy(p);
- free_bitset(mq->allocation_bitset);
kfree(mq->table);
- free_entries(mq);
+ epool_exit(&mq->cache_pool);
+ epool_exit(&mq->pre_cache_pool);
kfree(mq);
}
@@ -904,8 +916,8 @@ static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t
return -EWOULDBLOCK;
e = hash_lookup(mq, oblock);
- if (e && e->in_cache) {
- *cblock = e->cblock;
+ if (e && in_cache(mq, e)) {
+ *cblock = infer_cblock(&mq->cache_pool, e);
r = 0;
} else
r = -ENOENT;
@@ -915,6 +927,36 @@ static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t
return r;
}
+static void __mq_set_clear_dirty(struct mq_policy *mq, dm_oblock_t oblock, bool set)
+{
+ struct entry *e;
+
+ e = hash_lookup(mq, oblock);
+ BUG_ON(!e || !in_cache(mq, e));
+
+ del(mq, e);
+ e->dirty = set;
+ push(mq, e);
+}
+
+static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+ struct mq_policy *mq = to_mq_policy(p);
+
+ mutex_lock(&mq->lock);
+ __mq_set_clear_dirty(mq, oblock, true);
+ mutex_unlock(&mq->lock);
+}
+
+static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+ struct mq_policy *mq = to_mq_policy(p);
+
+ mutex_lock(&mq->lock);
+ __mq_set_clear_dirty(mq, oblock, false);
+ mutex_unlock(&mq->lock);
+}
+
static int mq_load_mapping(struct dm_cache_policy *p,
dm_oblock_t oblock, dm_cblock_t cblock,
uint32_t hint, bool hint_valid)
@@ -922,13 +964,9 @@ static int mq_load_mapping(struct dm_cache_policy *p,
struct mq_policy *mq = to_mq_policy(p);
struct entry *e;
- e = alloc_entry(mq);
- if (!e)
- return -ENOMEM;
-
- e->cblock = cblock;
+ e = alloc_particular_entry(&mq->cache_pool, cblock);
e->oblock = oblock;
- e->in_cache = true;
+ e->dirty = false; /* this gets corrected in a minute */
e->hit_count = hint_valid ? hint : 1;
e->generation = mq->generation;
push(mq, e);
@@ -936,57 +974,126 @@ static int mq_load_mapping(struct dm_cache_policy *p,
return 0;
}
+static int mq_save_hints(struct mq_policy *mq, struct queue *q,
+ policy_walk_fn fn, void *context)
+{
+ int r;
+ unsigned level;
+ struct entry *e;
+
+ for (level = 0; level < NR_QUEUE_LEVELS; level++)
+ list_for_each_entry(e, q->qs + level, list) {
+ r = fn(context, infer_cblock(&mq->cache_pool, e),
+ e->oblock, e->hit_count);
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
void *context)
{
struct mq_policy *mq = to_mq_policy(p);
int r = 0;
- struct entry *e;
- unsigned level;
mutex_lock(&mq->lock);
- for (level = 0; level < NR_QUEUE_LEVELS; level++)
- list_for_each_entry(e, &mq->cache.qs[level], list) {
- r = fn(context, e->cblock, e->oblock, e->hit_count);
- if (r)
- goto out;
- }
+ r = mq_save_hints(mq, &mq->cache_clean, fn, context);
+ if (!r)
+ r = mq_save_hints(mq, &mq->cache_dirty, fn, context);
-out:
mutex_unlock(&mq->lock);
return r;
}
+static void __remove_mapping(struct mq_policy *mq, dm_oblock_t oblock)
+{
+ struct entry *e;
+
+ e = hash_lookup(mq, oblock);
+ BUG_ON(!e || !in_cache(mq, e));
+
+ del(mq, e);
+ free_entry(&mq->cache_pool, e);
+}
+
static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
{
struct mq_policy *mq = to_mq_policy(p);
- struct entry *e;
mutex_lock(&mq->lock);
+ __remove_mapping(mq, oblock);
+ mutex_unlock(&mq->lock);
+}
- e = hash_lookup(mq, oblock);
+static int __remove_cblock(struct mq_policy *mq, dm_cblock_t cblock)
+{
+ struct entry *e = epool_find(&mq->cache_pool, cblock);
- BUG_ON(!e || !e->in_cache);
+ if (!e)
+ return -ENODATA;
del(mq, e);
- e->in_cache = false;
- push(mq, e);
+ free_entry(&mq->cache_pool, e);
+ return 0;
+}
+
+static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
+{
+ int r;
+ struct mq_policy *mq = to_mq_policy(p);
+
+ mutex_lock(&mq->lock);
+ r = __remove_cblock(mq, cblock);
mutex_unlock(&mq->lock);
+
+ return r;
}
-static void force_mapping(struct mq_policy *mq,
- dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
+ dm_cblock_t *cblock)
{
- struct entry *e = hash_lookup(mq, current_oblock);
+ struct entry *e = pop(mq, &mq->cache_dirty);
- BUG_ON(!e || !e->in_cache);
+ if (!e)
+ return -ENODATA;
- del(mq, e);
- e->oblock = new_oblock;
+ *oblock = e->oblock;
+ *cblock = infer_cblock(&mq->cache_pool, e);
+ e->dirty = false;
push(mq, e);
+
+ return 0;
+}
+
+static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
+ dm_cblock_t *cblock)
+{
+ int r;
+ struct mq_policy *mq = to_mq_policy(p);
+
+ mutex_lock(&mq->lock);
+ r = __mq_writeback_work(mq, oblock, cblock);
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+static void __force_mapping(struct mq_policy *mq,
+ dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+ struct entry *e = hash_lookup(mq, current_oblock);
+
+ if (e && in_cache(mq, e)) {
+ del(mq, e);
+ e->oblock = new_oblock;
+ e->dirty = true;
+ push(mq, e);
+ }
}
static void mq_force_mapping(struct dm_cache_policy *p,
@@ -995,16 +1102,20 @@ static void mq_force_mapping(struct dm_cache_policy *p,
struct mq_policy *mq = to_mq_policy(p);
mutex_lock(&mq->lock);
- force_mapping(mq, current_oblock, new_oblock);
+ __force_mapping(mq, current_oblock, new_oblock);
mutex_unlock(&mq->lock);
}
static dm_cblock_t mq_residency(struct dm_cache_policy *p)
{
+ dm_cblock_t r;
struct mq_policy *mq = to_mq_policy(p);
- /* FIXME: lock mutex, not sure we can block here */
- return to_cblock(mq->nr_cblocks_allocated);
+ mutex_lock(&mq->lock);
+ r = to_cblock(mq->cache_pool.nr_allocated);
+ mutex_unlock(&mq->lock);
+
+ return r;
}
static void mq_tick(struct dm_cache_policy *p)
@@ -1057,10 +1168,13 @@ static void init_policy_functions(struct mq_policy *mq)
mq->policy.destroy = mq_destroy;
mq->policy.map = mq_map;
mq->policy.lookup = mq_lookup;
+ mq->policy.set_dirty = mq_set_dirty;
+ mq->policy.clear_dirty = mq_clear_dirty;
mq->policy.load_mapping = mq_load_mapping;
mq->policy.walk_mappings = mq_walk_mappings;
mq->policy.remove_mapping = mq_remove_mapping;
- mq->policy.writeback_work = NULL;
+ mq->policy.remove_cblock = mq_remove_cblock;
+ mq->policy.writeback_work = mq_writeback_work;
mq->policy.force_mapping = mq_force_mapping;
mq->policy.residency = mq_residency;
mq->policy.tick = mq_tick;
@@ -1072,7 +1186,6 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
sector_t origin_size,
sector_t cache_block_size)
{
- int r;
struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
if (!mq)
@@ -1080,8 +1193,18 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
init_policy_functions(mq);
iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT);
-
mq->cache_size = cache_size;
+
+ if (epool_init(&mq->pre_cache_pool, from_cblock(cache_size))) {
+ DMERR("couldn't initialize pool of pre-cache entries");
+ goto bad_pre_cache_init;
+ }
+
+ if (epool_init(&mq->cache_pool, from_cblock(cache_size))) {
+ DMERR("couldn't initialize pool of cache entries");
+ goto bad_cache_init;
+ }
+
mq->tick_protected = 0;
mq->tick = 0;
mq->hit_count = 0;
@@ -1089,20 +1212,12 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
mq->promote_threshold = 0;
mutex_init(&mq->lock);
spin_lock_init(&mq->tick_lock);
- mq->find_free_nr_words = dm_div_up(from_cblock(mq->cache_size), BITS_PER_LONG);
- mq->find_free_last_word = 0;
queue_init(&mq->pre_cache);
- queue_init(&mq->cache);
- mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);
+ queue_init(&mq->cache_clean);
+ queue_init(&mq->cache_dirty);
- mq->nr_entries = 2 * from_cblock(cache_size);
- r = alloc_entries(mq, mq->nr_entries);
- if (r)
- goto bad_cache_alloc;
-
- mq->nr_entries_allocated = 0;
- mq->nr_cblocks_allocated = 0;
+ mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);
mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
mq->hash_bits = ffs(mq->nr_buckets) - 1;
@@ -1110,17 +1225,13 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
if (!mq->table)
goto bad_alloc_table;
- mq->allocation_bitset = alloc_bitset(from_cblock(cache_size));
- if (!mq->allocation_bitset)
- goto bad_alloc_bitset;
-
return &mq->policy;
-bad_alloc_bitset:
- kfree(mq->table);
bad_alloc_table:
- free_entries(mq);
-bad_cache_alloc:
+ epool_exit(&mq->cache_pool);
+bad_cache_init:
+ epool_exit(&mq->pre_cache_pool);
+bad_pre_cache_init:
kfree(mq);
return NULL;
@@ -1130,7 +1241,7 @@ bad_cache_alloc:
static struct dm_cache_policy_type mq_policy_type = {
.name = "mq",
- .version = {1, 0, 0},
+ .version = {1, 1, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = mq_create
@@ -1138,7 +1249,7 @@ static struct dm_cache_policy_type mq_policy_type = {
static struct dm_cache_policy_type default_policy_type = {
.name = "default",
- .version = {1, 0, 0},
+ .version = {1, 1, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = mq_create
diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c
index 21c03c570c06..d80057968407 100644
--- a/drivers/md/dm-cache-policy.c
+++ b/drivers/md/dm-cache-policy.c
@@ -119,13 +119,13 @@ struct dm_cache_policy *dm_cache_policy_create(const char *name,
type = get_policy(name);
if (!type) {
DMWARN("unknown policy type");
- return NULL;
+ return ERR_PTR(-EINVAL);
}
p = type->create(cache_size, origin_size, cache_block_size);
if (!p) {
put_policy(type);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
p->private = type;
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index 33369ca9614f..052c00a84a5c 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -135,9 +135,6 @@ struct dm_cache_policy {
*/
int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock);
- /*
- * oblock must be a mapped block. Must not block.
- */
void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
@@ -159,8 +156,24 @@ struct dm_cache_policy {
void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock,
dm_oblock_t new_oblock);
- int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock);
+ /*
+ * This is called via the invalidate_cblocks message. It is
+ * possible the particular cblock has already been removed due to a
+ * write io in passthrough mode. In which case this should return
+ * -ENODATA.
+ */
+ int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock);
+ /*
+ * Provide a dirty block to be written back by the core target.
+ *
+ * Returns:
+ *
+ * 0 and @cblock,@oblock: block to write back provided
+ *
+ * -ENODATA: no dirty blocks available
+ */
+ int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock);
/*
* How full is the cache?
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 29569768ffbf..9efcf1059b99 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -61,6 +61,34 @@ static void free_bitset(unsigned long *bits)
/*----------------------------------------------------------------*/
+/*
+ * There are a couple of places where we let a bio run, but want to do some
+ * work before calling its endio function. We do this by temporarily
+ * changing the endio fn.
+ */
+struct dm_hook_info {
+ bio_end_io_t *bi_end_io;
+ void *bi_private;
+};
+
+static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
+ bio_end_io_t *bi_end_io, void *bi_private)
+{
+ h->bi_end_io = bio->bi_end_io;
+ h->bi_private = bio->bi_private;
+
+ bio->bi_end_io = bi_end_io;
+ bio->bi_private = bi_private;
+}
+
+static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
+{
+ bio->bi_end_io = h->bi_end_io;
+ bio->bi_private = h->bi_private;
+}
+
+/*----------------------------------------------------------------*/
+
#define PRISON_CELLS 1024
#define MIGRATION_POOL_SIZE 128
#define COMMIT_PERIOD HZ
@@ -76,14 +104,37 @@ static void free_bitset(unsigned long *bits)
/*
* FIXME: the cache is read/write for the time being.
*/
-enum cache_mode {
+enum cache_metadata_mode {
CM_WRITE, /* metadata may be changed */
CM_READ_ONLY, /* metadata may not be changed */
};
+enum cache_io_mode {
+ /*
+ * Data is written to cached blocks only. These blocks are marked
+ * dirty. If you lose the cache device you will lose data.
+ * Potential performance increase for both reads and writes.
+ */
+ CM_IO_WRITEBACK,
+
+ /*
+ * Data is written to both cache and origin. Blocks are never
+ * dirty. Potential performance benfit for reads only.
+ */
+ CM_IO_WRITETHROUGH,
+
+ /*
+ * A degraded mode useful for various cache coherency situations
+ * (eg, rolling back snapshots). Reads and writes always go to the
+ * origin. If a write goes to a cached oblock, then the cache
+ * block is invalidated.
+ */
+ CM_IO_PASSTHROUGH
+};
+
struct cache_features {
- enum cache_mode mode;
- bool write_through:1;
+ enum cache_metadata_mode mode;
+ enum cache_io_mode io_mode;
};
struct cache_stats {
@@ -99,6 +150,25 @@ struct cache_stats {
atomic_t discard_count;
};
+/*
+ * Defines a range of cblocks, begin to (end - 1) are in the range. end is
+ * the one-past-the-end value.
+ */
+struct cblock_range {
+ dm_cblock_t begin;
+ dm_cblock_t end;
+};
+
+struct invalidation_request {
+ struct list_head list;
+ struct cblock_range *cblocks;
+
+ atomic_t complete;
+ int err;
+
+ wait_queue_head_t result_wait;
+};
+
struct cache {
struct dm_target *ti;
struct dm_target_callbacks callbacks;
@@ -148,6 +218,10 @@ struct cache {
wait_queue_head_t migration_wait;
atomic_t nr_migrations;
+ wait_queue_head_t quiescing_wait;
+ atomic_t quiescing;
+ atomic_t quiescing_ack;
+
/*
* cache_size entries, dirty if set
*/
@@ -186,7 +260,7 @@ struct cache {
bool need_tick_bio:1;
bool sized:1;
- bool quiescing:1;
+ bool invalidate:1;
bool commit_requested:1;
bool loaded_mappings:1;
bool loaded_discards:1;
@@ -197,6 +271,12 @@ struct cache {
struct cache_features features;
struct cache_stats stats;
+
+ /*
+ * Invalidation fields.
+ */
+ spinlock_t invalidation_lock;
+ struct list_head invalidation_requests;
};
struct per_bio_data {
@@ -211,7 +291,7 @@ struct per_bio_data {
*/
struct cache *cache;
dm_cblock_t cblock;
- bio_end_io_t *saved_bi_end_io;
+ struct dm_hook_info hook_info;
struct dm_bio_details bio_details;
};
@@ -228,6 +308,8 @@ struct dm_cache_migration {
bool writeback:1;
bool demote:1;
bool promote:1;
+ bool requeue_holder:1;
+ bool invalidate:1;
struct dm_bio_prison_cell *old_ocell;
struct dm_bio_prison_cell *new_ocell;
@@ -533,9 +615,24 @@ static void save_stats(struct cache *cache)
#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
+static bool writethrough_mode(struct cache_features *f)
+{
+ return f->io_mode == CM_IO_WRITETHROUGH;
+}
+
+static bool writeback_mode(struct cache_features *f)
+{
+ return f->io_mode == CM_IO_WRITEBACK;
+}
+
+static bool passthrough_mode(struct cache_features *f)
+{
+ return f->io_mode == CM_IO_PASSTHROUGH;
+}
+
static size_t get_per_bio_data_size(struct cache *cache)
{
- return cache->features.write_through ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
+ return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
}
static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
@@ -605,6 +702,7 @@ static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
dm_oblock_t oblock, dm_cblock_t cblock)
{
+ check_if_tick_bio_needed(cache, bio);
remap_to_cache(cache, bio, cblock);
if (bio_data_dir(bio) == WRITE) {
set_dirty(cache, oblock, cblock);
@@ -662,7 +760,8 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
static void writethrough_endio(struct bio *bio, int err)
{
struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
- bio->bi_end_io = pb->saved_bi_end_io;
+
+ dm_unhook_bio(&pb->hook_info, bio);
if (err) {
bio_endio(bio, err);
@@ -693,9 +792,8 @@ static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
pb->cache = cache;
pb->cblock = cblock;
- pb->saved_bi_end_io = bio->bi_end_io;
+ dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
dm_bio_record(&pb->bio_details, bio);
- bio->bi_end_io = writethrough_endio;
remap_to_origin_clear_discard(pb->cache, bio, oblock);
}
@@ -748,8 +846,9 @@ static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
static void cleanup_migration(struct dm_cache_migration *mg)
{
- dec_nr_migrations(mg->cache);
+ struct cache *cache = mg->cache;
free_migration(mg);
+ dec_nr_migrations(cache);
}
static void migration_failure(struct dm_cache_migration *mg)
@@ -765,13 +864,13 @@ static void migration_failure(struct dm_cache_migration *mg)
DMWARN_LIMIT("demotion failed; couldn't copy block");
policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
- cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
+ cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
if (mg->promote)
- cell_defer(cache, mg->new_ocell, 1);
+ cell_defer(cache, mg->new_ocell, true);
} else {
DMWARN_LIMIT("promotion failed; couldn't copy block");
policy_remove_mapping(cache->policy, mg->new_oblock);
- cell_defer(cache, mg->new_ocell, 1);
+ cell_defer(cache, mg->new_ocell, true);
}
cleanup_migration(mg);
@@ -823,7 +922,7 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
return;
} else if (mg->demote) {
- cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
+ cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
if (mg->promote) {
mg->demote = false;
@@ -832,11 +931,19 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
list_add_tail(&mg->list, &cache->quiesced_migrations);
spin_unlock_irqrestore(&cache->lock, flags);
- } else
+ } else {
+ if (mg->invalidate)
+ policy_remove_mapping(cache->policy, mg->old_oblock);
cleanup_migration(mg);
+ }
} else {
- cell_defer(cache, mg->new_ocell, true);
+ if (mg->requeue_holder)
+ cell_defer(cache, mg->new_ocell, true);
+ else {
+ bio_endio(mg->new_ocell->holder, 0);
+ cell_defer(cache, mg->new_ocell, false);
+ }
clear_dirty(cache, mg->new_oblock, mg->cblock);
cleanup_migration(mg);
}
@@ -881,8 +988,46 @@ static void issue_copy_real(struct dm_cache_migration *mg)
r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
}
- if (r < 0)
+ if (r < 0) {
+ DMERR_LIMIT("issuing migration failed");
migration_failure(mg);
+ }
+}
+
+static void overwrite_endio(struct bio *bio, int err)
+{
+ struct dm_cache_migration *mg = bio->bi_private;
+ struct cache *cache = mg->cache;
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+ unsigned long flags;
+
+ if (err)
+ mg->err = true;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ list_add_tail(&mg->list, &cache->completed_migrations);
+ dm_unhook_bio(&pb->hook_info, bio);
+ mg->requeue_holder = false;
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ wake_worker(cache);
+}
+
+static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
+{
+ size_t pb_data_size = get_per_bio_data_size(mg->cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+ dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
+ remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
+ generic_make_request(bio);
+}
+
+static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
+{
+ return (bio_data_dir(bio) == WRITE) &&
+ (bio->bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
}
static void avoid_copy(struct dm_cache_migration *mg)
@@ -899,9 +1044,17 @@ static void issue_copy(struct dm_cache_migration *mg)
if (mg->writeback || mg->demote)
avoid = !is_dirty(cache, mg->cblock) ||
is_discarded_oblock(cache, mg->old_oblock);
- else
+ else {
+ struct bio *bio = mg->new_ocell->holder;
+
avoid = is_discarded_oblock(cache, mg->new_oblock);
+ if (!avoid && bio_writes_complete_block(cache, bio)) {
+ issue_overwrite(mg, bio);
+ return;
+ }
+ }
+
avoid ? avoid_copy(mg) : issue_copy_real(mg);
}
@@ -991,6 +1144,8 @@ static void promote(struct cache *cache, struct prealloc *structs,
mg->writeback = false;
mg->demote = false;
mg->promote = true;
+ mg->requeue_holder = true;
+ mg->invalidate = false;
mg->cache = cache;
mg->new_oblock = oblock;
mg->cblock = cblock;
@@ -1012,6 +1167,8 @@ static void writeback(struct cache *cache, struct prealloc *structs,
mg->writeback = true;
mg->demote = false;
mg->promote = false;
+ mg->requeue_holder = true;
+ mg->invalidate = false;
mg->cache = cache;
mg->old_oblock = oblock;
mg->cblock = cblock;
@@ -1035,6 +1192,8 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs,
mg->writeback = false;
mg->demote = true;
mg->promote = true;
+ mg->requeue_holder = true;
+ mg->invalidate = false;
mg->cache = cache;
mg->old_oblock = old_oblock;
mg->new_oblock = new_oblock;
@@ -1047,6 +1206,33 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs,
quiesce_migration(mg);
}
+/*
+ * Invalidate a cache entry. No writeback occurs; any changes in the cache
+ * block are thrown away.
+ */
+static void invalidate(struct cache *cache, struct prealloc *structs,
+ dm_oblock_t oblock, dm_cblock_t cblock,
+ struct dm_bio_prison_cell *cell)
+{
+ struct dm_cache_migration *mg = prealloc_get_migration(structs);
+
+ mg->err = false;
+ mg->writeback = false;
+ mg->demote = true;
+ mg->promote = false;
+ mg->requeue_holder = true;
+ mg->invalidate = true;
+ mg->cache = cache;
+ mg->old_oblock = oblock;
+ mg->cblock = cblock;
+ mg->old_ocell = cell;
+ mg->new_ocell = NULL;
+ mg->start_jiffies = jiffies;
+
+ inc_nr_migrations(cache);
+ quiesce_migration(mg);
+}
+
/*----------------------------------------------------------------
* bio processing
*--------------------------------------------------------------*/
@@ -1109,13 +1295,6 @@ static bool spare_migration_bandwidth(struct cache *cache)
return current_volume < cache->migration_threshold;
}
-static bool is_writethrough_io(struct cache *cache, struct bio *bio,
- dm_cblock_t cblock)
-{
- return bio_data_dir(bio) == WRITE &&
- cache->features.write_through && !is_dirty(cache, cblock);
-}
-
static void inc_hit_counter(struct cache *cache, struct bio *bio)
{
atomic_inc(bio_data_dir(bio) == READ ?
@@ -1128,6 +1307,15 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
&cache->stats.read_miss : &cache->stats.write_miss);
}
+static void issue_cache_bio(struct cache *cache, struct bio *bio,
+ struct per_bio_data *pb,
+ dm_oblock_t oblock, dm_cblock_t cblock)
+{
+ pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+ remap_to_cache_dirty(cache, bio, oblock, cblock);
+ issue(cache, bio);
+}
+
static void process_bio(struct cache *cache, struct prealloc *structs,
struct bio *bio)
{
@@ -1139,7 +1327,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
size_t pb_data_size = get_per_bio_data_size(cache);
struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
bool discarded_block = is_discarded_oblock(cache, block);
- bool can_migrate = discarded_block || spare_migration_bandwidth(cache);
+ bool passthrough = passthrough_mode(&cache->features);
+ bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
/*
* Check to see if that block is currently migrating.
@@ -1160,15 +1349,39 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
switch (lookup_result.op) {
case POLICY_HIT:
- inc_hit_counter(cache, bio);
- pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+ if (passthrough) {
+ inc_miss_counter(cache, bio);
- if (is_writethrough_io(cache, bio, lookup_result.cblock))
- remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
- else
- remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+ /*
+ * Passthrough always maps to the origin,
+ * invalidating any cache blocks that are written
+ * to.
+ */
+
+ if (bio_data_dir(bio) == WRITE) {
+ atomic_inc(&cache->stats.demotion);
+ invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
+ release_cell = false;
+
+ } else {
+ /* FIXME: factor out issue_origin() */
+ pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+ remap_to_origin_clear_discard(cache, bio, block);
+ issue(cache, bio);
+ }
+ } else {
+ inc_hit_counter(cache, bio);
+
+ if (bio_data_dir(bio) == WRITE &&
+ writethrough_mode(&cache->features) &&
+ !is_dirty(cache, lookup_result.cblock)) {
+ pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+ remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
+ issue(cache, bio);
+ } else
+ issue_cache_bio(cache, bio, pb, block, lookup_result.cblock);
+ }
- issue(cache, bio);
break;
case POLICY_MISS:
@@ -1227,15 +1440,17 @@ static int need_commit_due_to_time(struct cache *cache)
static int commit_if_needed(struct cache *cache)
{
- if (dm_cache_changed_this_transaction(cache->cmd) &&
- (cache->commit_requested || need_commit_due_to_time(cache))) {
+ int r = 0;
+
+ if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
+ dm_cache_changed_this_transaction(cache->cmd)) {
atomic_inc(&cache->stats.commit_count);
- cache->last_commit_jiffies = jiffies;
cache->commit_requested = false;
- return dm_cache_commit(cache->cmd, false);
+ r = dm_cache_commit(cache->cmd, false);
+ cache->last_commit_jiffies = jiffies;
}
- return 0;
+ return r;
}
static void process_deferred_bios(struct cache *cache)
@@ -1344,36 +1559,88 @@ static void writeback_some_dirty_blocks(struct cache *cache)
}
/*----------------------------------------------------------------
- * Main worker loop
+ * Invalidations.
+ * Dropping something from the cache *without* writing back.
*--------------------------------------------------------------*/
-static void start_quiescing(struct cache *cache)
+
+static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
{
- unsigned long flags;
+ int r = 0;
+ uint64_t begin = from_cblock(req->cblocks->begin);
+ uint64_t end = from_cblock(req->cblocks->end);
- spin_lock_irqsave(&cache->lock, flags);
- cache->quiescing = 1;
- spin_unlock_irqrestore(&cache->lock, flags);
+ while (begin != end) {
+ r = policy_remove_cblock(cache->policy, to_cblock(begin));
+ if (!r) {
+ r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
+ if (r)
+ break;
+
+ } else if (r == -ENODATA) {
+ /* harmless, already unmapped */
+ r = 0;
+
+ } else {
+ DMERR("policy_remove_cblock failed");
+ break;
+ }
+
+ begin++;
+ }
+
+ cache->commit_requested = true;
+
+ req->err = r;
+ atomic_set(&req->complete, 1);
+
+ wake_up(&req->result_wait);
}
-static void stop_quiescing(struct cache *cache)
+static void process_invalidation_requests(struct cache *cache)
{
- unsigned long flags;
+ struct list_head list;
+ struct invalidation_request *req, *tmp;
- spin_lock_irqsave(&cache->lock, flags);
- cache->quiescing = 0;
- spin_unlock_irqrestore(&cache->lock, flags);
+ INIT_LIST_HEAD(&list);
+ spin_lock(&cache->invalidation_lock);
+ list_splice_init(&cache->invalidation_requests, &list);
+ spin_unlock(&cache->invalidation_lock);
+
+ list_for_each_entry_safe (req, tmp, &list, list)
+ process_invalidation_request(cache, req);
}
+/*----------------------------------------------------------------
+ * Main worker loop
+ *--------------------------------------------------------------*/
static bool is_quiescing(struct cache *cache)
{
- int r;
- unsigned long flags;
+ return atomic_read(&cache->quiescing);
+}
- spin_lock_irqsave(&cache->lock, flags);
- r = cache->quiescing;
- spin_unlock_irqrestore(&cache->lock, flags);
+static void ack_quiescing(struct cache *cache)
+{
+ if (is_quiescing(cache)) {
+ atomic_inc(&cache->quiescing_ack);
+ wake_up(&cache->quiescing_wait);
+ }
+}
- return r;
+static void wait_for_quiescing_ack(struct cache *cache)
+{
+ wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
+}
+
+static void start_quiescing(struct cache *cache)
+{
+ atomic_inc(&cache->quiescing);
+ wait_for_quiescing_ack(cache);
+}
+
+static void stop_quiescing(struct cache *cache)
+{
+ atomic_set(&cache->quiescing, 0);
+ atomic_set(&cache->quiescing_ack, 0);
}
static void wait_for_migrations(struct cache *cache)
@@ -1412,7 +1679,8 @@ static int more_work(struct cache *cache)
!bio_list_empty(&cache->deferred_writethrough_bios) ||
!list_empty(&cache->quiesced_migrations) ||
!list_empty(&cache->completed_migrations) ||
- !list_empty(&cache->need_commit_migrations);
+ !list_empty(&cache->need_commit_migrations) ||
+ cache->invalidate;
}
static void do_worker(struct work_struct *ws)
@@ -1420,16 +1688,16 @@ static void do_worker(struct work_struct *ws)
struct cache *cache = container_of(ws, struct cache, worker);
do {
- if (!is_quiescing(cache))
+ if (!is_quiescing(cache)) {
+ writeback_some_dirty_blocks(cache);
+ process_deferred_writethrough_bios(cache);
process_deferred_bios(cache);
+ process_invalidation_requests(cache);
+ }
process_migrations(cache, &cache->quiesced_migrations, issue_copy);
process_migrations(cache, &cache->completed_migrations, complete_migration);
- writeback_some_dirty_blocks(cache);
-
- process_deferred_writethrough_bios(cache);
-
if (commit_if_needed(cache)) {
process_deferred_flush_bios(cache, false);
@@ -1442,6 +1710,9 @@ static void do_worker(struct work_struct *ws)
process_migrations(cache, &cache->need_commit_migrations,
migration_success_post_commit);
}
+
+ ack_quiescing(cache);
+
} while (more_work(cache));
}
@@ -1715,7 +1986,7 @@ static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
static void init_features(struct cache_features *cf)
{
cf->mode = CM_WRITE;
- cf->write_through = false;
+ cf->io_mode = CM_IO_WRITEBACK;
}
static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
@@ -1740,10 +2011,13 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
arg = dm_shift_arg(as);
if (!strcasecmp(arg, "writeback"))
- cf->write_through = false;
+ cf->io_mode = CM_IO_WRITEBACK;
else if (!strcasecmp(arg, "writethrough"))
- cf->write_through = true;
+ cf->io_mode = CM_IO_WRITETHROUGH;
+
+ else if (!strcasecmp(arg, "passthrough"))
+ cf->io_mode = CM_IO_PASSTHROUGH;
else {
*error = "Unrecognised cache feature requested";
@@ -1872,14 +2146,15 @@ static int set_config_values(struct cache *cache, int argc, const char **argv)
static int create_cache_policy(struct cache *cache, struct cache_args *ca,
char **error)
{
- cache->policy = dm_cache_policy_create(ca->policy_name,
- cache->cache_size,
- cache->origin_sectors,
- cache->sectors_per_block);
- if (!cache->policy) {
+ struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
+ cache->cache_size,
+ cache->origin_sectors,
+ cache->sectors_per_block);
+ if (IS_ERR(p)) {
*error = "Error creating cache's policy";
- return -ENOMEM;
+ return PTR_ERR(p);
}
+ cache->policy = p;
return 0;
}
@@ -1995,6 +2270,22 @@ static int cache_create(struct cache_args *ca, struct cache **result)
}
cache->cmd = cmd;
+ if (passthrough_mode(&cache->features)) {
+ bool all_clean;
+
+ r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
+ if (r) {
+ *error = "dm_cache_metadata_all_clean() failed";
+ goto bad;
+ }
+
+ if (!all_clean) {
+ *error = "Cannot enter passthrough mode unless all blocks are clean";
+ r = -EINVAL;
+ goto bad;
+ }
+ }
+
spin_lock_init(&cache->lock);
bio_list_init(&cache->deferred_bios);
bio_list_init(&cache->deferred_flush_bios);
@@ -2005,6 +2296,10 @@ static int cache_create(struct cache_args *ca, struct cache **result)
atomic_set(&cache->nr_migrations, 0);
init_waitqueue_head(&cache->migration_wait);
+ init_waitqueue_head(&cache->quiescing_wait);
+ atomic_set(&cache->quiescing, 0);
+ atomic_set(&cache->quiescing_ack, 0);
+
r = -ENOMEM;
cache->nr_dirty = 0;
cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
@@ -2064,7 +2359,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
cache->need_tick_bio = true;
cache->sized = false;
- cache->quiescing = false;
+ cache->invalidate = false;
cache->commit_requested = false;
cache->loaded_mappings = false;
cache->loaded_discards = false;
@@ -2078,6 +2373,9 @@ static int cache_create(struct cache_args *ca, struct cache **result)
atomic_set(&cache->stats.commit_count, 0);
atomic_set(&cache->stats.discard_count, 0);
+ spin_lock_init(&cache->invalidation_lock);
+ INIT_LIST_HEAD(&cache->invalidation_requests);
+
*result = cache;
return 0;
@@ -2207,17 +2505,37 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_SUBMITTED;
}
+ r = DM_MAPIO_REMAPPED;
switch (lookup_result.op) {
case POLICY_HIT:
- inc_hit_counter(cache, bio);
- pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+ if (passthrough_mode(&cache->features)) {
+ if (bio_data_dir(bio) == WRITE) {
+ /*
+ * We need to invalidate this block, so
+ * defer for the worker thread.
+ */
+ cell_defer(cache, cell, true);
+ r = DM_MAPIO_SUBMITTED;
+
+ } else {
+ pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+ inc_miss_counter(cache, bio);
+ remap_to_origin_clear_discard(cache, bio, block);
+
+ cell_defer(cache, cell, false);
+ }
- if (is_writethrough_io(cache, bio, lookup_result.cblock))
- remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
- else
- remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+ } else {
+ inc_hit_counter(cache, bio);
- cell_defer(cache, cell, false);
+ if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
+ !is_dirty(cache, lookup_result.cblock))
+ remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
+ else
+ remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+
+ cell_defer(cache, cell, false);
+ }
break;
case POLICY_MISS:
@@ -2242,10 +2560,10 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
(unsigned) lookup_result.op);
bio_io_error(bio);
- return DM_MAPIO_SUBMITTED;
+ r = DM_MAPIO_SUBMITTED;
}
- return DM_MAPIO_REMAPPED;
+ return r;
}
static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
@@ -2406,26 +2724,71 @@ static int load_discard(void *context, sector_t discard_block_size,
return 0;
}
+static dm_cblock_t get_cache_dev_size(struct cache *cache)
+{
+ sector_t size = get_dev_size(cache->cache_dev);
+ (void) sector_div(size, cache->sectors_per_block);
+ return to_cblock(size);
+}
+
+static bool can_resize(struct cache *cache, dm_cblock_t new_size)
+{
+ if (from_cblock(new_size) > from_cblock(cache->cache_size))
+ return true;
+
+ /*
+ * We can't drop a dirty block when shrinking the cache.
+ */
+ while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
+ new_size = to_cblock(from_cblock(new_size) + 1);
+ if (is_dirty(cache, new_size)) {
+ DMERR("unable to shrink cache; cache block %llu is dirty",
+ (unsigned long long) from_cblock(new_size));
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
+{
+ int r;
+
+ r = dm_cache_resize(cache->cmd, cache->cache_size);
+ if (r) {
+ DMERR("could not resize cache metadata");
+ return r;
+ }
+
+ cache->cache_size = new_size;
+
+ return 0;
+}
+
static int cache_preresume(struct dm_target *ti)
{
int r = 0;
struct cache *cache = ti->private;
- sector_t actual_cache_size = get_dev_size(cache->cache_dev);
- (void) sector_div(actual_cache_size, cache->sectors_per_block);
+ dm_cblock_t csize = get_cache_dev_size(cache);
/*
* Check to see if the cache has resized.
*/
- if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) {
- cache->cache_size = to_cblock(actual_cache_size);
-
- r = dm_cache_resize(cache->cmd, cache->cache_size);
- if (r) {
- DMERR("could not resize cache metadata");
+ if (!cache->sized) {
+ r = resize_cache_dev(cache, csize);
+ if (r)
return r;
- }
cache->sized = true;
+
+ } else if (csize != cache->cache_size) {
+ if (!can_resize(cache, csize))
+ return -EINVAL;
+
+ r = resize_cache_dev(cache, csize);
+ if (r)
+ return r;
}
if (!cache->loaded_mappings) {
@@ -2518,10 +2881,19 @@ static void cache_status(struct dm_target *ti, status_type_t type,
(unsigned long long) from_cblock(residency),
cache->nr_dirty);
- if (cache->features.write_through)
+ if (writethrough_mode(&cache->features))
DMEMIT("1 writethrough ");
- else
- DMEMIT("0 ");
+
+ else if (passthrough_mode(&cache->features))
+ DMEMIT("1 passthrough ");
+
+ else if (writeback_mode(&cache->features))
+ DMEMIT("1 writeback ");
+
+ else {
+ DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode);
+ goto err;
+ }
DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
if (sz < maxlen) {
@@ -2553,7 +2925,128 @@ err:
}
/*
- * Supports <key> <value>.
+ * A cache block range can take two forms:
+ *
+ * i) A single cblock, eg. '3456'
+ * ii) A begin and end cblock with dots between, eg. 123-234
+ */
+static int parse_cblock_range(struct cache *cache, const char *str,
+ struct cblock_range *result)
+{
+ char dummy;
+ uint64_t b, e;
+ int r;
+
+ /*
+ * Try and parse form (ii) first.
+ */
+ r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
+ if (r < 0)
+ return r;
+
+ if (r == 2) {
+ result->begin = to_cblock(b);
+ result->end = to_cblock(e);
+ return 0;
+ }
+
+ /*
+ * That didn't work, try form (i).
+ */
+ r = sscanf(str, "%llu%c", &b, &dummy);
+ if (r < 0)
+ return r;
+
+ if (r == 1) {
+ result->begin = to_cblock(b);
+ result->end = to_cblock(from_cblock(result->begin) + 1u);
+ return 0;
+ }
+
+ DMERR("invalid cblock range '%s'", str);
+ return -EINVAL;
+}
+
+static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
+{
+ uint64_t b = from_cblock(range->begin);
+ uint64_t e = from_cblock(range->end);
+ uint64_t n = from_cblock(cache->cache_size);
+
+ if (b >= n) {
+ DMERR("begin cblock out of range: %llu >= %llu", b, n);
+ return -EINVAL;
+ }
+
+ if (e > n) {
+ DMERR("end cblock out of range: %llu > %llu", e, n);
+ return -EINVAL;
+ }
+
+ if (b >= e) {
+ DMERR("invalid cblock range: %llu >= %llu", b, e);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int request_invalidation(struct cache *cache, struct cblock_range *range)
+{
+ struct invalidation_request req;
+
+ INIT_LIST_HEAD(&req.list);
+ req.cblocks = range;
+ atomic_set(&req.complete, 0);
+ req.err = 0;
+ init_waitqueue_head(&req.result_wait);
+
+ spin_lock(&cache->invalidation_lock);
+ list_add(&req.list, &cache->invalidation_requests);
+ spin_unlock(&cache->invalidation_lock);
+ wake_worker(cache);
+
+ wait_event(req.result_wait, atomic_read(&req.complete));
+ return req.err;
+}
+
+static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
+ const char **cblock_ranges)
+{
+ int r = 0;
+ unsigned i;
+ struct cblock_range range;
+
+ if (!passthrough_mode(&cache->features)) {
+ DMERR("cache has to be in passthrough mode for invalidation");
+ return -EPERM;
+ }
+
+ for (i = 0; i < count; i++) {
+ r = parse_cblock_range(cache, cblock_ranges[i], &range);
+ if (r)
+ break;
+
+ r = validate_cblock_range(cache, &range);
+ if (r)
+ break;
+
+ /*
+ * Pass begin and end origin blocks to the worker and wake it.
+ */
+ r = request_invalidation(cache, &range);
+ if (r)
+ break;
+ }
+
+ return r;
+}
+
+/*
+ * Supports
+ * "<key> <value>"
+ * and
+ * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*
*
* The key migration_threshold is supported by the cache target core.
*/
@@ -2561,6 +3054,12 @@ static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
{
struct cache *cache = ti->private;
+ if (!argc)
+ return -EINVAL;
+
+ if (!strcasecmp(argv[0], "invalidate_cblocks"))
+ return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
+
if (argc != 2)
return -EINVAL;
@@ -2630,7 +3129,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
- .version = {1, 1, 1},
+ .version = {1, 2, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 0fce0bc1a957..81b0fa660452 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -2,6 +2,7 @@
* Copyright (C) 2003 Christophe Saout <christophe@saout.de>
* Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
* Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2013 Milan Broz <gmazyland@gmail.com>
*
* This file is released under the GPL.
*/
@@ -98,6 +99,13 @@ struct iv_lmk_private {
u8 *seed;
};
+#define TCW_WHITENING_SIZE 16
+struct iv_tcw_private {
+ struct crypto_shash *crc32_tfm;
+ u8 *iv_seed;
+ u8 *whitening;
+};
+
/*
* Crypt: maps a linear range of a block device
* and encrypts / decrypts at the same time.
@@ -139,6 +147,7 @@ struct crypt_config {
struct iv_essiv_private essiv;
struct iv_benbi_private benbi;
struct iv_lmk_private lmk;
+ struct iv_tcw_private tcw;
} iv_gen_private;
sector_t iv_offset;
unsigned int iv_size;
@@ -171,7 +180,8 @@ struct crypt_config {
unsigned long flags;
unsigned int key_size;
- unsigned int key_parts;
+ unsigned int key_parts; /* independent parts in key buffer */
+ unsigned int key_extra_size; /* additional keys length */
u8 key[0];
};
@@ -230,6 +240,16 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
* version 3: the same as version 2 with additional IV seed
* (it uses 65 keys, last key is used as IV seed)
*
+ * tcw: Compatible implementation of the block chaining mode used
+ * by the TrueCrypt device encryption system (prior to version 4.1).
+ * For more info see: http://www.truecrypt.org
+ * It operates on full 512 byte sectors and uses CBC
+ * with an IV derived from initial key and the sector number.
+ * In addition, whitening value is applied on every sector, whitening
+ * is calculated from initial key, sector number and mixed using CRC32.
+ * Note that this encryption scheme is vulnerable to watermarking attacks
+ * and should be used for old compatible containers access only.
+ *
* plumb: unimplemented, see:
* http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
*/
@@ -530,7 +550,7 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
char ctx[crypto_shash_descsize(lmk->hash_tfm)];
} sdesc;
struct md5_state md5state;
- u32 buf[4];
+ __le32 buf[4];
int i, r;
sdesc.desc.tfm = lmk->hash_tfm;
@@ -608,6 +628,153 @@ static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
return r;
}
+static void crypt_iv_tcw_dtr(struct crypt_config *cc)
+{
+ struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+
+ kzfree(tcw->iv_seed);
+ tcw->iv_seed = NULL;
+ kzfree(tcw->whitening);
+ tcw->whitening = NULL;
+
+ if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm))
+ crypto_free_shash(tcw->crc32_tfm);
+ tcw->crc32_tfm = NULL;
+}
+
+static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti,
+ const char *opts)
+{
+ struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+
+ if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) {
+ ti->error = "Wrong key size for TCW";
+ return -EINVAL;
+ }
+
+ tcw->crc32_tfm = crypto_alloc_shash("crc32", 0, 0);
+ if (IS_ERR(tcw->crc32_tfm)) {
+ ti->error = "Error initializing CRC32 in TCW";
+ return PTR_ERR(tcw->crc32_tfm);
+ }
+
+ tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL);
+ tcw->whitening = kzalloc(TCW_WHITENING_SIZE, GFP_KERNEL);
+ if (!tcw->iv_seed || !tcw->whitening) {
+ crypt_iv_tcw_dtr(cc);
+ ti->error = "Error allocating seed storage in TCW";
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int crypt_iv_tcw_init(struct crypt_config *cc)
+{
+ struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+ int key_offset = cc->key_size - cc->iv_size - TCW_WHITENING_SIZE;
+
+ memcpy(tcw->iv_seed, &cc->key[key_offset], cc->iv_size);
+ memcpy(tcw->whitening, &cc->key[key_offset + cc->iv_size],
+ TCW_WHITENING_SIZE);
+
+ return 0;
+}
+
+static int crypt_iv_tcw_wipe(struct crypt_config *cc)
+{
+ struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+
+ memset(tcw->iv_seed, 0, cc->iv_size);
+ memset(tcw->whitening, 0, TCW_WHITENING_SIZE);
+
+ return 0;
+}
+
+static int crypt_iv_tcw_whitening(struct crypt_config *cc,
+ struct dm_crypt_request *dmreq,
+ u8 *data)
+{
+ struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+ u64 sector = cpu_to_le64((u64)dmreq->iv_sector);
+ u8 buf[TCW_WHITENING_SIZE];
+ struct {
+ struct shash_desc desc;
+ char ctx[crypto_shash_descsize(tcw->crc32_tfm)];
+ } sdesc;
+ int i, r;
+
+ /* xor whitening with sector number */
+ memcpy(buf, tcw->whitening, TCW_WHITENING_SIZE);
+ crypto_xor(buf, (u8 *)&sector, 8);
+ crypto_xor(&buf[8], (u8 *)&sector, 8);
+
+ /* calculate crc32 for every 32bit part and xor it */
+ sdesc.desc.tfm = tcw->crc32_tfm;
+ sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+ for (i = 0; i < 4; i++) {
+ r = crypto_shash_init(&sdesc.desc);
+ if (r)
+ goto out;
+ r = crypto_shash_update(&sdesc.desc, &buf[i * 4], 4);
+ if (r)
+ goto out;
+ r = crypto_shash_final(&sdesc.desc, &buf[i * 4]);
+ if (r)
+ goto out;
+ }
+ crypto_xor(&buf[0], &buf[12], 4);
+ crypto_xor(&buf[4], &buf[8], 4);
+
+ /* apply whitening (8 bytes) to whole sector */
+ for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++)
+ crypto_xor(data + i * 8, buf, 8);
+out:
+ memset(buf, 0, sizeof(buf));
+ return r;
+}
+
+static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+ u64 sector = cpu_to_le64((u64)dmreq->iv_sector);
+ u8 *src;
+ int r = 0;
+
+ /* Remove whitening from ciphertext */
+ if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) {
+ src = kmap_atomic(sg_page(&dmreq->sg_in));
+ r = crypt_iv_tcw_whitening(cc, dmreq, src + dmreq->sg_in.offset);
+ kunmap_atomic(src);
+ }
+
+ /* Calculate IV */
+ memcpy(iv, tcw->iv_seed, cc->iv_size);
+ crypto_xor(iv, (u8 *)&sector, 8);
+ if (cc->iv_size > 8)
+ crypto_xor(&iv[8], (u8 *)&sector, cc->iv_size - 8);
+
+ return r;
+}
+
+static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ u8 *dst;
+ int r;
+
+ if (bio_data_dir(dmreq->ctx->bio_in) != WRITE)
+ return 0;
+
+ /* Apply whitening on ciphertext */
+ dst = kmap_atomic(sg_page(&dmreq->sg_out));
+ r = crypt_iv_tcw_whitening(cc, dmreq, dst + dmreq->sg_out.offset);
+ kunmap_atomic(dst);
+
+ return r;
+}
+
static struct crypt_iv_operations crypt_iv_plain_ops = {
.generator = crypt_iv_plain_gen
};
@@ -643,6 +810,15 @@ static struct crypt_iv_operations crypt_iv_lmk_ops = {
.post = crypt_iv_lmk_post
};
+static struct crypt_iv_operations crypt_iv_tcw_ops = {
+ .ctr = crypt_iv_tcw_ctr,
+ .dtr = crypt_iv_tcw_dtr,
+ .init = crypt_iv_tcw_init,
+ .wipe = crypt_iv_tcw_wipe,
+ .generator = crypt_iv_tcw_gen,
+ .post = crypt_iv_tcw_post
+};
+
static void crypt_convert_init(struct crypt_config *cc,
struct convert_context *ctx,
struct bio *bio_out, struct bio *bio_in,
@@ -774,7 +950,7 @@ static int crypt_convert(struct crypt_config *cc,
/* async */
case -EBUSY:
wait_for_completion(&ctx->restart);
- INIT_COMPLETION(ctx->restart);
+ reinit_completion(&ctx->restart);
/* fall through*/
case -EINPROGRESS:
this_cc->req = NULL;
@@ -1274,9 +1450,12 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
static int crypt_setkey_allcpus(struct crypt_config *cc)
{
- unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count);
+ unsigned subkey_size;
int err = 0, i, r;
+ /* Ignore extra keys (which are used for IV etc) */
+ subkey_size = (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count);
+
for (i = 0; i < cc->tfms_count; i++) {
r = crypto_ablkcipher_setkey(cc->tfms[i],
cc->key + (i * subkey_size),
@@ -1409,6 +1588,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
return -EINVAL;
}
cc->key_parts = cc->tfms_count;
+ cc->key_extra_size = 0;
cc->cipher = kstrdup(cipher, GFP_KERNEL);
if (!cc->cipher)
@@ -1460,13 +1640,6 @@ static int crypt_ctr_cipher(struct dm_target *ti,
goto bad;
}
- /* Initialize and set key */
- ret = crypt_set_key(cc, key);
- if (ret < 0) {
- ti->error = "Error decoding and setting key";
- goto bad;
- }
-
/* Initialize IV */
cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc));
if (cc->iv_size)
@@ -1493,18 +1666,33 @@ static int crypt_ctr_cipher(struct dm_target *ti,
cc->iv_gen_ops = &crypt_iv_null_ops;
else if (strcmp(ivmode, "lmk") == 0) {
cc->iv_gen_ops = &crypt_iv_lmk_ops;
- /* Version 2 and 3 is recognised according
+ /*
+ * Version 2 and 3 is recognised according
* to length of provided multi-key string.
* If present (version 3), last key is used as IV seed.
+ * All keys (including IV seed) are always the same size.
*/
- if (cc->key_size % cc->key_parts)
+ if (cc->key_size % cc->key_parts) {
cc->key_parts++;
+ cc->key_extra_size = cc->key_size / cc->key_parts;
+ }
+ } else if (strcmp(ivmode, "tcw") == 0) {
+ cc->iv_gen_ops = &crypt_iv_tcw_ops;
+ cc->key_parts += 2; /* IV + whitening */
+ cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE;
} else {
ret = -EINVAL;
ti->error = "Invalid IV mode";
goto bad;
}
+ /* Initialize and set key */
+ ret = crypt_set_key(cc, key);
+ if (ret < 0) {
+ ti->error = "Error decoding and setting key";
+ goto bad;
+ }
+
/* Allocate IV */
if (cc->iv_gen_ops && cc->iv_gen_ops->ctr) {
ret = cc->iv_gen_ops->ctr(cc, ti, ivopts);
@@ -1817,7 +2005,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 12, 1},
+ .version = {1, 13, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index afe08146f73e..51521429fb59 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -57,7 +57,7 @@ struct vers_iter {
static struct list_head _name_buckets[NUM_BUCKETS];
static struct list_head _uuid_buckets[NUM_BUCKETS];
-static void dm_hash_remove_all(int keep_open_devices);
+static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred);
/*
* Guards access to both hash tables.
@@ -86,7 +86,7 @@ static int dm_hash_init(void)
static void dm_hash_exit(void)
{
- dm_hash_remove_all(0);
+ dm_hash_remove_all(false, false, false);
}
/*-----------------------------------------------------------------
@@ -276,7 +276,7 @@ static struct dm_table *__hash_remove(struct hash_cell *hc)
return table;
}
-static void dm_hash_remove_all(int keep_open_devices)
+static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred)
{
int i, dev_skipped;
struct hash_cell *hc;
@@ -293,7 +293,8 @@ retry:
md = hc->md;
dm_get(md);
- if (keep_open_devices && dm_lock_for_deletion(md)) {
+ if (keep_open_devices &&
+ dm_lock_for_deletion(md, mark_deferred, only_deferred)) {
dm_put(md);
dev_skipped++;
continue;
@@ -450,6 +451,11 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
return md;
}
+void dm_deferred_remove(void)
+{
+ dm_hash_remove_all(true, false, true);
+}
+
/*-----------------------------------------------------------------
* Implementation of the ioctl commands
*---------------------------------------------------------------*/
@@ -461,7 +467,7 @@ typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
static int remove_all(struct dm_ioctl *param, size_t param_size)
{
- dm_hash_remove_all(1);
+ dm_hash_remove_all(true, !!(param->flags & DM_DEFERRED_REMOVE), false);
param->data_size = 0;
return 0;
}
@@ -683,6 +689,9 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
if (dm_suspended_md(md))
param->flags |= DM_SUSPEND_FLAG;
+ if (dm_test_deferred_remove_flag(md))
+ param->flags |= DM_DEFERRED_REMOVE;
+
param->dev = huge_encode_dev(disk_devt(disk));
/*
@@ -832,8 +841,13 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
/*
* Ensure the device is not open and nothing further can open it.
*/
- r = dm_lock_for_deletion(md);
+ r = dm_lock_for_deletion(md, !!(param->flags & DM_DEFERRED_REMOVE), false);
if (r) {
+ if (r == -EBUSY && param->flags & DM_DEFERRED_REMOVE) {
+ up_write(&_hash_lock);
+ dm_put(md);
+ return 0;
+ }
DMDEBUG_LIMIT("unable to remove open device %s", hc->name);
up_write(&_hash_lock);
dm_put(md);
@@ -848,6 +862,8 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
dm_table_destroy(t);
}
+ param->flags &= ~DM_DEFERRED_REMOVE;
+
if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr))
param->flags |= DM_UEVENT_GENERATED_FLAG;
@@ -1469,6 +1485,14 @@ static int message_for_md(struct mapped_device *md, unsigned argc, char **argv,
if (**argv != '@')
return 2; /* no '@' prefix, deliver to target */
+ if (!strcasecmp(argv[0], "@cancel_deferred_remove")) {
+ if (argc != 1) {
+ DMERR("Invalid arguments for @cancel_deferred_remove");
+ return -EINVAL;
+ }
+ return dm_cancel_deferred_remove(md);
+ }
+
r = dm_stats_message(md, argc, argv, result, maxlen);
if (r < 2)
return r;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index de570a558764..6eb9dc9ef8f3 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -87,6 +87,7 @@ struct multipath {
unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */
unsigned saved_queue_if_no_path:1; /* Saved state during suspension */
unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
+ unsigned pg_init_disabled:1; /* pg_init is not currently allowed */
unsigned pg_init_retries; /* Number of times to retry pg_init */
unsigned pg_init_count; /* Number of times pg_init called */
@@ -390,13 +391,16 @@ static int map_io(struct multipath *m, struct request *clone,
if (was_queued)
m->queue_size--;
- if ((pgpath && m->queue_io) ||
- (!pgpath && m->queue_if_no_path)) {
+ if (m->pg_init_required) {
+ if (!m->pg_init_in_progress)
+ queue_work(kmultipathd, &m->process_queued_ios);
+ r = DM_MAPIO_REQUEUE;
+ } else if ((pgpath && m->queue_io) ||
+ (!pgpath && m->queue_if_no_path)) {
/* Queue for the daemon to resubmit */
list_add_tail(&clone->queuelist, &m->queued_ios);
m->queue_size++;
- if ((m->pg_init_required && !m->pg_init_in_progress) ||
- !m->queue_io)
+ if (!m->queue_io)
queue_work(kmultipathd, &m->process_queued_ios);
pgpath = NULL;
r = DM_MAPIO_SUBMITTED;
@@ -497,7 +501,8 @@ static void process_queued_ios(struct work_struct *work)
(!pgpath && !m->queue_if_no_path))
must_queue = 0;
- if (m->pg_init_required && !m->pg_init_in_progress && pgpath)
+ if (m->pg_init_required && !m->pg_init_in_progress && pgpath &&
+ !m->pg_init_disabled)
__pg_init_all_paths(m);
spin_unlock_irqrestore(&m->lock, flags);
@@ -942,10 +947,20 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m)
static void flush_multipath_work(struct multipath *m)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&m->lock, flags);
+ m->pg_init_disabled = 1;
+ spin_unlock_irqrestore(&m->lock, flags);
+
flush_workqueue(kmpath_handlerd);
multipath_wait_for_pg_init_completion(m);
flush_workqueue(kmultipathd);
flush_work(&m->trigger_event);
+
+ spin_lock_irqsave(&m->lock, flags);
+ m->pg_init_disabled = 0;
+ spin_unlock_irqrestore(&m->lock, flags);
}
static void multipath_dtr(struct dm_target *ti)
@@ -1164,7 +1179,7 @@ static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
spin_lock_irqsave(&m->lock, flags);
- if (m->pg_init_count <= m->pg_init_retries)
+ if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled)
m->pg_init_required = 1;
else
limit_reached = 1;
@@ -1665,6 +1680,11 @@ static int multipath_busy(struct dm_target *ti)
spin_lock_irqsave(&m->lock, flags);
+ /* pg_init in progress, requeue until done */
+ if (m->pg_init_in_progress) {
+ busy = 1;
+ goto out;
+ }
/* Guess which priority_group will be used at next mapping time */
if (unlikely(!m->current_pgpath && m->next_pg))
pg = m->next_pg;
@@ -1714,7 +1734,7 @@ out:
*---------------------------------------------------------------*/
static struct target_type multipath_target = {
.name = "multipath",
- .version = {1, 5, 1},
+ .version = {1, 6, 0},
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 8f8783533ac7..465f08ca62b1 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -545,14 +545,28 @@ static int adjoin(struct dm_table *table, struct dm_target *ti)
/*
* Used to dynamically allocate the arg array.
+ *
+ * We do first allocation with GFP_NOIO because dm-mpath and dm-thin must
+ * process messages even if some device is suspended. These messages have a
+ * small fixed number of arguments.
+ *
+ * On the other hand, dm-switch needs to process bulk data using messages and
+ * excessive use of GFP_NOIO could cause trouble.
*/
static char **realloc_argv(unsigned *array_size, char **old_argv)
{
char **argv;
unsigned new_size;
+ gfp_t gfp;
- new_size = *array_size ? *array_size * 2 : 64;
- argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL);
+ if (*array_size) {
+ new_size = *array_size * 2;
+ gfp = GFP_KERNEL;
+ } else {
+ new_size = 8;
+ gfp = GFP_NOIO;
+ }
+ argv = kmalloc(new_size * sizeof(*argv), gfp);
if (argv) {
memcpy(argv, old_argv, *array_size * sizeof(*argv));
*array_size = new_size;
@@ -1548,8 +1562,11 @@ int dm_table_resume_targets(struct dm_table *t)
continue;
r = ti->type->preresume(ti);
- if (r)
+ if (r) {
+ DMERR("%s: %s: preresume failed, error = %d",
+ dm_device_name(t->md), ti->type->name, r);
return r;
+ }
}
for (i = 0; i < t->num_targets; i++) {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b3e26c7d1417..0704c523a76b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -49,6 +49,11 @@ static unsigned int _major = 0;
static DEFINE_IDR(_minor_idr);
static DEFINE_SPINLOCK(_minor_lock);
+
+static void do_deferred_remove(struct work_struct *w);
+
+static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
+
/*
* For bio-based dm.
* One of these is allocated per bio.
@@ -116,6 +121,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_MERGE_IS_OPTIONAL 6
+#define DMF_DEFERRED_REMOVE 7
/*
* A dummy definition to make RCU happy.
@@ -299,6 +305,8 @@ out_free_io_cache:
static void local_exit(void)
{
+ flush_scheduled_work();
+
kmem_cache_destroy(_rq_tio_cache);
kmem_cache_destroy(_io_cache);
unregister_blkdev(_major, _name);
@@ -404,7 +412,10 @@ static void dm_blk_close(struct gendisk *disk, fmode_t mode)
spin_lock(&_minor_lock);
- atomic_dec(&md->open_count);
+ if (atomic_dec_and_test(&md->open_count) &&
+ (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
+ schedule_work(&deferred_remove_work);
+
dm_put(md);
spin_unlock(&_minor_lock);
@@ -418,14 +429,18 @@ int dm_open_count(struct mapped_device *md)
/*
* Guarantees nothing is using the device before it's deleted.
*/
-int dm_lock_for_deletion(struct mapped_device *md)
+int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
{
int r = 0;
spin_lock(&_minor_lock);
- if (dm_open_count(md))
+ if (dm_open_count(md)) {
r = -EBUSY;
+ if (mark_deferred)
+ set_bit(DMF_DEFERRED_REMOVE, &md->flags);
+ } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
+ r = -EEXIST;
else
set_bit(DMF_DELETING, &md->flags);
@@ -434,6 +449,27 @@ int dm_lock_for_deletion(struct mapped_device *md)
return r;
}
+int dm_cancel_deferred_remove(struct mapped_device *md)
+{
+ int r = 0;
+
+ spin_lock(&_minor_lock);
+
+ if (test_bit(DMF_DELETING, &md->flags))
+ r = -EBUSY;
+ else
+ clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
+
+ spin_unlock(&_minor_lock);
+
+ return r;
+}
+
+static void do_deferred_remove(struct work_struct *w)
+{
+ dm_deferred_remove();
+}
+
sector_t dm_get_size(struct mapped_device *md)
{
return get_capacity(md->disk);
@@ -2894,6 +2930,11 @@ int dm_suspended_md(struct mapped_device *md)
return test_bit(DMF_SUSPENDED, &md->flags);
}
+int dm_test_deferred_remove_flag(struct mapped_device *md)
+{
+ return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
+}
+
int dm_suspended(struct dm_target *ti)
{
return dm_suspended_md(dm_table_get_md(ti->table));
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 1d1ad7b7e527..c57ba550f69e 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -129,6 +129,16 @@ int dm_deleting_md(struct mapped_device *md);
int dm_suspended_md(struct mapped_device *md);
/*
+ * Test if the device is scheduled for deferred remove.
+ */
+int dm_test_deferred_remove_flag(struct mapped_device *md);
+
+/*
+ * Try to remove devices marked for deferred removal.
+ */
+void dm_deferred_remove(void);
+
+/*
* The device-mapper can be driven through one of two interfaces;
* ioctl or filesystem, depending which patch you have applied.
*/
@@ -158,7 +168,8 @@ void dm_stripe_exit(void);
void dm_destroy(struct mapped_device *md);
void dm_destroy_immediate(struct mapped_device *md);
int dm_open_count(struct mapped_device *md);
-int dm_lock_for_deletion(struct mapped_device *md);
+int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred);
+int dm_cancel_deferred_remove(struct mapped_device *md);
int dm_request_based(struct mapped_device *md);
sector_t dm_get_size(struct mapped_device *md);
struct dm_stats *dm_get_stats(struct mapped_device *md);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 752119068d66..21f4d7ff0da2 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -112,7 +112,7 @@ static inline int speed_max(struct mddev *mddev)
static struct ctl_table_header *raid_table_header;
-static ctl_table raid_table[] = {
+static struct ctl_table raid_table[] = {
{
.procname = "speed_limit_min",
.data = &sysctl_speed_limit_min,
@@ -130,7 +130,7 @@ static ctl_table raid_table[] = {
{ }
};
-static ctl_table raid_dir_table[] = {
+static struct ctl_table raid_dir_table[] = {
{
.procname = "raid",
.maxlen = 0,
@@ -140,7 +140,7 @@ static ctl_table raid_dir_table[] = {
{ }
};
-static ctl_table raid_root_table[] = {
+static struct ctl_table raid_root_table[] = {
{
.procname = "dev",
.maxlen = 0,
@@ -562,11 +562,19 @@ static struct mddev * mddev_find(dev_t unit)
goto retry;
}
-static inline int mddev_lock(struct mddev * mddev)
+static inline int __must_check mddev_lock(struct mddev * mddev)
{
return mutex_lock_interruptible(&mddev->reconfig_mutex);
}
+/* Sometimes we need to take the lock in a situation where
+ * failure due to interrupts is not acceptable.
+ */
+static inline void mddev_lock_nointr(struct mddev * mddev)
+{
+ mutex_lock(&mddev->reconfig_mutex);
+}
+
static inline int mddev_is_locked(struct mddev *mddev)
{
return mutex_is_locked(&mddev->reconfig_mutex);
@@ -768,16 +776,10 @@ void md_super_wait(struct mddev *mddev)
finish_wait(&mddev->sb_wait, &wq);
}
-static void bi_complete(struct bio *bio, int error)
-{
- complete((struct completion*)bio->bi_private);
-}
-
int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
struct page *page, int rw, bool metadata_op)
{
struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
- struct completion event;
int ret;
rw |= REQ_SYNC;
@@ -793,11 +795,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
else
bio->bi_sector = sector + rdev->data_offset;
bio_add_page(bio, page, size, 0);
- init_completion(&event);
- bio->bi_private = &event;
- bio->bi_end_io = bi_complete;
- submit_bio(rw, bio);
- wait_for_completion(&event);
+ submit_bio_wait(rw, bio);
ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
bio_put(bio);
@@ -2978,7 +2976,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
for_each_mddev(mddev, tmp) {
struct md_rdev *rdev2;
- mddev_lock(mddev);
+ mddev_lock_nointr(mddev);
rdev_for_each(rdev2, mddev)
if (rdev->bdev == rdev2->bdev &&
rdev != rdev2 &&
@@ -2994,7 +2992,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
break;
}
}
- mddev_lock(my_mddev);
+ mddev_lock_nointr(my_mddev);
if (overlap) {
/* Someone else could have slipped in a size
* change here, but doing so is just silly.
@@ -3515,7 +3513,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
printk(KERN_WARNING
"md: cannot register extra attributes for %s\n",
mdname(mddev));
- mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action");
+ mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
}
if (mddev->pers->sync_request != NULL &&
pers->sync_request == NULL) {
@@ -3580,6 +3578,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->in_sync = 1;
del_timer_sync(&mddev->safemode_timer);
}
+ blk_set_stacking_limits(&mddev->queue->limits);
pers->run(mddev);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
mddev_resume(mddev);
@@ -5258,7 +5257,7 @@ static void __md_stop_writes(struct mddev *mddev)
void md_stop_writes(struct mddev *mddev)
{
- mddev_lock(mddev);
+ mddev_lock_nointr(mddev);
__md_stop_writes(mddev);
mddev_unlock(mddev);
}
@@ -5291,20 +5290,35 @@ EXPORT_SYMBOL_GPL(md_stop);
static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
{
int err = 0;
+ int did_freeze = 0;
+
+ if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
+ did_freeze = 1;
+ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ }
+ if (mddev->sync_thread) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ /* Thread might be blocked waiting for metadata update
+ * which will now never happen */
+ wake_up_process(mddev->sync_thread->tsk);
+ }
+ mddev_unlock(mddev);
+ wait_event(resync_wait, mddev->sync_thread == NULL);
+ mddev_lock_nointr(mddev);
+
mutex_lock(&mddev->open_mutex);
- if (atomic_read(&mddev->openers) > !!bdev) {
+ if (atomic_read(&mddev->openers) > !!bdev ||
+ mddev->sync_thread ||
+ (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
printk("md: %s still in use.\n",mdname(mddev));
+ if (did_freeze) {
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ }
err = -EBUSY;
goto out;
}
- if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
- /* Someone opened the device since we flushed it
- * so page cache could be dirty and it is too late
- * to flush. So abort
- */
- mutex_unlock(&mddev->open_mutex);
- return -EBUSY;
- }
if (mddev->pers) {
__md_stop_writes(mddev);
@@ -5315,7 +5329,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
set_disk_ro(mddev->gendisk, 1);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
sysfs_notify_dirent_safe(mddev->sysfs_state);
- err = 0;
+ err = 0;
}
out:
mutex_unlock(&mddev->open_mutex);
@@ -5331,20 +5345,34 @@ static int do_md_stop(struct mddev * mddev, int mode,
{
struct gendisk *disk = mddev->gendisk;
struct md_rdev *rdev;
+ int did_freeze = 0;
+
+ if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
+ did_freeze = 1;
+ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ }
+ if (mddev->sync_thread) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ /* Thread might be blocked waiting for metadata update
+ * which will now never happen */
+ wake_up_process(mddev->sync_thread->tsk);
+ }
+ mddev_unlock(mddev);
+ wait_event(resync_wait, mddev->sync_thread == NULL);
+ mddev_lock_nointr(mddev);
mutex_lock(&mddev->open_mutex);
if (atomic_read(&mddev->openers) > !!bdev ||
- mddev->sysfs_active) {
+ mddev->sysfs_active ||
+ mddev->sync_thread ||
+ (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
printk("md: %s still in use.\n",mdname(mddev));
mutex_unlock(&mddev->open_mutex);
- return -EBUSY;
- }
- if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) {
- /* Someone opened the device since we flushed it
- * so page cache could be dirty and it is too late
- * to flush. So abort
- */
- mutex_unlock(&mddev->open_mutex);
+ if (did_freeze) {
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ }
return -EBUSY;
}
if (mddev->pers) {
@@ -6551,7 +6579,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
wait_event(mddev->sb_wait,
!test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
!test_bit(MD_CHANGE_PENDING, &mddev->flags));
- mddev_lock(mddev);
+ mddev_lock_nointr(mddev);
}
} else {
err = -EROFS;
@@ -7361,9 +7389,6 @@ void md_do_sync(struct md_thread *thread)
mddev->curr_resync = 2;
try_again:
- if (kthread_should_stop())
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
goto skip;
for_each_mddev(mddev2, tmp) {
@@ -7388,7 +7413,7 @@ void md_do_sync(struct md_thread *thread)
* be caught by 'softlockup'
*/
prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
- if (!kthread_should_stop() &&
+ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
mddev2->curr_resync >= mddev->curr_resync) {
printk(KERN_INFO "md: delaying %s of %s"
" until %s has finished (they"
@@ -7464,7 +7489,7 @@ void md_do_sync(struct md_thread *thread)
last_check = 0;
if (j>2) {
- printk(KERN_INFO
+ printk(KERN_INFO
"md: resuming %s of %s from checkpoint.\n",
desc, mdname(mddev));
mddev->curr_resync = j;
@@ -7501,7 +7526,8 @@ void md_do_sync(struct md_thread *thread)
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
- while (j >= mddev->resync_max && !kthread_should_stop()) {
+ while (j >= mddev->resync_max &&
+ !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* As this condition is controlled by user-space,
* we can block indefinitely, so use '_interruptible'
* to avoid triggering warnings.
@@ -7509,17 +7535,18 @@ void md_do_sync(struct md_thread *thread)
flush_signals(current); /* just in case */
wait_event_interruptible(mddev->recovery_wait,
mddev->resync_max > j
- || kthread_should_stop());
+ || test_bit(MD_RECOVERY_INTR,
+ &mddev->recovery));
}
- if (kthread_should_stop())
- goto interrupted;
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+ break;
sectors = mddev->pers->sync_request(mddev, j, &skipped,
currspeed < speed_min(mddev));
if (sectors == 0) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- goto out;
+ break;
}
if (!skipped) { /* actual IO requested */
@@ -7556,10 +7583,8 @@ void md_do_sync(struct md_thread *thread)
last_mark = next;
}
-
- if (kthread_should_stop())
- goto interrupted;
-
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+ break;
/*
* this loop exits only if either when we are slower than
@@ -7582,11 +7607,12 @@ void md_do_sync(struct md_thread *thread)
}
}
}
- printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
+ printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc,
+ test_bit(MD_RECOVERY_INTR, &mddev->recovery)
+ ? "interrupted" : "done");
/*
* this also signals 'finished resyncing' to md_stop
*/
- out:
blk_finish_plug(&plug);
wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
@@ -7640,16 +7666,6 @@ void md_do_sync(struct md_thread *thread)
set_bit(MD_RECOVERY_DONE, &mddev->recovery);
md_wakeup_thread(mddev->thread);
return;
-
- interrupted:
- /*
- * got a signal, exit.
- */
- printk(KERN_INFO
- "md: md_do_sync() got signal ... exiting\n");
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- goto out;
-
}
EXPORT_SYMBOL_GPL(md_do_sync);
@@ -7751,7 +7767,7 @@ void md_check_recovery(struct mddev *mddev)
if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
return;
if ( ! (
- (mddev->flags & ~ (1<<MD_CHANGE_PENDING)) ||
+ (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
(mddev->external == 0 && mddev->safemode == 1) ||
@@ -7894,6 +7910,7 @@ void md_reap_sync_thread(struct mddev *mddev)
/* resync has finished, collect result */
md_unregister_thread(&mddev->sync_thread);
+ wake_up(&resync_wait);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
/* success...*/
diff --git a/drivers/md/md.h b/drivers/md/md.h
index c96456c16700..2f5cc8a7ef3e 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -501,7 +501,7 @@ extern struct attribute_group md_bitmap_group;
static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name)
{
if (sd)
- return sysfs_get_dirent(sd, NULL, name);
+ return sysfs_get_dirent(sd, name);
return sd;
}
static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd)
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index 172147eb1d40..af96e24ec328 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -509,15 +509,18 @@ static int grow_add_tail_block(struct resize *resize)
static int grow_needs_more_blocks(struct resize *resize)
{
int r;
+ unsigned old_nr_blocks = resize->old_nr_full_blocks;
if (resize->old_nr_entries_in_last_block > 0) {
+ old_nr_blocks++;
+
r = grow_extend_tail_block(resize, resize->max_entries);
if (r)
return r;
}
r = insert_full_ablocks(resize->info, resize->size_of_block,
- resize->old_nr_full_blocks,
+ old_nr_blocks,
resize->new_nr_full_blocks,
resize->max_entries, resize->value,
&resize->root);
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
index e735a6d5a793..cfbf9617e465 100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -140,26 +140,10 @@ static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b)
static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
{
- int r;
- uint32_t old_count;
enum allocation_event ev;
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
- r = sm_ll_dec(&smd->ll, b, &ev);
- if (!r && (ev == SM_FREE)) {
- /*
- * It's only free if it's also free in the last
- * transaction.
- */
- r = sm_ll_lookup(&smd->old_ll, b, &old_count);
- if (r)
- return r;
-
- if (!old_count)
- smd->nr_allocated_this_transaction--;
- }
-
- return r;
+ return sm_ll_dec(&smd->ll, b, &ev);
}
static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index af6681b19776..1e5a540995e9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -66,7 +66,8 @@
*/
static int max_queued_requests = 1024;
-static void allow_barrier(struct r1conf *conf);
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+ sector_t bi_sector);
static void lower_barrier(struct r1conf *conf);
static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
@@ -84,10 +85,12 @@ static void r1bio_pool_free(void *r1_bio, void *data)
}
#define RESYNC_BLOCK_SIZE (64*1024)
-//#define RESYNC_BLOCK_SIZE PAGE_SIZE
+#define RESYNC_DEPTH 32
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
-#define RESYNC_WINDOW (2048*1024)
+#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
+#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
+#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
{
@@ -225,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio)
struct bio *bio = r1_bio->master_bio;
int done;
struct r1conf *conf = r1_bio->mddev->private;
+ sector_t start_next_window = r1_bio->start_next_window;
+ sector_t bi_sector = bio->bi_sector;
if (bio->bi_phys_segments) {
unsigned long flags;
@@ -232,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio)
bio->bi_phys_segments--;
done = (bio->bi_phys_segments == 0);
spin_unlock_irqrestore(&conf->device_lock, flags);
+ /*
+ * make_request() might be waiting for
+ * bi_phys_segments to decrease
+ */
+ wake_up(&conf->wait_barrier);
} else
done = 1;
@@ -243,7 +253,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
* Wake up any possible resync thread that waits for the device
* to go idle.
*/
- allow_barrier(conf);
+ allow_barrier(conf, start_next_window, bi_sector);
}
}
@@ -814,8 +824,6 @@ static void flush_pending_writes(struct r1conf *conf)
* there is no normal IO happeing. It must arrange to call
* lower_barrier when the particular background IO completes.
*/
-#define RESYNC_DEPTH 32
-
static void raise_barrier(struct r1conf *conf)
{
spin_lock_irq(&conf->resync_lock);
@@ -827,9 +835,19 @@ static void raise_barrier(struct r1conf *conf)
/* block any new IO from starting */
conf->barrier++;
- /* Now wait for all pending IO to complete */
+ /* For these conditions we must wait:
+ * A: while the array is in frozen state
+ * B: while barrier >= RESYNC_DEPTH, meaning resync reach
+ * the max count which allowed.
+ * C: next_resync + RESYNC_SECTORS > start_next_window, meaning
+ * next resync will reach to the window which normal bios are
+ * handling.
+ */
wait_event_lock_irq(conf->wait_barrier,
- !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
+ !conf->array_frozen &&
+ conf->barrier < RESYNC_DEPTH &&
+ (conf->start_next_window >=
+ conf->next_resync + RESYNC_SECTORS),
conf->resync_lock);
spin_unlock_irq(&conf->resync_lock);
@@ -845,10 +863,33 @@ static void lower_barrier(struct r1conf *conf)
wake_up(&conf->wait_barrier);
}
-static void wait_barrier(struct r1conf *conf)
+static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
{
+ bool wait = false;
+
+ if (conf->array_frozen || !bio)
+ wait = true;
+ else if (conf->barrier && bio_data_dir(bio) == WRITE) {
+ if (conf->next_resync < RESYNC_WINDOW_SECTORS)
+ wait = true;
+ else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
+ >= bio_end_sector(bio)) ||
+ (conf->next_resync + NEXT_NORMALIO_DISTANCE
+ <= bio->bi_sector))
+ wait = false;
+ else
+ wait = true;
+ }
+
+ return wait;
+}
+
+static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
+{
+ sector_t sector = 0;
+
spin_lock_irq(&conf->resync_lock);
- if (conf->barrier) {
+ if (need_to_wait_for_sync(conf, bio)) {
conf->nr_waiting++;
/* Wait for the barrier to drop.
* However if there are already pending
@@ -860,22 +901,67 @@ static void wait_barrier(struct r1conf *conf)
* count down.
*/
wait_event_lock_irq(conf->wait_barrier,
- !conf->barrier ||
- (conf->nr_pending &&
+ !conf->array_frozen &&
+ (!conf->barrier ||
+ ((conf->start_next_window <
+ conf->next_resync + RESYNC_SECTORS) &&
current->bio_list &&
- !bio_list_empty(current->bio_list)),
+ !bio_list_empty(current->bio_list))),
conf->resync_lock);
conf->nr_waiting--;
}
+
+ if (bio && bio_data_dir(bio) == WRITE) {
+ if (conf->next_resync + NEXT_NORMALIO_DISTANCE
+ <= bio->bi_sector) {
+ if (conf->start_next_window == MaxSector)
+ conf->start_next_window =
+ conf->next_resync +
+ NEXT_NORMALIO_DISTANCE;
+
+ if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
+ <= bio->bi_sector)
+ conf->next_window_requests++;
+ else
+ conf->current_window_requests++;
+ }
+ if (bio->bi_sector >= conf->start_next_window)
+ sector = conf->start_next_window;
+ }
+
conf->nr_pending++;
spin_unlock_irq(&conf->resync_lock);
+ return sector;
}
-static void allow_barrier(struct r1conf *conf)
+static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
+ sector_t bi_sector)
{
unsigned long flags;
+
spin_lock_irqsave(&conf->resync_lock, flags);
conf->nr_pending--;
+ if (start_next_window) {
+ if (start_next_window == conf->start_next_window) {
+ if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
+ <= bi_sector)
+ conf->next_window_requests--;
+ else
+ conf->current_window_requests--;
+ } else
+ conf->current_window_requests--;
+
+ if (!conf->current_window_requests) {
+ if (conf->next_window_requests) {
+ conf->current_window_requests =
+ conf->next_window_requests;
+ conf->next_window_requests = 0;
+ conf->start_next_window +=
+ NEXT_NORMALIO_DISTANCE;
+ } else
+ conf->start_next_window = MaxSector;
+ }
+ }
spin_unlock_irqrestore(&conf->resync_lock, flags);
wake_up(&conf->wait_barrier);
}
@@ -884,8 +970,7 @@ static void freeze_array(struct r1conf *conf, int extra)
{
/* stop syncio and normal IO and wait for everything to
* go quite.
- * We increment barrier and nr_waiting, and then
- * wait until nr_pending match nr_queued+extra
+ * We wait until nr_pending match nr_queued+extra
* This is called in the context of one normal IO request
* that has failed. Thus any sync request that might be pending
* will be blocked by nr_pending, and we need to wait for
@@ -895,8 +980,7 @@ static void freeze_array(struct r1conf *conf, int extra)
* we continue.
*/
spin_lock_irq(&conf->resync_lock);
- conf->barrier++;
- conf->nr_waiting++;
+ conf->array_frozen = 1;
wait_event_lock_irq_cmd(conf->wait_barrier,
conf->nr_pending == conf->nr_queued+extra,
conf->resync_lock,
@@ -907,8 +991,7 @@ static void unfreeze_array(struct r1conf *conf)
{
/* reverse the effect of the freeze */
spin_lock_irq(&conf->resync_lock);
- conf->barrier--;
- conf->nr_waiting--;
+ conf->array_frozen = 0;
wake_up(&conf->wait_barrier);
spin_unlock_irq(&conf->resync_lock);
}
@@ -1013,6 +1096,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
int first_clone;
int sectors_handled;
int max_sectors;
+ sector_t start_next_window;
/*
* Register the new request and wait if the reconstruction
@@ -1042,7 +1126,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
finish_wait(&conf->wait_barrier, &w);
}
- wait_barrier(conf);
+ start_next_window = wait_barrier(conf, bio);
bitmap = mddev->bitmap;
@@ -1163,6 +1247,7 @@ read_again:
disks = conf->raid_disks * 2;
retry_write:
+ r1_bio->start_next_window = start_next_window;
blocked_rdev = NULL;
rcu_read_lock();
max_sectors = r1_bio->sectors;
@@ -1231,14 +1316,24 @@ read_again:
if (unlikely(blocked_rdev)) {
/* Wait for this device to become unblocked */
int j;
+ sector_t old = start_next_window;
for (j = 0; j < i; j++)
if (r1_bio->bios[j])
rdev_dec_pending(conf->mirrors[j].rdev, mddev);
r1_bio->state = 0;
- allow_barrier(conf);
+ allow_barrier(conf, start_next_window, bio->bi_sector);
md_wait_for_blocked_rdev(blocked_rdev, mddev);
- wait_barrier(conf);
+ start_next_window = wait_barrier(conf, bio);
+ /*
+ * We must make sure the multi r1bios of bio have
+ * the same value of bi_phys_segments
+ */
+ if (bio->bi_phys_segments && old &&
+ old != start_next_window)
+ /* Wait for the former r1bio(s) to complete */
+ wait_event(conf->wait_barrier,
+ bio->bi_phys_segments == 1);
goto retry_write;
}
@@ -1438,11 +1533,14 @@ static void print_conf(struct r1conf *conf)
static void close_sync(struct r1conf *conf)
{
- wait_barrier(conf);
- allow_barrier(conf);
+ wait_barrier(conf, NULL);
+ allow_barrier(conf, 0, 0);
mempool_destroy(conf->r1buf_pool);
conf->r1buf_pool = NULL;
+
+ conf->next_resync = 0;
+ conf->start_next_window = MaxSector;
}
static int raid1_spare_active(struct mddev *mddev)
@@ -2714,6 +2812,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
conf->pending_count = 0;
conf->recovery_disabled = mddev->recovery_disabled - 1;
+ conf->start_next_window = MaxSector;
+ conf->current_window_requests = conf->next_window_requests = 0;
+
err = -EIO;
for (i = 0; i < conf->raid_disks * 2; i++) {
@@ -2871,8 +2972,8 @@ static int stop(struct mddev *mddev)
atomic_read(&bitmap->behind_writes) == 0);
}
- raise_barrier(conf);
- lower_barrier(conf);
+ freeze_array(conf, 0);
+ unfreeze_array(conf);
md_unregister_thread(&mddev->thread);
if (conf->r1bio_pool)
@@ -3031,10 +3132,10 @@ static void raid1_quiesce(struct mddev *mddev, int state)
wake_up(&conf->wait_barrier);
break;
case 1:
- raise_barrier(conf);
+ freeze_array(conf, 0);
break;
case 0:
- lower_barrier(conf);
+ unfreeze_array(conf);
break;
}
}
@@ -3051,7 +3152,8 @@ static void *raid1_takeover(struct mddev *mddev)
mddev->new_chunk_sectors = 0;
conf = setup_conf(mddev);
if (!IS_ERR(conf))
- conf->barrier = 1;
+ /* Array must appear to be quiesced */
+ conf->array_frozen = 1;
return conf;
}
return ERR_PTR(-EINVAL);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 0ff3715fb7eb..9bebca7bff2f 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -41,6 +41,19 @@ struct r1conf {
*/
sector_t next_resync;
+ /* When raid1 starts resync, we divide array into four partitions
+ * |---------|--------------|---------------------|-------------|
+ * next_resync start_next_window end_window
+ * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE
+ * end_window = start_next_window + NEXT_NORMALIO_DISTANCE
+ * current_window_requests means the count of normalIO between
+ * start_next_window and end_window.
+ * next_window_requests means the count of normalIO after end_window.
+ * */
+ sector_t start_next_window;
+ int current_window_requests;
+ int next_window_requests;
+
spinlock_t device_lock;
/* list of 'struct r1bio' that need to be processed by raid1d,
@@ -65,6 +78,7 @@ struct r1conf {
int nr_waiting;
int nr_queued;
int barrier;
+ int array_frozen;
/* Set to 1 if a full sync is needed, (fresh device added).
* Cleared when a sync completes.
@@ -111,6 +125,7 @@ struct r1bio {
* in this BehindIO request
*/
sector_t sector;
+ sector_t start_next_window;
int sectors;
unsigned long state;
struct mddev *mddev;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7c3508abb5e1..c504e8389e69 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4384,7 +4384,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread);
wait_event(mddev->sb_wait, mddev->flags == 0 ||
- kthread_should_stop());
+ test_bit(MD_RECOVERY_INTR, &mddev->recovery));
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+ allow_barrier(conf);
+ return sectors_done;
+ }
conf->reshape_safe = mddev->reshape_position;
allow_barrier(conf);
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f8b906843926..cc055da02e2a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -85,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
return &conf->stripe_hashtbl[hash];
}
+static inline int stripe_hash_locks_hash(sector_t sect)
+{
+ return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
+}
+
+static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
+{
+ spin_lock_irq(conf->hash_locks + hash);
+ spin_lock(&conf->device_lock);
+}
+
+static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
+{
+ spin_unlock(&conf->device_lock);
+ spin_unlock_irq(conf->hash_locks + hash);
+}
+
+static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
+{
+ int i;
+ local_irq_disable();
+ spin_lock(conf->hash_locks);
+ for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
+ spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
+ spin_lock(&conf->device_lock);
+}
+
+static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
+{
+ int i;
+ spin_unlock(&conf->device_lock);
+ for (i = NR_STRIPE_HASH_LOCKS; i; i--)
+ spin_unlock(conf->hash_locks + i - 1);
+ local_irq_enable();
+}
+
/* bio's attached to a stripe+device for I/O are linked together in bi_sector
* order without overlap. There may be several bio's per stripe+device, and
* a bio could span several devices.
@@ -249,7 +285,8 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
}
}
-static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
+static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
+ struct list_head *temp_inactive_list)
{
BUG_ON(!list_empty(&sh->lru));
BUG_ON(atomic_read(&conf->active_stripes)==0);
@@ -278,37 +315,68 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
< IO_THRESHOLD)
md_wakeup_thread(conf->mddev->thread);
atomic_dec(&conf->active_stripes);
- if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
- list_add_tail(&sh->lru, &conf->inactive_list);
- wake_up(&conf->wait_for_stripe);
- if (conf->retry_read_aligned)
- md_wakeup_thread(conf->mddev->thread);
- }
+ if (!test_bit(STRIPE_EXPANDING, &sh->state))
+ list_add_tail(&sh->lru, temp_inactive_list);
}
}
-static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
+static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
+ struct list_head *temp_inactive_list)
{
if (atomic_dec_and_test(&sh->count))
- do_release_stripe(conf, sh);
+ do_release_stripe(conf, sh, temp_inactive_list);
}
-static struct llist_node *llist_reverse_order(struct llist_node *head)
+/*
+ * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list
+ *
+ * Be careful: Only one task can add/delete stripes from temp_inactive_list at
+ * given time. Adding stripes only takes device lock, while deleting stripes
+ * only takes hash lock.
+ */
+static void release_inactive_stripe_list(struct r5conf *conf,
+ struct list_head *temp_inactive_list,
+ int hash)
{
- struct llist_node *new_head = NULL;
+ int size;
+ bool do_wakeup = false;
+ unsigned long flags;
- while (head) {
- struct llist_node *tmp = head;
- head = head->next;
- tmp->next = new_head;
- new_head = tmp;
+ if (hash == NR_STRIPE_HASH_LOCKS) {
+ size = NR_STRIPE_HASH_LOCKS;
+ hash = NR_STRIPE_HASH_LOCKS - 1;
+ } else
+ size = 1;
+ while (size) {
+ struct list_head *list = &temp_inactive_list[size - 1];
+
+ /*
+ * We don't hold any lock here yet, get_active_stripe() might
+ * remove stripes from the list
+ */
+ if (!list_empty_careful(list)) {
+ spin_lock_irqsave(conf->hash_locks + hash, flags);
+ if (list_empty(conf->inactive_list + hash) &&
+ !list_empty(list))
+ atomic_dec(&conf->empty_inactive_list_nr);
+ list_splice_tail_init(list, conf->inactive_list + hash);
+ do_wakeup = true;
+ spin_unlock_irqrestore(conf->hash_locks + hash, flags);
+ }
+ size--;
+ hash--;
}
- return new_head;
+ if (do_wakeup) {
+ wake_up(&conf->wait_for_stripe);
+ if (conf->retry_read_aligned)
+ md_wakeup_thread(conf->mddev->thread);
+ }
}
/* should hold conf->device_lock already */
-static int release_stripe_list(struct r5conf *conf)
+static int release_stripe_list(struct r5conf *conf,
+ struct list_head *temp_inactive_list)
{
struct stripe_head *sh;
int count = 0;
@@ -317,6 +385,8 @@ static int release_stripe_list(struct r5conf *conf)
head = llist_del_all(&conf->released_stripes);
head = llist_reverse_order(head);
while (head) {
+ int hash;
+
sh = llist_entry(head, struct stripe_head, release_list);
head = llist_next(head);
/* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */
@@ -327,7 +397,8 @@ static int release_stripe_list(struct r5conf *conf)
* again, the count is always > 1. This is true for
* STRIPE_ON_UNPLUG_LIST bit too.
*/
- __release_stripe(conf, sh);
+ hash = sh->hash_lock_index;
+ __release_stripe(conf, sh, &temp_inactive_list[hash]);
count++;
}
@@ -338,9 +409,12 @@ static void release_stripe(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
unsigned long flags;
+ struct list_head list;
+ int hash;
bool wakeup;
- if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
+ if (unlikely(!conf->mddev->thread) ||
+ test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
goto slow_path;
wakeup = llist_add(&sh->release_list, &conf->released_stripes);
if (wakeup)
@@ -350,8 +424,11 @@ slow_path:
local_irq_save(flags);
/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
- do_release_stripe(conf, sh);
+ INIT_LIST_HEAD(&list);
+ hash = sh->hash_lock_index;
+ do_release_stripe(conf, sh, &list);
spin_unlock(&conf->device_lock);
+ release_inactive_stripe_list(conf, &list, hash);
}
local_irq_restore(flags);
}
@@ -376,18 +453,21 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
/* find an idle stripe, make sure it is unhashed, and return it. */
-static struct stripe_head *get_free_stripe(struct r5conf *conf)
+static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
{
struct stripe_head *sh = NULL;
struct list_head *first;
- if (list_empty(&conf->inactive_list))
+ if (list_empty(conf->inactive_list + hash))
goto out;
- first = conf->inactive_list.next;
+ first = (conf->inactive_list + hash)->next;
sh = list_entry(first, struct stripe_head, lru);
list_del_init(first);
remove_hash(sh);
atomic_inc(&conf->active_stripes);
+ BUG_ON(hash != sh->hash_lock_index);
+ if (list_empty(conf->inactive_list + hash))
+ atomic_inc(&conf->empty_inactive_list_nr);
out:
return sh;
}
@@ -430,7 +510,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
{
struct r5conf *conf = sh->raid_conf;
- int i;
+ int i, seq;
BUG_ON(atomic_read(&sh->count) != 0);
BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
@@ -440,7 +520,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
(unsigned long long)sh->sector);
remove_hash(sh);
-
+retry:
+ seq = read_seqcount_begin(&conf->gen_lock);
sh->generation = conf->generation - previous;
sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
sh->sector = sector;
@@ -462,6 +543,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
dev->flags = 0;
raid5_build_block(sh, i, previous);
}
+ if (read_seqcount_retry(&conf->gen_lock, seq))
+ goto retry;
insert_hash(conf, sh);
sh->cpu = smp_processor_id();
}
@@ -566,57 +649,59 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce)
{
struct stripe_head *sh;
+ int hash = stripe_hash_locks_hash(sector);
pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
- spin_lock_irq(&conf->device_lock);
+ spin_lock_irq(conf->hash_locks + hash);
do {
wait_event_lock_irq(conf->wait_for_stripe,
conf->quiesce == 0 || noquiesce,
- conf->device_lock);
+ *(conf->hash_locks + hash));
sh = __find_stripe(conf, sector, conf->generation - previous);
if (!sh) {
if (!conf->inactive_blocked)
- sh = get_free_stripe(conf);
+ sh = get_free_stripe(conf, hash);
if (noblock && sh == NULL)
break;
if (!sh) {
conf->inactive_blocked = 1;
- wait_event_lock_irq(conf->wait_for_stripe,
- !list_empty(&conf->inactive_list) &&
- (atomic_read(&conf->active_stripes)
- < (conf->max_nr_stripes *3/4)
- || !conf->inactive_blocked),
- conf->device_lock);
+ wait_event_lock_irq(
+ conf->wait_for_stripe,
+ !list_empty(conf->inactive_list + hash) &&
+ (atomic_read(&conf->active_stripes)
+ < (conf->max_nr_stripes * 3 / 4)
+ || !conf->inactive_blocked),
+ *(conf->hash_locks + hash));
conf->inactive_blocked = 0;
} else
init_stripe(sh, sector, previous);
} else {
+ spin_lock(&conf->device_lock);
if (atomic_read(&sh->count)) {
BUG_ON(!list_empty(&sh->lru)
&& !test_bit(STRIPE_EXPANDING, &sh->state)
&& !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)
- && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
+ );
} else {
if (!test_bit(STRIPE_HANDLE, &sh->state))
atomic_inc(&conf->active_stripes);
- if (list_empty(&sh->lru) &&
- !test_bit(STRIPE_EXPANDING, &sh->state))
- BUG();
+ BUG_ON(list_empty(&sh->lru));
list_del_init(&sh->lru);
if (sh->group) {
sh->group->stripes_cnt--;
sh->group = NULL;
}
}
+ spin_unlock(&conf->device_lock);
}
} while (sh == NULL);
if (sh)
atomic_inc(&sh->count);
- spin_unlock_irq(&conf->device_lock);
+ spin_unlock_irq(conf->hash_locks + hash);
return sh;
}
@@ -772,7 +857,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
bi->bi_sector = (sh->sector
+ rdev->data_offset);
if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
- bi->bi_rw |= REQ_FLUSH;
+ bi->bi_rw |= REQ_NOMERGE;
bi->bi_vcnt = 1;
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -1596,7 +1681,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
put_cpu();
}
-static int grow_one_stripe(struct r5conf *conf)
+static int grow_one_stripe(struct r5conf *conf, int hash)
{
struct stripe_head *sh;
sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
@@ -1612,6 +1697,7 @@ static int grow_one_stripe(struct r5conf *conf)
kmem_cache_free(conf->slab_cache, sh);
return 0;
}
+ sh->hash_lock_index = hash;
/* we just created an active stripe so... */
atomic_set(&sh->count, 1);
atomic_inc(&conf->active_stripes);
@@ -1624,6 +1710,7 @@ static int grow_stripes(struct r5conf *conf, int num)
{
struct kmem_cache *sc;
int devs = max(conf->raid_disks, conf->previous_raid_disks);
+ int hash;
if (conf->mddev->gendisk)
sprintf(conf->cache_name[0],
@@ -1641,9 +1728,13 @@ static int grow_stripes(struct r5conf *conf, int num)
return 1;
conf->slab_cache = sc;
conf->pool_size = devs;
- while (num--)
- if (!grow_one_stripe(conf))
+ hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
+ while (num--) {
+ if (!grow_one_stripe(conf, hash))
return 1;
+ conf->max_nr_stripes++;
+ hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
+ }
return 0;
}
@@ -1701,6 +1792,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
int err;
struct kmem_cache *sc;
int i;
+ int hash, cnt;
if (newsize <= conf->pool_size)
return 0; /* never bother to shrink */
@@ -1740,19 +1832,29 @@ static int resize_stripes(struct r5conf *conf, int newsize)
* OK, we have enough stripes, start collecting inactive
* stripes and copying them over
*/
+ hash = 0;
+ cnt = 0;
list_for_each_entry(nsh, &newstripes, lru) {
- spin_lock_irq(&conf->device_lock);
- wait_event_lock_irq(conf->wait_for_stripe,
- !list_empty(&conf->inactive_list),
- conf->device_lock);
- osh = get_free_stripe(conf);
- spin_unlock_irq(&conf->device_lock);
+ lock_device_hash_lock(conf, hash);
+ wait_event_cmd(conf->wait_for_stripe,
+ !list_empty(conf->inactive_list + hash),
+ unlock_device_hash_lock(conf, hash),
+ lock_device_hash_lock(conf, hash));
+ osh = get_free_stripe(conf, hash);
+ unlock_device_hash_lock(conf, hash);
atomic_set(&nsh->count, 1);
for(i=0; i<conf->pool_size; i++)
nsh->dev[i].page = osh->dev[i].page;
for( ; i<newsize; i++)
nsh->dev[i].page = NULL;
+ nsh->hash_lock_index = hash;
kmem_cache_free(conf->slab_cache, osh);
+ cnt++;
+ if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
+ !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
+ hash++;
+ cnt = 0;
+ }
}
kmem_cache_destroy(conf->slab_cache);
@@ -1811,13 +1913,13 @@ static int resize_stripes(struct r5conf *conf, int newsize)
return err;
}
-static int drop_one_stripe(struct r5conf *conf)
+static int drop_one_stripe(struct r5conf *conf, int hash)
{
struct stripe_head *sh;
- spin_lock_irq(&conf->device_lock);
- sh = get_free_stripe(conf);
- spin_unlock_irq(&conf->device_lock);
+ spin_lock_irq(conf->hash_locks + hash);
+ sh = get_free_stripe(conf, hash);
+ spin_unlock_irq(conf->hash_locks + hash);
if (!sh)
return 0;
BUG_ON(atomic_read(&sh->count));
@@ -1829,8 +1931,10 @@ static int drop_one_stripe(struct r5conf *conf)
static void shrink_stripes(struct r5conf *conf)
{
- while (drop_one_stripe(conf))
- ;
+ int hash;
+ for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
+ while (drop_one_stripe(conf, hash))
+ ;
if (conf->slab_cache)
kmem_cache_destroy(conf->slab_cache);
@@ -1935,6 +2039,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
mdname(conf->mddev), bdn);
else
retry = 1;
+ if (set_bad && test_bit(In_sync, &rdev->flags)
+ && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
+ retry = 1;
if (retry)
if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
set_bit(R5_ReadError, &sh->dev[i].flags);
@@ -3914,7 +4021,8 @@ static void raid5_activate_delayed(struct r5conf *conf)
}
}
-static void activate_bit_delay(struct r5conf *conf)
+static void activate_bit_delay(struct r5conf *conf,
+ struct list_head *temp_inactive_list)
{
/* device_lock is held */
struct list_head head;
@@ -3922,9 +4030,11 @@ static void activate_bit_delay(struct r5conf *conf)
list_del_init(&conf->bitmap_list);
while (!list_empty(&head)) {
struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
+ int hash;
list_del_init(&sh->lru);
atomic_inc(&sh->count);
- __release_stripe(conf, sh);
+ hash = sh->hash_lock_index;
+ __release_stripe(conf, sh, &temp_inactive_list[hash]);
}
}
@@ -3940,7 +4050,7 @@ int md_raid5_congested(struct mddev *mddev, int bits)
return 1;
if (conf->quiesce)
return 1;
- if (list_empty_careful(&conf->inactive_list))
+ if (atomic_read(&conf->empty_inactive_list_nr))
return 1;
return 0;
@@ -4270,6 +4380,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
struct raid5_plug_cb {
struct blk_plug_cb cb;
struct list_head list;
+ struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
};
static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
@@ -4280,6 +4391,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
struct mddev *mddev = cb->cb.data;
struct r5conf *conf = mddev->private;
int cnt = 0;
+ int hash;
if (cb->list.next && !list_empty(&cb->list)) {
spin_lock_irq(&conf->device_lock);
@@ -4297,11 +4409,14 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
* STRIPE_ON_RELEASE_LIST could be set here. In that
* case, the count is always > 1 here
*/
- __release_stripe(conf, sh);
+ hash = sh->hash_lock_index;
+ __release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
cnt++;
}
spin_unlock_irq(&conf->device_lock);
}
+ release_inactive_stripe_list(conf, cb->temp_inactive_list,
+ NR_STRIPE_HASH_LOCKS);
if (mddev->queue)
trace_block_unplug(mddev->queue, cnt, !from_schedule);
kfree(cb);
@@ -4322,8 +4437,12 @@ static void release_stripe_plug(struct mddev *mddev,
cb = container_of(blk_cb, struct raid5_plug_cb, cb);
- if (cb->list.next == NULL)
+ if (cb->list.next == NULL) {
+ int i;
INIT_LIST_HEAD(&cb->list);
+ for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+ INIT_LIST_HEAD(cb->temp_inactive_list + i);
+ }
if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
list_add_tail(&sh->lru, &cb->list);
@@ -4706,14 +4825,19 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
/* Cannot proceed until we've updated the superblock... */
wait_event(conf->wait_for_overlap,
- atomic_read(&conf->reshape_stripes)==0);
+ atomic_read(&conf->reshape_stripes)==0
+ || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
+ if (atomic_read(&conf->reshape_stripes) != 0)
+ return 0;
mddev->reshape_position = conf->reshape_progress;
mddev->curr_resync_completed = sector_nr;
conf->reshape_checkpoint = jiffies;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread);
wait_event(mddev->sb_wait, mddev->flags == 0 ||
- kthread_should_stop());
+ test_bit(MD_RECOVERY_INTR, &mddev->recovery));
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+ return 0;
spin_lock_irq(&conf->device_lock);
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
@@ -4796,7 +4920,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
>= mddev->resync_max - mddev->curr_resync_completed) {
/* Cannot proceed until we've updated the superblock... */
wait_event(conf->wait_for_overlap,
- atomic_read(&conf->reshape_stripes) == 0);
+ atomic_read(&conf->reshape_stripes) == 0
+ || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
+ if (atomic_read(&conf->reshape_stripes) != 0)
+ goto ret;
mddev->reshape_position = conf->reshape_progress;
mddev->curr_resync_completed = sector_nr;
conf->reshape_checkpoint = jiffies;
@@ -4804,13 +4931,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
md_wakeup_thread(mddev->thread);
wait_event(mddev->sb_wait,
!test_bit(MD_CHANGE_DEVS, &mddev->flags)
- || kthread_should_stop());
+ || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+ goto ret;
spin_lock_irq(&conf->device_lock);
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
+ret:
return reshape_sectors;
}
@@ -4968,27 +5098,45 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
}
static int handle_active_stripes(struct r5conf *conf, int group,
- struct r5worker *worker)
+ struct r5worker *worker,
+ struct list_head *temp_inactive_list)
{
struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
- int i, batch_size = 0;
+ int i, batch_size = 0, hash;
+ bool release_inactive = false;
while (batch_size < MAX_STRIPE_BATCH &&
(sh = __get_priority_stripe(conf, group)) != NULL)
batch[batch_size++] = sh;
- if (batch_size == 0)
- return batch_size;
+ if (batch_size == 0) {
+ for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+ if (!list_empty(temp_inactive_list + i))
+ break;
+ if (i == NR_STRIPE_HASH_LOCKS)
+ return batch_size;
+ release_inactive = true;
+ }
spin_unlock_irq(&conf->device_lock);
+ release_inactive_stripe_list(conf, temp_inactive_list,
+ NR_STRIPE_HASH_LOCKS);
+
+ if (release_inactive) {
+ spin_lock_irq(&conf->device_lock);
+ return 0;
+ }
+
for (i = 0; i < batch_size; i++)
handle_stripe(batch[i]);
cond_resched();
spin_lock_irq(&conf->device_lock);
- for (i = 0; i < batch_size; i++)
- __release_stripe(conf, batch[i]);
+ for (i = 0; i < batch_size; i++) {
+ hash = batch[i]->hash_lock_index;
+ __release_stripe(conf, batch[i], &temp_inactive_list[hash]);
+ }
return batch_size;
}
@@ -5009,9 +5157,10 @@ static void raid5_do_work(struct work_struct *work)
while (1) {
int batch_size, released;
- released = release_stripe_list(conf);
+ released = release_stripe_list(conf, worker->temp_inactive_list);
- batch_size = handle_active_stripes(conf, group_id, worker);
+ batch_size = handle_active_stripes(conf, group_id, worker,
+ worker->temp_inactive_list);
worker->working = false;
if (!batch_size && !released)
break;
@@ -5050,7 +5199,7 @@ static void raid5d(struct md_thread *thread)
struct bio *bio;
int batch_size, released;
- released = release_stripe_list(conf);
+ released = release_stripe_list(conf, conf->temp_inactive_list);
if (
!list_empty(&conf->bitmap_list)) {
@@ -5060,7 +5209,7 @@ static void raid5d(struct md_thread *thread)
bitmap_unplug(mddev->bitmap);
spin_lock_irq(&conf->device_lock);
conf->seq_write = conf->seq_flush;
- activate_bit_delay(conf);
+ activate_bit_delay(conf, conf->temp_inactive_list);
}
raid5_activate_delayed(conf);
@@ -5074,7 +5223,8 @@ static void raid5d(struct md_thread *thread)
handled++;
}
- batch_size = handle_active_stripes(conf, ANY_GROUP, NULL);
+ batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
+ conf->temp_inactive_list);
if (!batch_size && !released)
break;
handled += batch_size;
@@ -5110,22 +5260,29 @@ raid5_set_cache_size(struct mddev *mddev, int size)
{
struct r5conf *conf = mddev->private;
int err;
+ int hash;
if (size <= 16 || size > 32768)
return -EINVAL;
+ hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
while (size < conf->max_nr_stripes) {
- if (drop_one_stripe(conf))
+ if (drop_one_stripe(conf, hash))
conf->max_nr_stripes--;
else
break;
+ hash--;
+ if (hash < 0)
+ hash = NR_STRIPE_HASH_LOCKS - 1;
}
err = md_allow_write(mddev);
if (err)
return err;
+ hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
while (size > conf->max_nr_stripes) {
- if (grow_one_stripe(conf))
+ if (grow_one_stripe(conf, hash))
conf->max_nr_stripes++;
else break;
+ hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
}
return 0;
}
@@ -5213,15 +5370,18 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
return 0;
}
-static int alloc_thread_groups(struct r5conf *conf, int cnt);
+static int alloc_thread_groups(struct r5conf *conf, int cnt,
+ int *group_cnt,
+ int *worker_cnt_per_group,
+ struct r5worker_group **worker_groups);
static ssize_t
raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
{
struct r5conf *conf = mddev->private;
unsigned long new;
int err;
- struct r5worker_group *old_groups;
- int old_group_cnt;
+ struct r5worker_group *new_groups, *old_groups;
+ int group_cnt, worker_cnt_per_group;
if (len >= PAGE_SIZE)
return -EINVAL;
@@ -5237,14 +5397,19 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
mddev_suspend(mddev);
old_groups = conf->worker_groups;
- old_group_cnt = conf->worker_cnt_per_group;
+ if (old_groups)
+ flush_workqueue(raid5_wq);
+
+ err = alloc_thread_groups(conf, new,
+ &group_cnt, &worker_cnt_per_group,
+ &new_groups);
+ if (!err) {
+ spin_lock_irq(&conf->device_lock);
+ conf->group_cnt = group_cnt;
+ conf->worker_cnt_per_group = worker_cnt_per_group;
+ conf->worker_groups = new_groups;
+ spin_unlock_irq(&conf->device_lock);
- conf->worker_groups = NULL;
- err = alloc_thread_groups(conf, new);
- if (err) {
- conf->worker_groups = old_groups;
- conf->worker_cnt_per_group = old_group_cnt;
- } else {
if (old_groups)
kfree(old_groups[0].workers);
kfree(old_groups);
@@ -5274,40 +5439,47 @@ static struct attribute_group raid5_attrs_group = {
.attrs = raid5_attrs,
};
-static int alloc_thread_groups(struct r5conf *conf, int cnt)
+static int alloc_thread_groups(struct r5conf *conf, int cnt,
+ int *group_cnt,
+ int *worker_cnt_per_group,
+ struct r5worker_group **worker_groups)
{
- int i, j;
+ int i, j, k;
ssize_t size;
struct r5worker *workers;
- conf->worker_cnt_per_group = cnt;
+ *worker_cnt_per_group = cnt;
if (cnt == 0) {
- conf->worker_groups = NULL;
+ *group_cnt = 0;
+ *worker_groups = NULL;
return 0;
}
- conf->group_cnt = num_possible_nodes();
+ *group_cnt = num_possible_nodes();
size = sizeof(struct r5worker) * cnt;
- workers = kzalloc(size * conf->group_cnt, GFP_NOIO);
- conf->worker_groups = kzalloc(sizeof(struct r5worker_group) *
- conf->group_cnt, GFP_NOIO);
- if (!conf->worker_groups || !workers) {
+ workers = kzalloc(size * *group_cnt, GFP_NOIO);
+ *worker_groups = kzalloc(sizeof(struct r5worker_group) *
+ *group_cnt, GFP_NOIO);
+ if (!*worker_groups || !workers) {
kfree(workers);
- kfree(conf->worker_groups);
- conf->worker_groups = NULL;
+ kfree(*worker_groups);
return -ENOMEM;
}
- for (i = 0; i < conf->group_cnt; i++) {
+ for (i = 0; i < *group_cnt; i++) {
struct r5worker_group *group;
- group = &conf->worker_groups[i];
+ group = &(*worker_groups)[i];
INIT_LIST_HEAD(&group->handle_list);
group->conf = conf;
group->workers = workers + i * cnt;
for (j = 0; j < cnt; j++) {
- group->workers[j].group = group;
- INIT_WORK(&group->workers[j].work, raid5_do_work);
+ struct r5worker *worker = group->workers + j;
+ worker->group = group;
+ INIT_WORK(&worker->work, raid5_do_work);
+
+ for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
+ INIT_LIST_HEAD(worker->temp_inactive_list + k);
}
}
@@ -5458,6 +5630,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
struct md_rdev *rdev;
struct disk_info *disk;
char pers_name[6];
+ int i;
+ int group_cnt, worker_cnt_per_group;
+ struct r5worker_group *new_group;
if (mddev->new_level != 5
&& mddev->new_level != 4
@@ -5492,7 +5667,12 @@ static struct r5conf *setup_conf(struct mddev *mddev)
if (conf == NULL)
goto abort;
/* Don't enable multi-threading by default*/
- if (alloc_thread_groups(conf, 0))
+ if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
+ &new_group)) {
+ conf->group_cnt = group_cnt;
+ conf->worker_cnt_per_group = worker_cnt_per_group;
+ conf->worker_groups = new_group;
+ } else
goto abort;
spin_lock_init(&conf->device_lock);
seqcount_init(&conf->gen_lock);
@@ -5502,7 +5682,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
INIT_LIST_HEAD(&conf->hold_list);
INIT_LIST_HEAD(&conf->delayed_list);
INIT_LIST_HEAD(&conf->bitmap_list);
- INIT_LIST_HEAD(&conf->inactive_list);
init_llist_head(&conf->released_stripes);
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
@@ -5528,6 +5707,21 @@ static struct r5conf *setup_conf(struct mddev *mddev)
if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
goto abort;
+ /* We init hash_locks[0] separately to that it can be used
+ * as the reference lock in the spin_lock_nest_lock() call
+ * in lock_all_device_hash_locks_irq in order to convince
+ * lockdep that we know what we are doing.
+ */
+ spin_lock_init(conf->hash_locks);
+ for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
+ spin_lock_init(conf->hash_locks + i);
+
+ for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+ INIT_LIST_HEAD(conf->inactive_list + i);
+
+ for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
+ INIT_LIST_HEAD(conf->temp_inactive_list + i);
+
conf->level = mddev->new_level;
if (raid5_alloc_percpu(conf) != 0)
goto abort;
@@ -5568,7 +5762,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
else
conf->max_degraded = 1;
conf->algorithm = mddev->new_layout;
- conf->max_nr_stripes = NR_STRIPES;
conf->reshape_progress = mddev->reshape_position;
if (conf->reshape_progress != MaxSector) {
conf->prev_chunk_sectors = mddev->chunk_sectors;
@@ -5577,7 +5770,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
- if (grow_stripes(conf, conf->max_nr_stripes)) {
+ atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
+ if (grow_stripes(conf, NR_STRIPES)) {
printk(KERN_ERR
"md/raid:%s: couldn't allocate %dkB for buffers\n",
mdname(mddev), memory);
@@ -6383,12 +6577,18 @@ static int raid5_start_reshape(struct mddev *mddev)
if (!mddev->sync_thread) {
mddev->recovery = 0;
spin_lock_irq(&conf->device_lock);
+ write_seqcount_begin(&conf->gen_lock);
mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
+ mddev->new_chunk_sectors =
+ conf->chunk_sectors = conf->prev_chunk_sectors;
+ mddev->new_layout = conf->algorithm = conf->prev_algo;
rdev_for_each(rdev, mddev)
rdev->new_data_offset = rdev->data_offset;
smp_wmb();
+ conf->generation --;
conf->reshape_progress = MaxSector;
mddev->reshape_position = MaxSector;
+ write_seqcount_end(&conf->gen_lock);
spin_unlock_irq(&conf->device_lock);
return -EAGAIN;
}
@@ -6476,27 +6676,28 @@ static void raid5_quiesce(struct mddev *mddev, int state)
break;
case 1: /* stop all writes */
- spin_lock_irq(&conf->device_lock);
+ lock_all_device_hash_locks_irq(conf);
/* '2' tells resync/reshape to pause so that all
* active stripes can drain
*/
conf->quiesce = 2;
- wait_event_lock_irq(conf->wait_for_stripe,
+ wait_event_cmd(conf->wait_for_stripe,
atomic_read(&conf->active_stripes) == 0 &&
atomic_read(&conf->active_aligned_reads) == 0,
- conf->device_lock);
+ unlock_all_device_hash_locks_irq(conf),
+ lock_all_device_hash_locks_irq(conf));
conf->quiesce = 1;
- spin_unlock_irq(&conf->device_lock);
+ unlock_all_device_hash_locks_irq(conf);
/* allow reshape to continue */
wake_up(&conf->wait_for_overlap);
break;
case 0: /* re-enable writes */
- spin_lock_irq(&conf->device_lock);
+ lock_all_device_hash_locks_irq(conf);
conf->quiesce = 0;
wake_up(&conf->wait_for_stripe);
wake_up(&conf->wait_for_overlap);
- spin_unlock_irq(&conf->device_lock);
+ unlock_all_device_hash_locks_irq(conf);
break;
}
}
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 2113ffa82c7a..01ad8ae8f578 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -49,7 +49,7 @@
* can't distinguish between a clean block that has been generated
* from parity calculations, and a clean block that has been
* successfully written to the spare ( or to parity when resyncing).
- * To distingush these states we have a stripe bit STRIPE_INSYNC that
+ * To distinguish these states we have a stripe bit STRIPE_INSYNC that
* is set whenever a write is scheduled to the spare, or to the parity
* disc if there is no spare. A sync request clears this bit, and
* when we find it set with no buffers locked, we know the sync is
@@ -205,6 +205,7 @@ struct stripe_head {
short pd_idx; /* parity disk index */
short qd_idx; /* 'Q' disk index for raid6 */
short ddf_layout;/* use DDF ordering to calculate Q */
+ short hash_lock_index;
unsigned long state; /* state flags */
atomic_t count; /* nr of active thread/requests */
int bm_seq; /* sequence number for bitmap flushes */
@@ -367,9 +368,18 @@ struct disk_info {
struct md_rdev *rdev, *replacement;
};
+/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
+ * This is because we sometimes take all the spinlocks
+ * and creating that much locking depth can cause
+ * problems.
+ */
+#define NR_STRIPE_HASH_LOCKS 8
+#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)
+
struct r5worker {
struct work_struct work;
struct r5worker_group *group;
+ struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
bool working;
};
@@ -382,6 +392,8 @@ struct r5worker_group {
struct r5conf {
struct hlist_head *stripe_hashtbl;
+ /* only protect corresponding hash list and inactive_list */
+ spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS];
struct mddev *mddev;
int chunk_sectors;
int level, algorithm;
@@ -462,7 +474,8 @@ struct r5conf {
* Free stripes pool
*/
atomic_t active_stripes;
- struct list_head inactive_list;
+ struct list_head inactive_list[NR_STRIPE_HASH_LOCKS];
+ atomic_t empty_inactive_list_nr;
struct llist_head released_stripes;
wait_queue_head_t wait_for_stripe;
wait_queue_head_t wait_for_overlap;
@@ -477,6 +490,7 @@ struct r5conf {
* the new thread here until we fully activate the array.
*/
struct md_thread *thread;
+ struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
struct r5worker_group *worker_groups;
int group_cnt;
int worker_cnt_per_group;