diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Kconfig | 22 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-cache-metadata.c | 104 | ||||
-rw-r--r-- | drivers/md/dm-cache-metadata.h | 5 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy-internal.h | 7 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy-mq.c | 681 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy.h | 21 | ||||
-rw-r--r-- | drivers/md/dm-cache-target.c | 687 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 216 | ||||
-rw-r--r-- | drivers/md/dm-ioctl.c | 36 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 34 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 23 | ||||
-rw-r--r-- | drivers/md/dm.c | 47 | ||||
-rw-r--r-- | drivers/md/dm.h | 13 | ||||
-rw-r--r-- | drivers/md/md.c | 149 | ||||
-rw-r--r-- | drivers/md/md.h | 2 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-array.c | 5 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-space-map-disk.c | 18 | ||||
-rw-r--r-- | drivers/md/raid1.c | 162 | ||||
-rw-r--r-- | drivers/md/raid1.h | 15 | ||||
-rw-r--r-- | drivers/md/raid10.c | 6 | ||||
-rw-r--r-- | drivers/md/raid5.c | 427 | ||||
-rw-r--r-- | drivers/md/raid5.h | 18 |
24 files changed, 2035 insertions, 671 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 30b426ed744b..f2ccbc3b9fe4 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -297,6 +297,17 @@ config DM_MIRROR Allow volume managers to mirror logical volumes, also needed for live data migration tools such as 'pvmove'. +config DM_LOG_USERSPACE + tristate "Mirror userspace logging" + depends on DM_MIRROR && NET + select CONNECTOR + ---help--- + The userspace logging module provides a mechanism for + relaying the dm-dirty-log API to userspace. Log designs + which are more suited to userspace implementation (e.g. + shared storage logs) or experimental logs can be implemented + by leveraging this framework. + config DM_RAID tristate "RAID 1/4/5/6/10 target" depends on BLK_DEV_DM @@ -323,17 +334,6 @@ config DM_RAID RAID-5, RAID-6 distributes the syndromes across the drives in one of the available parity distribution methods. -config DM_LOG_USERSPACE - tristate "Mirror userspace logging" - depends on DM_MIRROR && NET - select CONNECTOR - ---help--- - The userspace logging module provides a mechanism for - relaying the dm-dirty-log API to userspace. Log designs - which are more suited to userspace implementation (e.g. - shared storage logs) or experimental logs can be implemented - by leveraging this framework. - config DM_ZERO tristate "Zero target" depends on BLK_DEV_DM diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index a7fd82133b12..12dc29ba7399 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1654,9 +1654,9 @@ int bitmap_create(struct mddev *mddev) bitmap->mddev = mddev; if (mddev->kobj.sd) - bm = sysfs_get_dirent(mddev->kobj.sd, NULL, "bitmap"); + bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap"); if (bm) { - bitmap->sysfs_can_clear = sysfs_get_dirent(bm, NULL, "can_clear"); + bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear"); sysfs_put(bm); } else bitmap->sysfs_can_clear = NULL; diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 1af7255bbffb..9ef0752e8a08 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -20,7 +20,13 @@ #define CACHE_SUPERBLOCK_MAGIC 06142003 #define CACHE_SUPERBLOCK_LOCATION 0 -#define CACHE_VERSION 1 + +/* + * defines a range of metadata versions that this module can handle. + */ +#define MIN_CACHE_VERSION 1 +#define MAX_CACHE_VERSION 1 + #define CACHE_METADATA_CACHE_SIZE 64 /* @@ -134,6 +140,18 @@ static void sb_prepare_for_write(struct dm_block_validator *v, SUPERBLOCK_CSUM_XOR)); } +static int check_metadata_version(struct cache_disk_superblock *disk_super) +{ + uint32_t metadata_version = le32_to_cpu(disk_super->version); + if (metadata_version < MIN_CACHE_VERSION || metadata_version > MAX_CACHE_VERSION) { + DMERR("Cache metadata version %u found, but only versions between %u and %u supported.", + metadata_version, MIN_CACHE_VERSION, MAX_CACHE_VERSION); + return -EINVAL; + } + + return 0; +} + static int sb_check(struct dm_block_validator *v, struct dm_block *b, size_t sb_block_size) @@ -164,7 +182,7 @@ static int sb_check(struct dm_block_validator *v, return -EILSEQ; } - return 0; + return check_metadata_version(disk_super); } static struct dm_block_validator sb_validator = { @@ -198,7 +216,7 @@ static int superblock_lock(struct dm_cache_metadata *cmd, /*----------------------------------------------------------------*/ -static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result) +static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *result) { int r; unsigned i; @@ -214,10 +232,10 @@ static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result) return r; data_le = dm_block_data(b); - *result = 1; + *result = true; for (i = 0; i < sb_block_size; i++) { if (data_le[i] != zero) { - *result = 0; + *result = false; break; } } @@ -270,7 +288,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) disk_super->flags = 0; memset(disk_super->uuid, 0, sizeof(disk_super->uuid)); disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC); - disk_super->version = cpu_to_le32(CACHE_VERSION); + disk_super->version = cpu_to_le32(MAX_CACHE_VERSION); memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name)); memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version)); disk_super->policy_hint_size = 0; @@ -411,7 +429,8 @@ bad: static int __open_or_format_metadata(struct dm_cache_metadata *cmd, bool format_device) { - int r, unformatted; + int r; + bool unformatted = false; r = __superblock_all_zeroes(cmd->bm, &unformatted); if (r) @@ -666,19 +685,85 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd) kfree(cmd); } +/* + * Checks that the given cache block is either unmapped or clean. + */ +static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b, + bool *result) +{ + int r; + __le64 value; + dm_oblock_t ob; + unsigned flags; + + r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(b), &value); + if (r) { + DMERR("block_unmapped_or_clean failed"); + return r; + } + + unpack_value(value, &ob, &flags); + *result = !((flags & M_VALID) && (flags & M_DIRTY)); + + return 0; +} + +static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd, + dm_cblock_t begin, dm_cblock_t end, + bool *result) +{ + int r; + *result = true; + + while (begin != end) { + r = block_unmapped_or_clean(cmd, begin, result); + if (r) + return r; + + if (!*result) { + DMERR("cache block %llu is dirty", + (unsigned long long) from_cblock(begin)); + return 0; + } + + begin = to_cblock(from_cblock(begin) + 1); + } + + return 0; +} + int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size) { int r; + bool clean; __le64 null_mapping = pack_value(0, 0); down_write(&cmd->root_lock); __dm_bless_for_disk(&null_mapping); + + if (from_cblock(new_cache_size) < from_cblock(cmd->cache_blocks)) { + r = blocks_are_unmapped_or_clean(cmd, new_cache_size, cmd->cache_blocks, &clean); + if (r) { + __dm_unbless_for_disk(&null_mapping); + goto out; + } + + if (!clean) { + DMERR("unable to shrink cache due to dirty blocks"); + r = -EINVAL; + __dm_unbless_for_disk(&null_mapping); + goto out; + } + } + r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks), from_cblock(new_cache_size), &null_mapping, &cmd->root); if (!r) cmd->cache_blocks = new_cache_size; cmd->changed = true; + +out: up_write(&cmd->root_lock); return r; @@ -1182,3 +1267,8 @@ int dm_cache_save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock, return r; } + +int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result) +{ + return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result); +} diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index f45cef21f3d0..cd906f14f98d 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -137,6 +137,11 @@ int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy * int dm_cache_save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock, uint32_t hint); +/* + * Query method. Are all the blocks in the cache clean? + */ +int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result); + /*----------------------------------------------------------------*/ #endif /* DM_CACHE_METADATA_H */ diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h index 0928abdc49f0..2256a1f24f73 100644 --- a/drivers/md/dm-cache-policy-internal.h +++ b/drivers/md/dm-cache-policy-internal.h @@ -61,7 +61,12 @@ static inline int policy_writeback_work(struct dm_cache_policy *p, static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) { - return p->remove_mapping(p, oblock); + p->remove_mapping(p, oblock); +} + +static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) +{ + return p->remove_cblock(p, cblock); } static inline void policy_force_mapping(struct dm_cache_policy *p, diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 4296155090b2..416b7b752a6e 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -26,19 +26,6 @@ static unsigned next_power(unsigned n, unsigned min) /*----------------------------------------------------------------*/ -static unsigned long *alloc_bitset(unsigned nr_entries) -{ - size_t s = sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG); - return vzalloc(s); -} - -static void free_bitset(unsigned long *bits) -{ - vfree(bits); -} - -/*----------------------------------------------------------------*/ - /* * Large, sequential ios are probably better left on the origin device since * spindles tend to have good bandwidth. @@ -151,6 +138,21 @@ static void queue_init(struct queue *q) } /* + * Checks to see if the queue is empty. + * FIXME: reduce cpu usage. + */ +static bool queue_empty(struct queue *q) +{ + unsigned i; + + for (i = 0; i < NR_QUEUE_LEVELS; i++) + if (!list_empty(q->qs + i)) + return false; + + return true; +} + +/* * Insert an entry to the back of the given level. */ static void queue_push(struct queue *q, unsigned level, struct list_head *elt) @@ -218,17 +220,116 @@ struct entry { struct hlist_node hlist; struct list_head list; dm_oblock_t oblock; - dm_cblock_t cblock; /* valid iff in_cache */ /* * FIXME: pack these better */ - bool in_cache:1; + bool dirty:1; unsigned hit_count; unsigned generation; unsigned tick; }; +/* + * Rather than storing the cblock in an entry, we allocate all entries in + * an array, and infer the cblock from the entry position. + * + * Free entries are linked together into a list. + */ +struct entry_pool { + struct entry *entries, *entries_end; + struct list_head free; + unsigned nr_allocated; +}; + +static int epool_init(struct entry_pool *ep, unsigned nr_entries) +{ + unsigned i; + + ep->entries = vzalloc(sizeof(struct entry) * nr_entries); + if (!ep->entries) + return -ENOMEM; + + ep->entries_end = ep->entries + nr_entries; + + INIT_LIST_HEAD(&ep->free); + for (i = 0; i < nr_entries; i++) + list_add(&ep->entries[i].list, &ep->free); + + ep->nr_allocated = 0; + + return 0; +} + +static void epool_exit(struct entry_pool *ep) +{ + vfree(ep->entries); +} + +static struct entry *alloc_entry(struct entry_pool *ep) +{ + struct entry *e; + + if (list_empty(&ep->free)) + return NULL; + + e = list_entry(list_pop(&ep->free), struct entry, list); + INIT_LIST_HEAD(&e->list); + INIT_HLIST_NODE(&e->hlist); + ep->nr_allocated++; + + return e; +} + +/* + * This assumes the cblock hasn't already been allocated. + */ +static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock) +{ + struct entry *e = ep->entries + from_cblock(cblock); + list_del(&e->list); + + INIT_LIST_HEAD(&e->list); + INIT_HLIST_NODE(&e->hlist); + ep->nr_allocated++; + + return e; +} + +static void free_entry(struct entry_pool *ep, struct entry *e) +{ + BUG_ON(!ep->nr_allocated); + ep->nr_allocated--; + INIT_HLIST_NODE(&e->hlist); + list_add(&e->list, &ep->free); +} + +/* + * Returns NULL if the entry is free. + */ +static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock) +{ + struct entry *e = ep->entries + from_cblock(cblock); + return !hlist_unhashed(&e->hlist) ? e : NULL; +} + +static bool epool_empty(struct entry_pool *ep) +{ + return list_empty(&ep->free); +} + +static bool in_pool(struct entry_pool *ep, struct entry *e) +{ + return e >= ep->entries && e < ep->entries_end; +} + +static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e) +{ + return to_cblock(e - ep->entries); +} + +/*----------------------------------------------------------------*/ + struct mq_policy { struct dm_cache_policy policy; @@ -238,13 +339,22 @@ struct mq_policy { struct io_tracker tracker; /* - * We maintain two queues of entries. The cache proper contains - * the currently active mappings. Whereas the pre_cache tracks - * blocks that are being hit frequently and potential candidates - * for promotion to the cache. + * Entries come from two pools, one of pre-cache entries, and one + * for the cache proper. + */ + struct entry_pool pre_cache_pool; + struct entry_pool cache_pool; + + /* + * We maintain three queues of entries. The cache proper, + * consisting of a clean and dirty queue, contains the currently + * active mappings. Whereas the pre_cache tracks blocks that + * are being hit frequently and potential candidates for promotion + * to the cache. */ struct queue pre_cache; - struct queue cache; + struct queue cache_clean; + struct queue cache_dirty; /* * Keeps track of time, incremented by the core. We use this to @@ -282,25 +392,6 @@ struct mq_policy { unsigned promote_threshold; /* - * We need cache_size entries for the cache, and choose to have - * cache_size entries for the pre_cache too. One motivation for - * using the same size is to make the hit counts directly - * comparable between pre_cache and cache. - */ - unsigned nr_entries; - unsigned nr_entries_allocated; - struct list_head free; - - /* - * Cache blocks may be unallocated. We store this info in a - * bitset. - */ - unsigned long *allocation_bitset; - unsigned nr_cblocks_allocated; - unsigned find_free_nr_words; - unsigned find_free_last_word; - - /* * The hash table allows us to quickly find an entry by origin * block. Both pre_cache and cache entries are in here. */ @@ -310,49 +401,6 @@ struct mq_policy { }; /*----------------------------------------------------------------*/ -/* Free/alloc mq cache entry structures. */ -static void takeout_queue(struct list_head *lh, struct queue *q) -{ - unsigned level; - - for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_splice(q->qs + level, lh); -} - -static void free_entries(struct mq_policy *mq) -{ - struct entry *e, *tmp; - - takeout_queue(&mq->free, &mq->pre_cache); - takeout_queue(&mq->free, &mq->cache); - - list_for_each_entry_safe(e, tmp, &mq->free, list) - kmem_cache_free(mq_entry_cache, e); -} - -static int alloc_entries(struct mq_policy *mq, unsigned elts) -{ - unsigned u = mq->nr_entries; - - INIT_LIST_HEAD(&mq->free); - mq->nr_entries_allocated = 0; - - while (u--) { - struct entry *e = kmem_cache_zalloc(mq_entry_cache, GFP_KERNEL); - - if (!e) { - free_entries(mq); - return -ENOMEM; - } - - - list_add(&e->list, &mq->free); - } - - return 0; -} - -/*----------------------------------------------------------------*/ /* * Simple hash table implementation. Should replace with the standard hash @@ -388,96 +436,14 @@ static void hash_remove(struct entry *e) /*----------------------------------------------------------------*/ -/* - * Allocates a new entry structure. The memory is allocated in one lump, - * so we just handing it out here. Returns NULL if all entries have - * already been allocated. Cannot fail otherwise. - */ -static struct entry *alloc_entry(struct mq_policy *mq) -{ - struct entry *e; - - if (mq->nr_entries_allocated >= mq->nr_entries) { - BUG_ON(!list_empty(&mq->free)); - return NULL; - } - - e = list_entry(list_pop(&mq->free), struct entry, list); - INIT_LIST_HEAD(&e->list); - INIT_HLIST_NODE(&e->hlist); - - mq->nr_entries_allocated++; - return e; -} - -/*----------------------------------------------------------------*/ - -/* - * Mark cache blocks allocated or not in the bitset. - */ -static void alloc_cblock(struct mq_policy *mq, dm_cblock_t cblock) -{ - BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size)); - BUG_ON(test_bit(from_cblock(cblock), mq->allocation_bitset)); - - set_bit(from_cblock(cblock), mq->allocation_bitset); - mq->nr_cblocks_allocated++; -} - -static void free_cblock(struct mq_policy *mq, dm_cblock_t cblock) -{ - BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size)); - BUG_ON(!test_bit(from_cblock(cblock), mq->allocation_bitset)); - - clear_bit(from_cblock(cblock), mq->allocation_bitset); - mq->nr_cblocks_allocated--; -} - static bool any_free_cblocks(struct mq_policy *mq) { - return mq->nr_cblocks_allocated < from_cblock(mq->cache_size); + return !epool_empty(&mq->cache_pool); } -/* - * Fills result out with a cache block that isn't in use, or return - * -ENOSPC. This does _not_ mark the cblock as allocated, the caller is - * reponsible for that. - */ -static int __find_free_cblock(struct mq_policy *mq, unsigned begin, unsigned end, - dm_cblock_t *result, unsigned *last_word) +static bool any_clean_cblocks(struct mq_policy *mq) { - int r = -ENOSPC; - unsigned w; - - for (w = begin; w < end; w++) { - /* - * ffz is undefined if no zero exists - */ - if (mq->allocation_bitset[w] != ~0UL) { - *last_word = w; - *result = to_cblock((w * BITS_PER_LONG) + ffz(mq->allocation_bitset[w])); - if (from_cblock(*result) < from_cblock(mq->cache_size)) - r = 0; - - break; - } - } - - return r; -} - -static int find_free_cblock(struct mq_policy *mq, dm_cblock_t *result) -{ - int r; - - if (!any_free_cblocks(mq)) - return -ENOSPC; - - r = __find_free_cblock(mq, mq->find_free_last_word, mq->find_free_nr_words, result, &mq->find_free_last_word); - if (r == -ENOSPC && mq->find_free_last_word) - r = __find_free_cblock(mq, 0, mq->find_free_last_word, result, &mq->find_free_last_word); - - return r; + return !queue_empty(&mq->cache_clean); } /*----------------------------------------------------------------*/ @@ -496,33 +462,35 @@ static unsigned queue_level(struct entry *e) return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u); } +static bool in_cache(struct mq_policy *mq, struct entry *e) +{ + return in_pool(&mq->cache_pool, e); +} + /* * Inserts the entry into the pre_cache or the cache. Ensures the cache - * block is marked as allocated if necc. Inserts into the hash table. Sets the - * tick which records when the entry was last moved about. + * block is marked as allocated if necc. Inserts into the hash table. + * Sets the tick which records when the entry was last moved about. */ static void push(struct mq_policy *mq, struct entry *e) { e->tick = mq->tick; hash_insert(mq, e); - if (e->in_cache) { - alloc_cblock(mq, e->cblock); - queue_push(&mq->cache, queue_level(e), &e->list); - } else + if (in_cache(mq, e)) + queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean, + queue_level(e), &e->list); + else queue_push(&mq->pre_cache, queue_level(e), &e->list); } /* * Removes an entry from pre_cache or cache. Removes from the hash table. - * Frees off the cache block if necc. */ static void del(struct mq_policy *mq, struct entry *e) { queue_remove(&e->list); hash_remove(e); - if (e->in_cache) - free_cblock(mq, e->cblock); } /* @@ -531,14 +499,14 @@ static void del(struct mq_policy *mq, struct entry *e) */ static struct entry *pop(struct mq_policy *mq, struct queue *q) { - struct entry *e = container_of(queue_pop(q), struct entry, list); + struct entry *e; + struct list_head *h = queue_pop(q); - if (e) { - hash_remove(e); + if (!h) + return NULL; - if (e->in_cache) - free_cblock(mq, e->cblock); - } + e = container_of(h, struct entry, list); + hash_remove(e); return e; } @@ -556,7 +524,8 @@ static bool updated_this_tick(struct mq_policy *mq, struct entry *e) * of the entries. * * At the moment the threshold is taken by averaging the hit counts of some - * of the entries in the cache (the first 20 entries of the first level). + * of the entries in the cache (the first 20 entries across all levels in + * ascending order, giving preference to the clean entries at each level). * * We can be much cleverer than this though. For example, each promotion * could bump up the threshold helping to prevent churn. Much more to do @@ -571,14 +540,21 @@ static void check_generation(struct mq_policy *mq) struct list_head *head; struct entry *e; - if ((mq->hit_count >= mq->generation_period) && - (mq->nr_cblocks_allocated == from_cblock(mq->cache_size))) { - + if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) { mq->hit_count = 0; mq->generation++; for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) { - head = mq->cache.qs + level; + head = mq->cache_clean.qs + level; + list_for_each_entry(e, head, list) { + nr++; + total += e->hit_count; + + if (++count >= MAX_TO_AVERAGE) + break; + } + + head = mq->cache_dirty.qs + level; list_for_each_entry(e, head, list) { nr++; total += e->hit_count; @@ -631,19 +607,30 @@ static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e) * - set the hit count to a hard coded value other than 1, eg, is it better * if it goes in at level 2? */ -static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock) +static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock) { - dm_cblock_t result; - struct entry *demoted = pop(mq, &mq->cache); + struct entry *demoted = pop(mq, &mq->cache_clean); + + if (!demoted) + /* + * We could get a block from mq->cache_dirty, but that + * would add extra latency to the triggering bio as it + * waits for the writeback. Better to not promote this + * time and hope there's a clean block next time this block + * is hit. + */ + return -ENOSPC; - BUG_ON(!demoted); - result = demoted->cblock; *oblock = demoted->oblock; - demoted->in_cache = false; - demoted->hit_count = 1; - push(mq, demoted); + free_entry(&mq->cache_pool, demoted); + + /* + * We used to put the demoted block into the pre-cache, but I think + * it's simpler to just let it work it's way up from zero again. + * Stops blocks flickering in and out of the cache. + */ - return result; + return 0; } /* @@ -662,17 +649,18 @@ static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock) static unsigned adjusted_promote_threshold(struct mq_policy *mq, bool discarded_oblock, int data_dir) { - if (discarded_oblock && any_free_cblocks(mq) && data_dir == WRITE) + if (data_dir == READ) + return mq->promote_threshold + READ_PROMOTE_THRESHOLD; + + if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) { /* * We don't need to do any copying at all, so give this a - * very low threshold. In practice this only triggers - * during initial population after a format. + * very low threshold. */ return DISCARDED_PROMOTE_THRESHOLD; + } - return data_dir == READ ? - (mq->promote_threshold + READ_PROMOTE_THRESHOLD) : - (mq->promote_threshold + WRITE_PROMOTE_THRESHOLD); + return mq->promote_threshold + WRITE_PROMOTE_THRESHOLD; } static bool should_promote(struct mq_policy *mq, struct entry *e, @@ -688,34 +676,49 @@ static int cache_entry_found(struct mq_policy *mq, { requeue_and_update_tick(mq, e); - if (e->in_cache) { + if (in_cache(mq, e)) { result->op = POLICY_HIT; - result->cblock = e->cblock; + result->cblock = infer_cblock(&mq->cache_pool, e); } return 0; } /* - * Moves and entry from the pre_cache to the cache. The main work is + * Moves an entry from the pre_cache to the cache. The main work is * finding which cache block to use. */ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e, struct policy_result *result) { - dm_cblock_t cblock; + int r; + struct entry *new_e; - if (find_free_cblock(mq, &cblock) == -ENOSPC) { + /* Ensure there's a free cblock in the cache */ + if (epool_empty(&mq->cache_pool)) { result->op = POLICY_REPLACE; - cblock = demote_cblock(mq, &result->old_oblock); + r = demote_cblock(mq, &result->old_oblock); + if (r) { + result->op = POLICY_MISS; + return 0; + } } else result->op = POLICY_NEW; - result->cblock = e->cblock = cblock; + new_e = alloc_entry(&mq->cache_pool); + BUG_ON(!new_e); + + new_e->oblock = e->oblock; + new_e->dirty = false; + new_e->hit_count = e->hit_count; + new_e->generation = e->generation; + new_e->tick = e->tick; del(mq, e); - e->in_cache = true; - push(mq, e); + free_entry(&mq->pre_cache_pool, e); + push(mq, new_e); + + result->cblock = infer_cblock(&mq->cache_pool, new_e); return 0; } @@ -743,7 +746,7 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e, static void insert_in_pre_cache(struct mq_policy *mq, dm_oblock_t oblock) { - struct entry *e = alloc_entry(mq); + struct entry *e = alloc_entry(&mq->pre_cache_pool); if (!e) /* @@ -757,7 +760,7 @@ static void insert_in_pre_cache(struct mq_policy *mq, return; } - e->in_cache = false; + e->dirty = false; e->oblock = oblock; e->hit_count = 1; e->generation = mq->generation; @@ -767,30 +770,36 @@ static void insert_in_pre_cache(struct mq_policy *mq, static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock, struct policy_result *result) { + int r; struct entry *e; - dm_cblock_t cblock; - if (find_free_cblock(mq, &cblock) == -ENOSPC) { - result->op = POLICY_MISS; - insert_in_pre_cache(mq, oblock); - return; - } + if (epool_empty(&mq->cache_pool)) { + result->op = POLICY_REPLACE; + r = demote_cblock(mq, &result->old_oblock); + if (unlikely(r)) { + result->op = POLICY_MISS; + insert_in_pre_cache(mq, oblock); + return; + } - e = alloc_entry(mq); - if (unlikely(!e)) { - result->op = POLICY_MISS; - return; + /* + * This will always succeed, since we've just demoted. + */ + e = alloc_entry(&mq->cache_pool); + BUG_ON(!e); + + } else { + e = alloc_entry(&mq->cache_pool); + result->op = POLICY_NEW; } e->oblock = oblock; - e->cblock = cblock; - e->in_cache = true; + e->dirty = false; e->hit_count = 1; e->generation = mq->generation; push(mq, e); - result->op = POLICY_NEW; - result->cblock = e->cblock; + result->cblock = infer_cblock(&mq->cache_pool, e); } static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock, @@ -821,13 +830,16 @@ static int map(struct mq_policy *mq, dm_oblock_t oblock, int r = 0; struct entry *e = hash_lookup(mq, oblock); - if (e && e->in_cache) + if (e && in_cache(mq, e)) r = cache_entry_found(mq, e, result); + else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) result->op = POLICY_MISS; + else if (e) r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock, data_dir, result); + else r = no_entry_found(mq, oblock, can_migrate, discarded_oblock, data_dir, result); @@ -854,9 +866,9 @@ static void mq_destroy(struct dm_cache_policy *p) { struct mq_policy *mq = to_mq_policy(p); - free_bitset(mq->allocation_bitset); kfree(mq->table); - free_entries(mq); + epool_exit(&mq->cache_pool); + epool_exit(&mq->pre_cache_pool); kfree(mq); } @@ -904,8 +916,8 @@ static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t return -EWOULDBLOCK; e = hash_lookup(mq, oblock); - if (e && e->in_cache) { - *cblock = e->cblock; + if (e && in_cache(mq, e)) { + *cblock = infer_cblock(&mq->cache_pool, e); r = 0; } else r = -ENOENT; @@ -915,6 +927,36 @@ static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t return r; } +static void __mq_set_clear_dirty(struct mq_policy *mq, dm_oblock_t oblock, bool set) +{ + struct entry *e; + + e = hash_lookup(mq, oblock); + BUG_ON(!e || !in_cache(mq, e)); + + del(mq, e); + e->dirty = set; + push(mq, e); +} + +static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +{ + struct mq_policy *mq = to_mq_policy(p); + + mutex_lock(&mq->lock); + __mq_set_clear_dirty(mq, oblock, true); + mutex_unlock(&mq->lock); +} + +static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +{ + struct mq_policy *mq = to_mq_policy(p); + + mutex_lock(&mq->lock); + __mq_set_clear_dirty(mq, oblock, false); + mutex_unlock(&mq->lock); +} + static int mq_load_mapping(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t cblock, uint32_t hint, bool hint_valid) @@ -922,13 +964,9 @@ static int mq_load_mapping(struct dm_cache_policy *p, struct mq_policy *mq = to_mq_policy(p); struct entry *e; - e = alloc_entry(mq); - if (!e) - return -ENOMEM; - - e->cblock = cblock; + e = alloc_particular_entry(&mq->cache_pool, cblock); e->oblock = oblock; - e->in_cache = true; + e->dirty = false; /* this gets corrected in a minute */ e->hit_count = hint_valid ? hint : 1; e->generation = mq->generation; push(mq, e); @@ -936,57 +974,126 @@ static int mq_load_mapping(struct dm_cache_policy *p, return 0; } +static int mq_save_hints(struct mq_policy *mq, struct queue *q, + policy_walk_fn fn, void *context) +{ + int r; + unsigned level; + struct entry *e; + + for (level = 0; level < NR_QUEUE_LEVELS; level++) + list_for_each_entry(e, q->qs + level, list) { + r = fn(context, infer_cblock(&mq->cache_pool, e), + e->oblock, e->hit_count); + if (r) + return r; + } + + return 0; +} + static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn, void *context) { struct mq_policy *mq = to_mq_policy(p); int r = 0; - struct entry *e; - unsigned level; mutex_lock(&mq->lock); - for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_for_each_entry(e, &mq->cache.qs[level], list) { - r = fn(context, e->cblock, e->oblock, e->hit_count); - if (r) - goto out; - } + r = mq_save_hints(mq, &mq->cache_clean, fn, context); + if (!r) + r = mq_save_hints(mq, &mq->cache_dirty, fn, context); -out: mutex_unlock(&mq->lock); return r; } +static void __remove_mapping(struct mq_policy *mq, dm_oblock_t oblock) +{ + struct entry *e; + + e = hash_lookup(mq, oblock); + BUG_ON(!e || !in_cache(mq, e)); + + del(mq, e); + free_entry(&mq->cache_pool, e); +} + static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) { struct mq_policy *mq = to_mq_policy(p); - struct entry *e; mutex_lock(&mq->lock); + __remove_mapping(mq, oblock); + mutex_unlock(&mq->lock); +} - e = hash_lookup(mq, oblock); +static int __remove_cblock(struct mq_policy *mq, dm_cblock_t cblock) +{ + struct entry *e = epool_find(&mq->cache_pool, cblock); - BUG_ON(!e || !e->in_cache); + if (!e) + return -ENODATA; del(mq, e); - e->in_cache = false; - push(mq, e); + free_entry(&mq->cache_pool, e); + return 0; +} + +static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) +{ + int r; + struct mq_policy *mq = to_mq_policy(p); + + mutex_lock(&mq->lock); + r = __remove_cblock(mq, cblock); mutex_unlock(&mq->lock); + + return r; } -static void force_mapping(struct mq_policy *mq, - dm_oblock_t current_oblock, dm_oblock_t new_oblock) +static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock, + dm_cblock_t *cblock) { - struct entry *e = hash_lookup(mq, current_oblock); + struct entry *e = pop(mq, &mq->cache_dirty); - BUG_ON(!e || !e->in_cache); + if (!e) + return -ENODATA; - del(mq, e); - e->oblock = new_oblock; + *oblock = e->oblock; + *cblock = infer_cblock(&mq->cache_pool, e); + e->dirty = false; push(mq, e); + + return 0; +} + +static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock, + dm_cblock_t *cblock) +{ + int r; + struct mq_policy *mq = to_mq_policy(p); + + mutex_lock(&mq->lock); + r = __mq_writeback_work(mq, oblock, cblock); + mutex_unlock(&mq->lock); + + return r; +} + +static void __force_mapping(struct mq_policy *mq, + dm_oblock_t current_oblock, dm_oblock_t new_oblock) +{ + struct entry *e = hash_lookup(mq, current_oblock); + + if (e && in_cache(mq, e)) { + del(mq, e); + e->oblock = new_oblock; + e->dirty = true; + push(mq, e); + } } static void mq_force_mapping(struct dm_cache_policy *p, @@ -995,16 +1102,20 @@ static void mq_force_mapping(struct dm_cache_policy *p, struct mq_policy *mq = to_mq_policy(p); mutex_lock(&mq->lock); - force_mapping(mq, current_oblock, new_oblock); + __force_mapping(mq, current_oblock, new_oblock); mutex_unlock(&mq->lock); } static dm_cblock_t mq_residency(struct dm_cache_policy *p) { + dm_cblock_t r; struct mq_policy *mq = to_mq_policy(p); - /* FIXME: lock mutex, not sure we can block here */ - return to_cblock(mq->nr_cblocks_allocated); + mutex_lock(&mq->lock); + r = to_cblock(mq->cache_pool.nr_allocated); + mutex_unlock(&mq->lock); + + return r; } static void mq_tick(struct dm_cache_policy *p) @@ -1057,10 +1168,13 @@ static void init_policy_functions(struct mq_policy *mq) mq->policy.destroy = mq_destroy; mq->policy.map = mq_map; mq->policy.lookup = mq_lookup; + mq->policy.set_dirty = mq_set_dirty; + mq->policy.clear_dirty = mq_clear_dirty; mq->policy.load_mapping = mq_load_mapping; mq->policy.walk_mappings = mq_walk_mappings; mq->policy.remove_mapping = mq_remove_mapping; - mq->policy.writeback_work = NULL; + mq->policy.remove_cblock = mq_remove_cblock; + mq->policy.writeback_work = mq_writeback_work; mq->policy.force_mapping = mq_force_mapping; mq->policy.residency = mq_residency; mq->policy.tick = mq_tick; @@ -1072,7 +1186,6 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size) { - int r; struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL); if (!mq) @@ -1080,8 +1193,18 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, init_policy_functions(mq); iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT); - mq->cache_size = cache_size; + + if (epool_init(&mq->pre_cache_pool, from_cblock(cache_size))) { + DMERR("couldn't initialize pool of pre-cache entries"); + goto bad_pre_cache_init; + } + + if (epool_init(&mq->cache_pool, from_cblock(cache_size))) { + DMERR("couldn't initialize pool of cache entries"); + goto bad_cache_init; + } + mq->tick_protected = 0; mq->tick = 0; mq->hit_count = 0; @@ -1089,20 +1212,12 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, mq->promote_threshold = 0; mutex_init(&mq->lock); spin_lock_init(&mq->tick_lock); - mq->find_free_nr_words = dm_div_up(from_cblock(mq->cache_size), BITS_PER_LONG); - mq->find_free_last_word = 0; queue_init(&mq->pre_cache); - queue_init(&mq->cache); - mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U); + queue_init(&mq->cache_clean); + queue_init(&mq->cache_dirty); - mq->nr_entries = 2 * from_cblock(cache_size); - r = alloc_entries(mq, mq->nr_entries); - if (r) - goto bad_cache_alloc; - - mq->nr_entries_allocated = 0; - mq->nr_cblocks_allocated = 0; + mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U); mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); mq->hash_bits = ffs(mq->nr_buckets) - 1; @@ -1110,17 +1225,13 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, if (!mq->table) goto bad_alloc_table; - mq->allocation_bitset = alloc_bitset(from_cblock(cache_size)); - if (!mq->allocation_bitset) - goto bad_alloc_bitset; - return &mq->policy; -bad_alloc_bitset: - kfree(mq->table); bad_alloc_table: - free_entries(mq); -bad_cache_alloc: + epool_exit(&mq->cache_pool); +bad_cache_init: + epool_exit(&mq->pre_cache_pool); +bad_pre_cache_init: kfree(mq); return NULL; @@ -1130,7 +1241,7 @@ bad_cache_alloc: static struct dm_cache_policy_type mq_policy_type = { .name = "mq", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create @@ -1138,7 +1249,7 @@ static struct dm_cache_policy_type mq_policy_type = { static struct dm_cache_policy_type default_policy_type = { .name = "default", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c index 21c03c570c06..d80057968407 100644 --- a/drivers/md/dm-cache-policy.c +++ b/drivers/md/dm-cache-policy.c @@ -119,13 +119,13 @@ struct dm_cache_policy *dm_cache_policy_create(const char *name, type = get_policy(name); if (!type) { DMWARN("unknown policy type"); - return NULL; + return ERR_PTR(-EINVAL); } p = type->create(cache_size, origin_size, cache_block_size); if (!p) { put_policy(type); - return NULL; + return ERR_PTR(-ENOMEM); } p->private = type; diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h index 33369ca9614f..052c00a84a5c 100644 --- a/drivers/md/dm-cache-policy.h +++ b/drivers/md/dm-cache-policy.h @@ -135,9 +135,6 @@ struct dm_cache_policy { */ int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock); - /* - * oblock must be a mapped block. Must not block. - */ void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); @@ -159,8 +156,24 @@ struct dm_cache_policy { void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock, dm_oblock_t new_oblock); - int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock); + /* + * This is called via the invalidate_cblocks message. It is + * possible the particular cblock has already been removed due to a + * write io in passthrough mode. In which case this should return + * -ENODATA. + */ + int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock); + /* + * Provide a dirty block to be written back by the core target. + * + * Returns: + * + * 0 and @cblock,@oblock: block to write back provided + * + * -ENODATA: no dirty blocks available + */ + int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock); /* * How full is the cache? diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 29569768ffbf..9efcf1059b99 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -61,6 +61,34 @@ static void free_bitset(unsigned long *bits) /*----------------------------------------------------------------*/ +/* + * There are a couple of places where we let a bio run, but want to do some + * work before calling its endio function. We do this by temporarily + * changing the endio fn. + */ +struct dm_hook_info { + bio_end_io_t *bi_end_io; + void *bi_private; +}; + +static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio, + bio_end_io_t *bi_end_io, void *bi_private) +{ + h->bi_end_io = bio->bi_end_io; + h->bi_private = bio->bi_private; + + bio->bi_end_io = bi_end_io; + bio->bi_private = bi_private; +} + +static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) +{ + bio->bi_end_io = h->bi_end_io; + bio->bi_private = h->bi_private; +} + +/*----------------------------------------------------------------*/ + #define PRISON_CELLS 1024 #define MIGRATION_POOL_SIZE 128 #define COMMIT_PERIOD HZ @@ -76,14 +104,37 @@ static void free_bitset(unsigned long *bits) /* * FIXME: the cache is read/write for the time being. */ -enum cache_mode { +enum cache_metadata_mode { CM_WRITE, /* metadata may be changed */ CM_READ_ONLY, /* metadata may not be changed */ }; +enum cache_io_mode { + /* + * Data is written to cached blocks only. These blocks are marked + * dirty. If you lose the cache device you will lose data. + * Potential performance increase for both reads and writes. + */ + CM_IO_WRITEBACK, + + /* + * Data is written to both cache and origin. Blocks are never + * dirty. Potential performance benfit for reads only. + */ + CM_IO_WRITETHROUGH, + + /* + * A degraded mode useful for various cache coherency situations + * (eg, rolling back snapshots). Reads and writes always go to the + * origin. If a write goes to a cached oblock, then the cache + * block is invalidated. + */ + CM_IO_PASSTHROUGH +}; + struct cache_features { - enum cache_mode mode; - bool write_through:1; + enum cache_metadata_mode mode; + enum cache_io_mode io_mode; }; struct cache_stats { @@ -99,6 +150,25 @@ struct cache_stats { atomic_t discard_count; }; +/* + * Defines a range of cblocks, begin to (end - 1) are in the range. end is + * the one-past-the-end value. + */ +struct cblock_range { + dm_cblock_t begin; + dm_cblock_t end; +}; + +struct invalidation_request { + struct list_head list; + struct cblock_range *cblocks; + + atomic_t complete; + int err; + + wait_queue_head_t result_wait; +}; + struct cache { struct dm_target *ti; struct dm_target_callbacks callbacks; @@ -148,6 +218,10 @@ struct cache { wait_queue_head_t migration_wait; atomic_t nr_migrations; + wait_queue_head_t quiescing_wait; + atomic_t quiescing; + atomic_t quiescing_ack; + /* * cache_size entries, dirty if set */ @@ -186,7 +260,7 @@ struct cache { bool need_tick_bio:1; bool sized:1; - bool quiescing:1; + bool invalidate:1; bool commit_requested:1; bool loaded_mappings:1; bool loaded_discards:1; @@ -197,6 +271,12 @@ struct cache { struct cache_features features; struct cache_stats stats; + + /* + * Invalidation fields. + */ + spinlock_t invalidation_lock; + struct list_head invalidation_requests; }; struct per_bio_data { @@ -211,7 +291,7 @@ struct per_bio_data { */ struct cache *cache; dm_cblock_t cblock; - bio_end_io_t *saved_bi_end_io; + struct dm_hook_info hook_info; struct dm_bio_details bio_details; }; @@ -228,6 +308,8 @@ struct dm_cache_migration { bool writeback:1; bool demote:1; bool promote:1; + bool requeue_holder:1; + bool invalidate:1; struct dm_bio_prison_cell *old_ocell; struct dm_bio_prison_cell *new_ocell; @@ -533,9 +615,24 @@ static void save_stats(struct cache *cache) #define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) #define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) +static bool writethrough_mode(struct cache_features *f) +{ + return f->io_mode == CM_IO_WRITETHROUGH; +} + +static bool writeback_mode(struct cache_features *f) +{ + return f->io_mode == CM_IO_WRITEBACK; +} + +static bool passthrough_mode(struct cache_features *f) +{ + return f->io_mode == CM_IO_PASSTHROUGH; +} + static size_t get_per_bio_data_size(struct cache *cache) { - return cache->features.write_through ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; + return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; } static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) @@ -605,6 +702,7 @@ static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, dm_oblock_t oblock, dm_cblock_t cblock) { + check_if_tick_bio_needed(cache, bio); remap_to_cache(cache, bio, cblock); if (bio_data_dir(bio) == WRITE) { set_dirty(cache, oblock, cblock); @@ -662,7 +760,8 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio) static void writethrough_endio(struct bio *bio, int err) { struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); - bio->bi_end_io = pb->saved_bi_end_io; + + dm_unhook_bio(&pb->hook_info, bio); if (err) { bio_endio(bio, err); @@ -693,9 +792,8 @@ static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, pb->cache = cache; pb->cblock = cblock; - pb->saved_bi_end_io = bio->bi_end_io; + dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL); dm_bio_record(&pb->bio_details, bio); - bio->bi_end_io = writethrough_endio; remap_to_origin_clear_discard(pb->cache, bio, oblock); } @@ -748,8 +846,9 @@ static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, static void cleanup_migration(struct dm_cache_migration *mg) { - dec_nr_migrations(mg->cache); + struct cache *cache = mg->cache; free_migration(mg); + dec_nr_migrations(cache); } static void migration_failure(struct dm_cache_migration *mg) @@ -765,13 +864,13 @@ static void migration_failure(struct dm_cache_migration *mg) DMWARN_LIMIT("demotion failed; couldn't copy block"); policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); - cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1); + cell_defer(cache, mg->old_ocell, mg->promote ? false : true); if (mg->promote) - cell_defer(cache, mg->new_ocell, 1); + cell_defer(cache, mg->new_ocell, true); } else { DMWARN_LIMIT("promotion failed; couldn't copy block"); policy_remove_mapping(cache->policy, mg->new_oblock); - cell_defer(cache, mg->new_ocell, 1); + cell_defer(cache, mg->new_ocell, true); } cleanup_migration(mg); @@ -823,7 +922,7 @@ static void migration_success_post_commit(struct dm_cache_migration *mg) return; } else if (mg->demote) { - cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1); + cell_defer(cache, mg->old_ocell, mg->promote ? false : true); if (mg->promote) { mg->demote = false; @@ -832,11 +931,19 @@ static void migration_success_post_commit(struct dm_cache_migration *mg) list_add_tail(&mg->list, &cache->quiesced_migrations); spin_unlock_irqrestore(&cache->lock, flags); - } else + } else { + if (mg->invalidate) + policy_remove_mapping(cache->policy, mg->old_oblock); cleanup_migration(mg); + } } else { - cell_defer(cache, mg->new_ocell, true); + if (mg->requeue_holder) + cell_defer(cache, mg->new_ocell, true); + else { + bio_endio(mg->new_ocell->holder, 0); + cell_defer(cache, mg->new_ocell, false); + } clear_dirty(cache, mg->new_oblock, mg->cblock); cleanup_migration(mg); } @@ -881,8 +988,46 @@ static void issue_copy_real(struct dm_cache_migration *mg) r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); } - if (r < 0) + if (r < 0) { + DMERR_LIMIT("issuing migration failed"); migration_failure(mg); + } +} + +static void overwrite_endio(struct bio *bio, int err) +{ + struct dm_cache_migration *mg = bio->bi_private; + struct cache *cache = mg->cache; + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + unsigned long flags; + + if (err) + mg->err = true; + + spin_lock_irqsave(&cache->lock, flags); + list_add_tail(&mg->list, &cache->completed_migrations); + dm_unhook_bio(&pb->hook_info, bio); + mg->requeue_holder = false; + spin_unlock_irqrestore(&cache->lock, flags); + + wake_worker(cache); +} + +static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) +{ + size_t pb_data_size = get_per_bio_data_size(mg->cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + + dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); + remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock); + generic_make_request(bio); +} + +static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) +{ + return (bio_data_dir(bio) == WRITE) && + (bio->bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); } static void avoid_copy(struct dm_cache_migration *mg) @@ -899,9 +1044,17 @@ static void issue_copy(struct dm_cache_migration *mg) if (mg->writeback || mg->demote) avoid = !is_dirty(cache, mg->cblock) || is_discarded_oblock(cache, mg->old_oblock); - else + else { + struct bio *bio = mg->new_ocell->holder; + avoid = is_discarded_oblock(cache, mg->new_oblock); + if (!avoid && bio_writes_complete_block(cache, bio)) { + issue_overwrite(mg, bio); + return; + } + } + avoid ? avoid_copy(mg) : issue_copy_real(mg); } @@ -991,6 +1144,8 @@ static void promote(struct cache *cache, struct prealloc *structs, mg->writeback = false; mg->demote = false; mg->promote = true; + mg->requeue_holder = true; + mg->invalidate = false; mg->cache = cache; mg->new_oblock = oblock; mg->cblock = cblock; @@ -1012,6 +1167,8 @@ static void writeback(struct cache *cache, struct prealloc *structs, mg->writeback = true; mg->demote = false; mg->promote = false; + mg->requeue_holder = true; + mg->invalidate = false; mg->cache = cache; mg->old_oblock = oblock; mg->cblock = cblock; @@ -1035,6 +1192,8 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs, mg->writeback = false; mg->demote = true; mg->promote = true; + mg->requeue_holder = true; + mg->invalidate = false; mg->cache = cache; mg->old_oblock = old_oblock; mg->new_oblock = new_oblock; @@ -1047,6 +1206,33 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs, quiesce_migration(mg); } +/* + * Invalidate a cache entry. No writeback occurs; any changes in the cache + * block are thrown away. + */ +static void invalidate(struct cache *cache, struct prealloc *structs, + dm_oblock_t oblock, dm_cblock_t cblock, + struct dm_bio_prison_cell *cell) +{ + struct dm_cache_migration *mg = prealloc_get_migration(structs); + + mg->err = false; + mg->writeback = false; + mg->demote = true; + mg->promote = false; + mg->requeue_holder = true; + mg->invalidate = true; + mg->cache = cache; + mg->old_oblock = oblock; + mg->cblock = cblock; + mg->old_ocell = cell; + mg->new_ocell = NULL; + mg->start_jiffies = jiffies; + + inc_nr_migrations(cache); + quiesce_migration(mg); +} + /*---------------------------------------------------------------- * bio processing *--------------------------------------------------------------*/ @@ -1109,13 +1295,6 @@ static bool spare_migration_bandwidth(struct cache *cache) return current_volume < cache->migration_threshold; } -static bool is_writethrough_io(struct cache *cache, struct bio *bio, - dm_cblock_t cblock) -{ - return bio_data_dir(bio) == WRITE && - cache->features.write_through && !is_dirty(cache, cblock); -} - static void inc_hit_counter(struct cache *cache, struct bio *bio) { atomic_inc(bio_data_dir(bio) == READ ? @@ -1128,6 +1307,15 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio) &cache->stats.read_miss : &cache->stats.write_miss); } +static void issue_cache_bio(struct cache *cache, struct bio *bio, + struct per_bio_data *pb, + dm_oblock_t oblock, dm_cblock_t cblock) +{ + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + remap_to_cache_dirty(cache, bio, oblock, cblock); + issue(cache, bio); +} + static void process_bio(struct cache *cache, struct prealloc *structs, struct bio *bio) { @@ -1139,7 +1327,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs, size_t pb_data_size = get_per_bio_data_size(cache); struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); bool discarded_block = is_discarded_oblock(cache, block); - bool can_migrate = discarded_block || spare_migration_bandwidth(cache); + bool passthrough = passthrough_mode(&cache->features); + bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache)); /* * Check to see if that block is currently migrating. @@ -1160,15 +1349,39 @@ static void process_bio(struct cache *cache, struct prealloc *structs, switch (lookup_result.op) { case POLICY_HIT: - inc_hit_counter(cache, bio); - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + if (passthrough) { + inc_miss_counter(cache, bio); - if (is_writethrough_io(cache, bio, lookup_result.cblock)) - remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); - else - remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); + /* + * Passthrough always maps to the origin, + * invalidating any cache blocks that are written + * to. + */ + + if (bio_data_dir(bio) == WRITE) { + atomic_inc(&cache->stats.demotion); + invalidate(cache, structs, block, lookup_result.cblock, new_ocell); + release_cell = false; + + } else { + /* FIXME: factor out issue_origin() */ + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + remap_to_origin_clear_discard(cache, bio, block); + issue(cache, bio); + } + } else { + inc_hit_counter(cache, bio); + + if (bio_data_dir(bio) == WRITE && + writethrough_mode(&cache->features) && + !is_dirty(cache, lookup_result.cblock)) { + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); + issue(cache, bio); + } else + issue_cache_bio(cache, bio, pb, block, lookup_result.cblock); + } - issue(cache, bio); break; case POLICY_MISS: @@ -1227,15 +1440,17 @@ static int need_commit_due_to_time(struct cache *cache) static int commit_if_needed(struct cache *cache) { - if (dm_cache_changed_this_transaction(cache->cmd) && - (cache->commit_requested || need_commit_due_to_time(cache))) { + int r = 0; + + if ((cache->commit_requested || need_commit_due_to_time(cache)) && + dm_cache_changed_this_transaction(cache->cmd)) { atomic_inc(&cache->stats.commit_count); - cache->last_commit_jiffies = jiffies; cache->commit_requested = false; - return dm_cache_commit(cache->cmd, false); + r = dm_cache_commit(cache->cmd, false); + cache->last_commit_jiffies = jiffies; } - return 0; + return r; } static void process_deferred_bios(struct cache *cache) @@ -1344,36 +1559,88 @@ static void writeback_some_dirty_blocks(struct cache *cache) } /*---------------------------------------------------------------- - * Main worker loop + * Invalidations. + * Dropping something from the cache *without* writing back. *--------------------------------------------------------------*/ -static void start_quiescing(struct cache *cache) + +static void process_invalidation_request(struct cache *cache, struct invalidation_request *req) { - unsigned long flags; + int r = 0; + uint64_t begin = from_cblock(req->cblocks->begin); + uint64_t end = from_cblock(req->cblocks->end); - spin_lock_irqsave(&cache->lock, flags); - cache->quiescing = 1; - spin_unlock_irqrestore(&cache->lock, flags); + while (begin != end) { + r = policy_remove_cblock(cache->policy, to_cblock(begin)); + if (!r) { + r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin)); + if (r) + break; + + } else if (r == -ENODATA) { + /* harmless, already unmapped */ + r = 0; + + } else { + DMERR("policy_remove_cblock failed"); + break; + } + + begin++; + } + + cache->commit_requested = true; + + req->err = r; + atomic_set(&req->complete, 1); + + wake_up(&req->result_wait); } -static void stop_quiescing(struct cache *cache) +static void process_invalidation_requests(struct cache *cache) { - unsigned long flags; + struct list_head list; + struct invalidation_request *req, *tmp; - spin_lock_irqsave(&cache->lock, flags); - cache->quiescing = 0; - spin_unlock_irqrestore(&cache->lock, flags); + INIT_LIST_HEAD(&list); + spin_lock(&cache->invalidation_lock); + list_splice_init(&cache->invalidation_requests, &list); + spin_unlock(&cache->invalidation_lock); + + list_for_each_entry_safe (req, tmp, &list, list) + process_invalidation_request(cache, req); } +/*---------------------------------------------------------------- + * Main worker loop + *--------------------------------------------------------------*/ static bool is_quiescing(struct cache *cache) { - int r; - unsigned long flags; + return atomic_read(&cache->quiescing); +} - spin_lock_irqsave(&cache->lock, flags); - r = cache->quiescing; - spin_unlock_irqrestore(&cache->lock, flags); +static void ack_quiescing(struct cache *cache) +{ + if (is_quiescing(cache)) { + atomic_inc(&cache->quiescing_ack); + wake_up(&cache->quiescing_wait); + } +} - return r; +static void wait_for_quiescing_ack(struct cache *cache) +{ + wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack)); +} + +static void start_quiescing(struct cache *cache) +{ + atomic_inc(&cache->quiescing); + wait_for_quiescing_ack(cache); +} + +static void stop_quiescing(struct cache *cache) +{ + atomic_set(&cache->quiescing, 0); + atomic_set(&cache->quiescing_ack, 0); } static void wait_for_migrations(struct cache *cache) @@ -1412,7 +1679,8 @@ static int more_work(struct cache *cache) !bio_list_empty(&cache->deferred_writethrough_bios) || !list_empty(&cache->quiesced_migrations) || !list_empty(&cache->completed_migrations) || - !list_empty(&cache->need_commit_migrations); + !list_empty(&cache->need_commit_migrations) || + cache->invalidate; } static void do_worker(struct work_struct *ws) @@ -1420,16 +1688,16 @@ static void do_worker(struct work_struct *ws) struct cache *cache = container_of(ws, struct cache, worker); do { - if (!is_quiescing(cache)) + if (!is_quiescing(cache)) { + writeback_some_dirty_blocks(cache); + process_deferred_writethrough_bios(cache); process_deferred_bios(cache); + process_invalidation_requests(cache); + } process_migrations(cache, &cache->quiesced_migrations, issue_copy); process_migrations(cache, &cache->completed_migrations, complete_migration); - writeback_some_dirty_blocks(cache); - - process_deferred_writethrough_bios(cache); - if (commit_if_needed(cache)) { process_deferred_flush_bios(cache, false); @@ -1442,6 +1710,9 @@ static void do_worker(struct work_struct *ws) process_migrations(cache, &cache->need_commit_migrations, migration_success_post_commit); } + + ack_quiescing(cache); + } while (more_work(cache)); } @@ -1715,7 +1986,7 @@ static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as, static void init_features(struct cache_features *cf) { cf->mode = CM_WRITE; - cf->write_through = false; + cf->io_mode = CM_IO_WRITEBACK; } static int parse_features(struct cache_args *ca, struct dm_arg_set *as, @@ -1740,10 +2011,13 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as, arg = dm_shift_arg(as); if (!strcasecmp(arg, "writeback")) - cf->write_through = false; + cf->io_mode = CM_IO_WRITEBACK; else if (!strcasecmp(arg, "writethrough")) - cf->write_through = true; + cf->io_mode = CM_IO_WRITETHROUGH; + + else if (!strcasecmp(arg, "passthrough")) + cf->io_mode = CM_IO_PASSTHROUGH; else { *error = "Unrecognised cache feature requested"; @@ -1872,14 +2146,15 @@ static int set_config_values(struct cache *cache, int argc, const char **argv) static int create_cache_policy(struct cache *cache, struct cache_args *ca, char **error) { - cache->policy = dm_cache_policy_create(ca->policy_name, - cache->cache_size, - cache->origin_sectors, - cache->sectors_per_block); - if (!cache->policy) { + struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name, + cache->cache_size, + cache->origin_sectors, + cache->sectors_per_block); + if (IS_ERR(p)) { *error = "Error creating cache's policy"; - return -ENOMEM; + return PTR_ERR(p); } + cache->policy = p; return 0; } @@ -1995,6 +2270,22 @@ static int cache_create(struct cache_args *ca, struct cache **result) } cache->cmd = cmd; + if (passthrough_mode(&cache->features)) { + bool all_clean; + + r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); + if (r) { + *error = "dm_cache_metadata_all_clean() failed"; + goto bad; + } + + if (!all_clean) { + *error = "Cannot enter passthrough mode unless all blocks are clean"; + r = -EINVAL; + goto bad; + } + } + spin_lock_init(&cache->lock); bio_list_init(&cache->deferred_bios); bio_list_init(&cache->deferred_flush_bios); @@ -2005,6 +2296,10 @@ static int cache_create(struct cache_args *ca, struct cache **result) atomic_set(&cache->nr_migrations, 0); init_waitqueue_head(&cache->migration_wait); + init_waitqueue_head(&cache->quiescing_wait); + atomic_set(&cache->quiescing, 0); + atomic_set(&cache->quiescing_ack, 0); + r = -ENOMEM; cache->nr_dirty = 0; cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); @@ -2064,7 +2359,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) cache->need_tick_bio = true; cache->sized = false; - cache->quiescing = false; + cache->invalidate = false; cache->commit_requested = false; cache->loaded_mappings = false; cache->loaded_discards = false; @@ -2078,6 +2373,9 @@ static int cache_create(struct cache_args *ca, struct cache **result) atomic_set(&cache->stats.commit_count, 0); atomic_set(&cache->stats.discard_count, 0); + spin_lock_init(&cache->invalidation_lock); + INIT_LIST_HEAD(&cache->invalidation_requests); + *result = cache; return 0; @@ -2207,17 +2505,37 @@ static int cache_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } + r = DM_MAPIO_REMAPPED; switch (lookup_result.op) { case POLICY_HIT: - inc_hit_counter(cache, bio); - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + if (passthrough_mode(&cache->features)) { + if (bio_data_dir(bio) == WRITE) { + /* + * We need to invalidate this block, so + * defer for the worker thread. + */ + cell_defer(cache, cell, true); + r = DM_MAPIO_SUBMITTED; + + } else { + pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); + inc_miss_counter(cache, bio); + remap_to_origin_clear_discard(cache, bio, block); + + cell_defer(cache, cell, false); + } - if (is_writethrough_io(cache, bio, lookup_result.cblock)) - remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); - else - remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); + } else { + inc_hit_counter(cache, bio); - cell_defer(cache, cell, false); + if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && + !is_dirty(cache, lookup_result.cblock)) + remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); + else + remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); + + cell_defer(cache, cell, false); + } break; case POLICY_MISS: @@ -2242,10 +2560,10 @@ static int cache_map(struct dm_target *ti, struct bio *bio) DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__, (unsigned) lookup_result.op); bio_io_error(bio); - return DM_MAPIO_SUBMITTED; + r = DM_MAPIO_SUBMITTED; } - return DM_MAPIO_REMAPPED; + return r; } static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) @@ -2406,26 +2724,71 @@ static int load_discard(void *context, sector_t discard_block_size, return 0; } +static dm_cblock_t get_cache_dev_size(struct cache *cache) +{ + sector_t size = get_dev_size(cache->cache_dev); + (void) sector_div(size, cache->sectors_per_block); + return to_cblock(size); +} + +static bool can_resize(struct cache *cache, dm_cblock_t new_size) +{ + if (from_cblock(new_size) > from_cblock(cache->cache_size)) + return true; + + /* + * We can't drop a dirty block when shrinking the cache. + */ + while (from_cblock(new_size) < from_cblock(cache->cache_size)) { + new_size = to_cblock(from_cblock(new_size) + 1); + if (is_dirty(cache, new_size)) { + DMERR("unable to shrink cache; cache block %llu is dirty", + (unsigned long long) from_cblock(new_size)); + return false; + } + } + + return true; +} + +static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) +{ + int r; + + r = dm_cache_resize(cache->cmd, cache->cache_size); + if (r) { + DMERR("could not resize cache metadata"); + return r; + } + + cache->cache_size = new_size; + + return 0; +} + static int cache_preresume(struct dm_target *ti) { int r = 0; struct cache *cache = ti->private; - sector_t actual_cache_size = get_dev_size(cache->cache_dev); - (void) sector_div(actual_cache_size, cache->sectors_per_block); + dm_cblock_t csize = get_cache_dev_size(cache); /* * Check to see if the cache has resized. */ - if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) { - cache->cache_size = to_cblock(actual_cache_size); - - r = dm_cache_resize(cache->cmd, cache->cache_size); - if (r) { - DMERR("could not resize cache metadata"); + if (!cache->sized) { + r = resize_cache_dev(cache, csize); + if (r) return r; - } cache->sized = true; + + } else if (csize != cache->cache_size) { + if (!can_resize(cache, csize)) + return -EINVAL; + + r = resize_cache_dev(cache, csize); + if (r) + return r; } if (!cache->loaded_mappings) { @@ -2518,10 +2881,19 @@ static void cache_status(struct dm_target *ti, status_type_t type, (unsigned long long) from_cblock(residency), cache->nr_dirty); - if (cache->features.write_through) + if (writethrough_mode(&cache->features)) DMEMIT("1 writethrough "); - else - DMEMIT("0 "); + + else if (passthrough_mode(&cache->features)) + DMEMIT("1 passthrough "); + + else if (writeback_mode(&cache->features)) + DMEMIT("1 writeback "); + + else { + DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode); + goto err; + } DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold); if (sz < maxlen) { @@ -2553,7 +2925,128 @@ err: } /* - * Supports <key> <value>. + * A cache block range can take two forms: + * + * i) A single cblock, eg. '3456' + * ii) A begin and end cblock with dots between, eg. 123-234 + */ +static int parse_cblock_range(struct cache *cache, const char *str, + struct cblock_range *result) +{ + char dummy; + uint64_t b, e; + int r; + + /* + * Try and parse form (ii) first. + */ + r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy); + if (r < 0) + return r; + + if (r == 2) { + result->begin = to_cblock(b); + result->end = to_cblock(e); + return 0; + } + + /* + * That didn't work, try form (i). + */ + r = sscanf(str, "%llu%c", &b, &dummy); + if (r < 0) + return r; + + if (r == 1) { + result->begin = to_cblock(b); + result->end = to_cblock(from_cblock(result->begin) + 1u); + return 0; + } + + DMERR("invalid cblock range '%s'", str); + return -EINVAL; +} + +static int validate_cblock_range(struct cache *cache, struct cblock_range *range) +{ + uint64_t b = from_cblock(range->begin); + uint64_t e = from_cblock(range->end); + uint64_t n = from_cblock(cache->cache_size); + + if (b >= n) { + DMERR("begin cblock out of range: %llu >= %llu", b, n); + return -EINVAL; + } + + if (e > n) { + DMERR("end cblock out of range: %llu > %llu", e, n); + return -EINVAL; + } + + if (b >= e) { + DMERR("invalid cblock range: %llu >= %llu", b, e); + return -EINVAL; + } + + return 0; +} + +static int request_invalidation(struct cache *cache, struct cblock_range *range) +{ + struct invalidation_request req; + + INIT_LIST_HEAD(&req.list); + req.cblocks = range; + atomic_set(&req.complete, 0); + req.err = 0; + init_waitqueue_head(&req.result_wait); + + spin_lock(&cache->invalidation_lock); + list_add(&req.list, &cache->invalidation_requests); + spin_unlock(&cache->invalidation_lock); + wake_worker(cache); + + wait_event(req.result_wait, atomic_read(&req.complete)); + return req.err; +} + +static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, + const char **cblock_ranges) +{ + int r = 0; + unsigned i; + struct cblock_range range; + + if (!passthrough_mode(&cache->features)) { + DMERR("cache has to be in passthrough mode for invalidation"); + return -EPERM; + } + + for (i = 0; i < count; i++) { + r = parse_cblock_range(cache, cblock_ranges[i], &range); + if (r) + break; + + r = validate_cblock_range(cache, &range); + if (r) + break; + + /* + * Pass begin and end origin blocks to the worker and wake it. + */ + r = request_invalidation(cache, &range); + if (r) + break; + } + + return r; +} + +/* + * Supports + * "<key> <value>" + * and + * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]* * * The key migration_threshold is supported by the cache target core. */ @@ -2561,6 +3054,12 @@ static int cache_message(struct dm_target *ti, unsigned argc, char **argv) { struct cache *cache = ti->private; + if (!argc) + return -EINVAL; + + if (!strcasecmp(argv[0], "invalidate_cblocks")) + return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1); + if (argc != 2) return -EINVAL; @@ -2630,7 +3129,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 1, 1}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 0fce0bc1a957..81b0fa660452 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2,6 +2,7 @@ * Copyright (C) 2003 Christophe Saout <christophe@saout.de> * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. + * Copyright (C) 2013 Milan Broz <gmazyland@gmail.com> * * This file is released under the GPL. */ @@ -98,6 +99,13 @@ struct iv_lmk_private { u8 *seed; }; +#define TCW_WHITENING_SIZE 16 +struct iv_tcw_private { + struct crypto_shash *crc32_tfm; + u8 *iv_seed; + u8 *whitening; +}; + /* * Crypt: maps a linear range of a block device * and encrypts / decrypts at the same time. @@ -139,6 +147,7 @@ struct crypt_config { struct iv_essiv_private essiv; struct iv_benbi_private benbi; struct iv_lmk_private lmk; + struct iv_tcw_private tcw; } iv_gen_private; sector_t iv_offset; unsigned int iv_size; @@ -171,7 +180,8 @@ struct crypt_config { unsigned long flags; unsigned int key_size; - unsigned int key_parts; + unsigned int key_parts; /* independent parts in key buffer */ + unsigned int key_extra_size; /* additional keys length */ u8 key[0]; }; @@ -230,6 +240,16 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) * version 3: the same as version 2 with additional IV seed * (it uses 65 keys, last key is used as IV seed) * + * tcw: Compatible implementation of the block chaining mode used + * by the TrueCrypt device encryption system (prior to version 4.1). + * For more info see: http://www.truecrypt.org + * It operates on full 512 byte sectors and uses CBC + * with an IV derived from initial key and the sector number. + * In addition, whitening value is applied on every sector, whitening + * is calculated from initial key, sector number and mixed using CRC32. + * Note that this encryption scheme is vulnerable to watermarking attacks + * and should be used for old compatible containers access only. + * * plumb: unimplemented, see: * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 */ @@ -530,7 +550,7 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, char ctx[crypto_shash_descsize(lmk->hash_tfm)]; } sdesc; struct md5_state md5state; - u32 buf[4]; + __le32 buf[4]; int i, r; sdesc.desc.tfm = lmk->hash_tfm; @@ -608,6 +628,153 @@ static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, return r; } +static void crypt_iv_tcw_dtr(struct crypt_config *cc) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + + kzfree(tcw->iv_seed); + tcw->iv_seed = NULL; + kzfree(tcw->whitening); + tcw->whitening = NULL; + + if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm)) + crypto_free_shash(tcw->crc32_tfm); + tcw->crc32_tfm = NULL; +} + +static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + + if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) { + ti->error = "Wrong key size for TCW"; + return -EINVAL; + } + + tcw->crc32_tfm = crypto_alloc_shash("crc32", 0, 0); + if (IS_ERR(tcw->crc32_tfm)) { + ti->error = "Error initializing CRC32 in TCW"; + return PTR_ERR(tcw->crc32_tfm); + } + + tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL); + tcw->whitening = kzalloc(TCW_WHITENING_SIZE, GFP_KERNEL); + if (!tcw->iv_seed || !tcw->whitening) { + crypt_iv_tcw_dtr(cc); + ti->error = "Error allocating seed storage in TCW"; + return -ENOMEM; + } + + return 0; +} + +static int crypt_iv_tcw_init(struct crypt_config *cc) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + int key_offset = cc->key_size - cc->iv_size - TCW_WHITENING_SIZE; + + memcpy(tcw->iv_seed, &cc->key[key_offset], cc->iv_size); + memcpy(tcw->whitening, &cc->key[key_offset + cc->iv_size], + TCW_WHITENING_SIZE); + + return 0; +} + +static int crypt_iv_tcw_wipe(struct crypt_config *cc) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + + memset(tcw->iv_seed, 0, cc->iv_size); + memset(tcw->whitening, 0, TCW_WHITENING_SIZE); + + return 0; +} + +static int crypt_iv_tcw_whitening(struct crypt_config *cc, + struct dm_crypt_request *dmreq, + u8 *data) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + u64 sector = cpu_to_le64((u64)dmreq->iv_sector); + u8 buf[TCW_WHITENING_SIZE]; + struct { + struct shash_desc desc; + char ctx[crypto_shash_descsize(tcw->crc32_tfm)]; + } sdesc; + int i, r; + + /* xor whitening with sector number */ + memcpy(buf, tcw->whitening, TCW_WHITENING_SIZE); + crypto_xor(buf, (u8 *)§or, 8); + crypto_xor(&buf[8], (u8 *)§or, 8); + + /* calculate crc32 for every 32bit part and xor it */ + sdesc.desc.tfm = tcw->crc32_tfm; + sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + for (i = 0; i < 4; i++) { + r = crypto_shash_init(&sdesc.desc); + if (r) + goto out; + r = crypto_shash_update(&sdesc.desc, &buf[i * 4], 4); + if (r) + goto out; + r = crypto_shash_final(&sdesc.desc, &buf[i * 4]); + if (r) + goto out; + } + crypto_xor(&buf[0], &buf[12], 4); + crypto_xor(&buf[4], &buf[8], 4); + + /* apply whitening (8 bytes) to whole sector */ + for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++) + crypto_xor(data + i * 8, buf, 8); +out: + memset(buf, 0, sizeof(buf)); + return r; +} + +static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; + u64 sector = cpu_to_le64((u64)dmreq->iv_sector); + u8 *src; + int r = 0; + + /* Remove whitening from ciphertext */ + if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) { + src = kmap_atomic(sg_page(&dmreq->sg_in)); + r = crypt_iv_tcw_whitening(cc, dmreq, src + dmreq->sg_in.offset); + kunmap_atomic(src); + } + + /* Calculate IV */ + memcpy(iv, tcw->iv_seed, cc->iv_size); + crypto_xor(iv, (u8 *)§or, 8); + if (cc->iv_size > 8) + crypto_xor(&iv[8], (u8 *)§or, cc->iv_size - 8); + + return r; +} + +static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + u8 *dst; + int r; + + if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) + return 0; + + /* Apply whitening on ciphertext */ + dst = kmap_atomic(sg_page(&dmreq->sg_out)); + r = crypt_iv_tcw_whitening(cc, dmreq, dst + dmreq->sg_out.offset); + kunmap_atomic(dst); + + return r; +} + static struct crypt_iv_operations crypt_iv_plain_ops = { .generator = crypt_iv_plain_gen }; @@ -643,6 +810,15 @@ static struct crypt_iv_operations crypt_iv_lmk_ops = { .post = crypt_iv_lmk_post }; +static struct crypt_iv_operations crypt_iv_tcw_ops = { + .ctr = crypt_iv_tcw_ctr, + .dtr = crypt_iv_tcw_dtr, + .init = crypt_iv_tcw_init, + .wipe = crypt_iv_tcw_wipe, + .generator = crypt_iv_tcw_gen, + .post = crypt_iv_tcw_post +}; + static void crypt_convert_init(struct crypt_config *cc, struct convert_context *ctx, struct bio *bio_out, struct bio *bio_in, @@ -774,7 +950,7 @@ static int crypt_convert(struct crypt_config *cc, /* async */ case -EBUSY: wait_for_completion(&ctx->restart); - INIT_COMPLETION(ctx->restart); + reinit_completion(&ctx->restart); /* fall through*/ case -EINPROGRESS: this_cc->req = NULL; @@ -1274,9 +1450,12 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode) static int crypt_setkey_allcpus(struct crypt_config *cc) { - unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count); + unsigned subkey_size; int err = 0, i, r; + /* Ignore extra keys (which are used for IV etc) */ + subkey_size = (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count); + for (i = 0; i < cc->tfms_count; i++) { r = crypto_ablkcipher_setkey(cc->tfms[i], cc->key + (i * subkey_size), @@ -1409,6 +1588,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, return -EINVAL; } cc->key_parts = cc->tfms_count; + cc->key_extra_size = 0; cc->cipher = kstrdup(cipher, GFP_KERNEL); if (!cc->cipher) @@ -1460,13 +1640,6 @@ static int crypt_ctr_cipher(struct dm_target *ti, goto bad; } - /* Initialize and set key */ - ret = crypt_set_key(cc, key); - if (ret < 0) { - ti->error = "Error decoding and setting key"; - goto bad; - } - /* Initialize IV */ cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc)); if (cc->iv_size) @@ -1493,18 +1666,33 @@ static int crypt_ctr_cipher(struct dm_target *ti, cc->iv_gen_ops = &crypt_iv_null_ops; else if (strcmp(ivmode, "lmk") == 0) { cc->iv_gen_ops = &crypt_iv_lmk_ops; - /* Version 2 and 3 is recognised according + /* + * Version 2 and 3 is recognised according * to length of provided multi-key string. * If present (version 3), last key is used as IV seed. + * All keys (including IV seed) are always the same size. */ - if (cc->key_size % cc->key_parts) + if (cc->key_size % cc->key_parts) { cc->key_parts++; + cc->key_extra_size = cc->key_size / cc->key_parts; + } + } else if (strcmp(ivmode, "tcw") == 0) { + cc->iv_gen_ops = &crypt_iv_tcw_ops; + cc->key_parts += 2; /* IV + whitening */ + cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE; } else { ret = -EINVAL; ti->error = "Invalid IV mode"; goto bad; } + /* Initialize and set key */ + ret = crypt_set_key(cc, key); + if (ret < 0) { + ti->error = "Error decoding and setting key"; + goto bad; + } + /* Allocate IV */ if (cc->iv_gen_ops && cc->iv_gen_ops->ctr) { ret = cc->iv_gen_ops->ctr(cc, ti, ivopts); @@ -1817,7 +2005,7 @@ static int crypt_iterate_devices(struct dm_target *ti, static struct target_type crypt_target = { .name = "crypt", - .version = {1, 12, 1}, + .version = {1, 13, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index afe08146f73e..51521429fb59 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -57,7 +57,7 @@ struct vers_iter { static struct list_head _name_buckets[NUM_BUCKETS]; static struct list_head _uuid_buckets[NUM_BUCKETS]; -static void dm_hash_remove_all(int keep_open_devices); +static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred); /* * Guards access to both hash tables. @@ -86,7 +86,7 @@ static int dm_hash_init(void) static void dm_hash_exit(void) { - dm_hash_remove_all(0); + dm_hash_remove_all(false, false, false); } /*----------------------------------------------------------------- @@ -276,7 +276,7 @@ static struct dm_table *__hash_remove(struct hash_cell *hc) return table; } -static void dm_hash_remove_all(int keep_open_devices) +static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred) { int i, dev_skipped; struct hash_cell *hc; @@ -293,7 +293,8 @@ retry: md = hc->md; dm_get(md); - if (keep_open_devices && dm_lock_for_deletion(md)) { + if (keep_open_devices && + dm_lock_for_deletion(md, mark_deferred, only_deferred)) { dm_put(md); dev_skipped++; continue; @@ -450,6 +451,11 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param, return md; } +void dm_deferred_remove(void) +{ + dm_hash_remove_all(true, false, true); +} + /*----------------------------------------------------------------- * Implementation of the ioctl commands *---------------------------------------------------------------*/ @@ -461,7 +467,7 @@ typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size); static int remove_all(struct dm_ioctl *param, size_t param_size) { - dm_hash_remove_all(1); + dm_hash_remove_all(true, !!(param->flags & DM_DEFERRED_REMOVE), false); param->data_size = 0; return 0; } @@ -683,6 +689,9 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) if (dm_suspended_md(md)) param->flags |= DM_SUSPEND_FLAG; + if (dm_test_deferred_remove_flag(md)) + param->flags |= DM_DEFERRED_REMOVE; + param->dev = huge_encode_dev(disk_devt(disk)); /* @@ -832,8 +841,13 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) /* * Ensure the device is not open and nothing further can open it. */ - r = dm_lock_for_deletion(md); + r = dm_lock_for_deletion(md, !!(param->flags & DM_DEFERRED_REMOVE), false); if (r) { + if (r == -EBUSY && param->flags & DM_DEFERRED_REMOVE) { + up_write(&_hash_lock); + dm_put(md); + return 0; + } DMDEBUG_LIMIT("unable to remove open device %s", hc->name); up_write(&_hash_lock); dm_put(md); @@ -848,6 +862,8 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size) dm_table_destroy(t); } + param->flags &= ~DM_DEFERRED_REMOVE; + if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr)) param->flags |= DM_UEVENT_GENERATED_FLAG; @@ -1469,6 +1485,14 @@ static int message_for_md(struct mapped_device *md, unsigned argc, char **argv, if (**argv != '@') return 2; /* no '@' prefix, deliver to target */ + if (!strcasecmp(argv[0], "@cancel_deferred_remove")) { + if (argc != 1) { + DMERR("Invalid arguments for @cancel_deferred_remove"); + return -EINVAL; + } + return dm_cancel_deferred_remove(md); + } + r = dm_stats_message(md, argc, argv, result, maxlen); if (r < 2) return r; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index de570a558764..6eb9dc9ef8f3 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -87,6 +87,7 @@ struct multipath { unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */ unsigned saved_queue_if_no_path:1; /* Saved state during suspension */ unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */ + unsigned pg_init_disabled:1; /* pg_init is not currently allowed */ unsigned pg_init_retries; /* Number of times to retry pg_init */ unsigned pg_init_count; /* Number of times pg_init called */ @@ -390,13 +391,16 @@ static int map_io(struct multipath *m, struct request *clone, if (was_queued) m->queue_size--; - if ((pgpath && m->queue_io) || - (!pgpath && m->queue_if_no_path)) { + if (m->pg_init_required) { + if (!m->pg_init_in_progress) + queue_work(kmultipathd, &m->process_queued_ios); + r = DM_MAPIO_REQUEUE; + } else if ((pgpath && m->queue_io) || + (!pgpath && m->queue_if_no_path)) { /* Queue for the daemon to resubmit */ list_add_tail(&clone->queuelist, &m->queued_ios); m->queue_size++; - if ((m->pg_init_required && !m->pg_init_in_progress) || - !m->queue_io) + if (!m->queue_io) queue_work(kmultipathd, &m->process_queued_ios); pgpath = NULL; r = DM_MAPIO_SUBMITTED; @@ -497,7 +501,8 @@ static void process_queued_ios(struct work_struct *work) (!pgpath && !m->queue_if_no_path)) must_queue = 0; - if (m->pg_init_required && !m->pg_init_in_progress && pgpath) + if (m->pg_init_required && !m->pg_init_in_progress && pgpath && + !m->pg_init_disabled) __pg_init_all_paths(m); spin_unlock_irqrestore(&m->lock, flags); @@ -942,10 +947,20 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m) static void flush_multipath_work(struct multipath *m) { + unsigned long flags; + + spin_lock_irqsave(&m->lock, flags); + m->pg_init_disabled = 1; + spin_unlock_irqrestore(&m->lock, flags); + flush_workqueue(kmpath_handlerd); multipath_wait_for_pg_init_completion(m); flush_workqueue(kmultipathd); flush_work(&m->trigger_event); + + spin_lock_irqsave(&m->lock, flags); + m->pg_init_disabled = 0; + spin_unlock_irqrestore(&m->lock, flags); } static void multipath_dtr(struct dm_target *ti) @@ -1164,7 +1179,7 @@ static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) spin_lock_irqsave(&m->lock, flags); - if (m->pg_init_count <= m->pg_init_retries) + if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled) m->pg_init_required = 1; else limit_reached = 1; @@ -1665,6 +1680,11 @@ static int multipath_busy(struct dm_target *ti) spin_lock_irqsave(&m->lock, flags); + /* pg_init in progress, requeue until done */ + if (m->pg_init_in_progress) { + busy = 1; + goto out; + } /* Guess which priority_group will be used at next mapping time */ if (unlikely(!m->current_pgpath && m->next_pg)) pg = m->next_pg; @@ -1714,7 +1734,7 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 5, 1}, + .version = {1, 6, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 8f8783533ac7..465f08ca62b1 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -545,14 +545,28 @@ static int adjoin(struct dm_table *table, struct dm_target *ti) /* * Used to dynamically allocate the arg array. + * + * We do first allocation with GFP_NOIO because dm-mpath and dm-thin must + * process messages even if some device is suspended. These messages have a + * small fixed number of arguments. + * + * On the other hand, dm-switch needs to process bulk data using messages and + * excessive use of GFP_NOIO could cause trouble. */ static char **realloc_argv(unsigned *array_size, char **old_argv) { char **argv; unsigned new_size; + gfp_t gfp; - new_size = *array_size ? *array_size * 2 : 64; - argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL); + if (*array_size) { + new_size = *array_size * 2; + gfp = GFP_KERNEL; + } else { + new_size = 8; + gfp = GFP_NOIO; + } + argv = kmalloc(new_size * sizeof(*argv), gfp); if (argv) { memcpy(argv, old_argv, *array_size * sizeof(*argv)); *array_size = new_size; @@ -1548,8 +1562,11 @@ int dm_table_resume_targets(struct dm_table *t) continue; r = ti->type->preresume(ti); - if (r) + if (r) { + DMERR("%s: %s: preresume failed, error = %d", + dm_device_name(t->md), ti->type->name, r); return r; + } } for (i = 0; i < t->num_targets; i++) { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b3e26c7d1417..0704c523a76b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -49,6 +49,11 @@ static unsigned int _major = 0; static DEFINE_IDR(_minor_idr); static DEFINE_SPINLOCK(_minor_lock); + +static void do_deferred_remove(struct work_struct *w); + +static DECLARE_WORK(deferred_remove_work, do_deferred_remove); + /* * For bio-based dm. * One of these is allocated per bio. @@ -116,6 +121,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); #define DMF_DELETING 4 #define DMF_NOFLUSH_SUSPENDING 5 #define DMF_MERGE_IS_OPTIONAL 6 +#define DMF_DEFERRED_REMOVE 7 /* * A dummy definition to make RCU happy. @@ -299,6 +305,8 @@ out_free_io_cache: static void local_exit(void) { + flush_scheduled_work(); + kmem_cache_destroy(_rq_tio_cache); kmem_cache_destroy(_io_cache); unregister_blkdev(_major, _name); @@ -404,7 +412,10 @@ static void dm_blk_close(struct gendisk *disk, fmode_t mode) spin_lock(&_minor_lock); - atomic_dec(&md->open_count); + if (atomic_dec_and_test(&md->open_count) && + (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) + schedule_work(&deferred_remove_work); + dm_put(md); spin_unlock(&_minor_lock); @@ -418,14 +429,18 @@ int dm_open_count(struct mapped_device *md) /* * Guarantees nothing is using the device before it's deleted. */ -int dm_lock_for_deletion(struct mapped_device *md) +int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred) { int r = 0; spin_lock(&_minor_lock); - if (dm_open_count(md)) + if (dm_open_count(md)) { r = -EBUSY; + if (mark_deferred) + set_bit(DMF_DEFERRED_REMOVE, &md->flags); + } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) + r = -EEXIST; else set_bit(DMF_DELETING, &md->flags); @@ -434,6 +449,27 @@ int dm_lock_for_deletion(struct mapped_device *md) return r; } +int dm_cancel_deferred_remove(struct mapped_device *md) +{ + int r = 0; + + spin_lock(&_minor_lock); + + if (test_bit(DMF_DELETING, &md->flags)) + r = -EBUSY; + else + clear_bit(DMF_DEFERRED_REMOVE, &md->flags); + + spin_unlock(&_minor_lock); + + return r; +} + +static void do_deferred_remove(struct work_struct *w) +{ + dm_deferred_remove(); +} + sector_t dm_get_size(struct mapped_device *md) { return get_capacity(md->disk); @@ -2894,6 +2930,11 @@ int dm_suspended_md(struct mapped_device *md) return test_bit(DMF_SUSPENDED, &md->flags); } +int dm_test_deferred_remove_flag(struct mapped_device *md) +{ + return test_bit(DMF_DEFERRED_REMOVE, &md->flags); +} + int dm_suspended(struct dm_target *ti) { return dm_suspended_md(dm_table_get_md(ti->table)); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 1d1ad7b7e527..c57ba550f69e 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -129,6 +129,16 @@ int dm_deleting_md(struct mapped_device *md); int dm_suspended_md(struct mapped_device *md); /* + * Test if the device is scheduled for deferred remove. + */ +int dm_test_deferred_remove_flag(struct mapped_device *md); + +/* + * Try to remove devices marked for deferred removal. + */ +void dm_deferred_remove(void); + +/* * The device-mapper can be driven through one of two interfaces; * ioctl or filesystem, depending which patch you have applied. */ @@ -158,7 +168,8 @@ void dm_stripe_exit(void); void dm_destroy(struct mapped_device *md); void dm_destroy_immediate(struct mapped_device *md); int dm_open_count(struct mapped_device *md); -int dm_lock_for_deletion(struct mapped_device *md); +int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred); +int dm_cancel_deferred_remove(struct mapped_device *md); int dm_request_based(struct mapped_device *md); sector_t dm_get_size(struct mapped_device *md); struct dm_stats *dm_get_stats(struct mapped_device *md); diff --git a/drivers/md/md.c b/drivers/md/md.c index 752119068d66..21f4d7ff0da2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -112,7 +112,7 @@ static inline int speed_max(struct mddev *mddev) static struct ctl_table_header *raid_table_header; -static ctl_table raid_table[] = { +static struct ctl_table raid_table[] = { { .procname = "speed_limit_min", .data = &sysctl_speed_limit_min, @@ -130,7 +130,7 @@ static ctl_table raid_table[] = { { } }; -static ctl_table raid_dir_table[] = { +static struct ctl_table raid_dir_table[] = { { .procname = "raid", .maxlen = 0, @@ -140,7 +140,7 @@ static ctl_table raid_dir_table[] = { { } }; -static ctl_table raid_root_table[] = { +static struct ctl_table raid_root_table[] = { { .procname = "dev", .maxlen = 0, @@ -562,11 +562,19 @@ static struct mddev * mddev_find(dev_t unit) goto retry; } -static inline int mddev_lock(struct mddev * mddev) +static inline int __must_check mddev_lock(struct mddev * mddev) { return mutex_lock_interruptible(&mddev->reconfig_mutex); } +/* Sometimes we need to take the lock in a situation where + * failure due to interrupts is not acceptable. + */ +static inline void mddev_lock_nointr(struct mddev * mddev) +{ + mutex_lock(&mddev->reconfig_mutex); +} + static inline int mddev_is_locked(struct mddev *mddev) { return mutex_is_locked(&mddev->reconfig_mutex); @@ -768,16 +776,10 @@ void md_super_wait(struct mddev *mddev) finish_wait(&mddev->sb_wait, &wq); } -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion*)bio->bi_private); -} - int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int rw, bool metadata_op) { struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); - struct completion event; int ret; rw |= REQ_SYNC; @@ -793,11 +795,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, else bio->bi_sector = sector + rdev->data_offset; bio_add_page(bio, page, size, 0); - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(&event); + submit_bio_wait(rw, bio); ret = test_bit(BIO_UPTODATE, &bio->bi_flags); bio_put(bio); @@ -2978,7 +2976,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) for_each_mddev(mddev, tmp) { struct md_rdev *rdev2; - mddev_lock(mddev); + mddev_lock_nointr(mddev); rdev_for_each(rdev2, mddev) if (rdev->bdev == rdev2->bdev && rdev != rdev2 && @@ -2994,7 +2992,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) break; } } - mddev_lock(my_mddev); + mddev_lock_nointr(my_mddev); if (overlap) { /* Someone else could have slipped in a size * change here, but doing so is just silly. @@ -3515,7 +3513,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) printk(KERN_WARNING "md: cannot register extra attributes for %s\n", mdname(mddev)); - mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action"); + mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); } if (mddev->pers->sync_request != NULL && pers->sync_request == NULL) { @@ -3580,6 +3578,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->in_sync = 1; del_timer_sync(&mddev->safemode_timer); } + blk_set_stacking_limits(&mddev->queue->limits); pers->run(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); mddev_resume(mddev); @@ -5258,7 +5257,7 @@ static void __md_stop_writes(struct mddev *mddev) void md_stop_writes(struct mddev *mddev) { - mddev_lock(mddev); + mddev_lock_nointr(mddev); __md_stop_writes(mddev); mddev_unlock(mddev); } @@ -5291,20 +5290,35 @@ EXPORT_SYMBOL_GPL(md_stop); static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) { int err = 0; + int did_freeze = 0; + + if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { + did_freeze = 1; + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + /* Thread might be blocked waiting for metadata update + * which will now never happen */ + wake_up_process(mddev->sync_thread->tsk); + } + mddev_unlock(mddev); + wait_event(resync_wait, mddev->sync_thread == NULL); + mddev_lock_nointr(mddev); + mutex_lock(&mddev->open_mutex); - if (atomic_read(&mddev->openers) > !!bdev) { + if (atomic_read(&mddev->openers) > !!bdev || + mddev->sync_thread || + (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { printk("md: %s still in use.\n",mdname(mddev)); + if (did_freeze) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } err = -EBUSY; goto out; } - if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) { - /* Someone opened the device since we flushed it - * so page cache could be dirty and it is too late - * to flush. So abort - */ - mutex_unlock(&mddev->open_mutex); - return -EBUSY; - } if (mddev->pers) { __md_stop_writes(mddev); @@ -5315,7 +5329,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) set_disk_ro(mddev->gendisk, 1); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); sysfs_notify_dirent_safe(mddev->sysfs_state); - err = 0; + err = 0; } out: mutex_unlock(&mddev->open_mutex); @@ -5331,20 +5345,34 @@ static int do_md_stop(struct mddev * mddev, int mode, { struct gendisk *disk = mddev->gendisk; struct md_rdev *rdev; + int did_freeze = 0; + + if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { + did_freeze = 1; + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + /* Thread might be blocked waiting for metadata update + * which will now never happen */ + wake_up_process(mddev->sync_thread->tsk); + } + mddev_unlock(mddev); + wait_event(resync_wait, mddev->sync_thread == NULL); + mddev_lock_nointr(mddev); mutex_lock(&mddev->open_mutex); if (atomic_read(&mddev->openers) > !!bdev || - mddev->sysfs_active) { + mddev->sysfs_active || + mddev->sync_thread || + (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { printk("md: %s still in use.\n",mdname(mddev)); mutex_unlock(&mddev->open_mutex); - return -EBUSY; - } - if (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags)) { - /* Someone opened the device since we flushed it - * so page cache could be dirty and it is too late - * to flush. So abort - */ - mutex_unlock(&mddev->open_mutex); + if (did_freeze) { + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } return -EBUSY; } if (mddev->pers) { @@ -6551,7 +6579,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_DEVS, &mddev->flags) && !test_bit(MD_CHANGE_PENDING, &mddev->flags)); - mddev_lock(mddev); + mddev_lock_nointr(mddev); } } else { err = -EROFS; @@ -7361,9 +7389,6 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync = 2; try_again: - if (kthread_should_stop()) - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) goto skip; for_each_mddev(mddev2, tmp) { @@ -7388,7 +7413,7 @@ void md_do_sync(struct md_thread *thread) * be caught by 'softlockup' */ prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); - if (!kthread_should_stop() && + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev2->curr_resync >= mddev->curr_resync) { printk(KERN_INFO "md: delaying %s of %s" " until %s has finished (they" @@ -7464,7 +7489,7 @@ void md_do_sync(struct md_thread *thread) last_check = 0; if (j>2) { - printk(KERN_INFO + printk(KERN_INFO "md: resuming %s of %s from checkpoint.\n", desc, mdname(mddev)); mddev->curr_resync = j; @@ -7501,7 +7526,8 @@ void md_do_sync(struct md_thread *thread) sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } - while (j >= mddev->resync_max && !kthread_should_stop()) { + while (j >= mddev->resync_max && + !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* As this condition is controlled by user-space, * we can block indefinitely, so use '_interruptible' * to avoid triggering warnings. @@ -7509,17 +7535,18 @@ void md_do_sync(struct md_thread *thread) flush_signals(current); /* just in case */ wait_event_interruptible(mddev->recovery_wait, mddev->resync_max > j - || kthread_should_stop()); + || test_bit(MD_RECOVERY_INTR, + &mddev->recovery)); } - if (kthread_should_stop()) - goto interrupted; + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; sectors = mddev->pers->sync_request(mddev, j, &skipped, currspeed < speed_min(mddev)); if (sectors == 0) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - goto out; + break; } if (!skipped) { /* actual IO requested */ @@ -7556,10 +7583,8 @@ void md_do_sync(struct md_thread *thread) last_mark = next; } - - if (kthread_should_stop()) - goto interrupted; - + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + break; /* * this loop exits only if either when we are slower than @@ -7582,11 +7607,12 @@ void md_do_sync(struct md_thread *thread) } } } - printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); + printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc, + test_bit(MD_RECOVERY_INTR, &mddev->recovery) + ? "interrupted" : "done"); /* * this also signals 'finished resyncing' to md_stop */ - out: blk_finish_plug(&plug); wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); @@ -7640,16 +7666,6 @@ void md_do_sync(struct md_thread *thread) set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); return; - - interrupted: - /* - * got a signal, exit. - */ - printk(KERN_INFO - "md: md_do_sync() got signal ... exiting\n"); - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - goto out; - } EXPORT_SYMBOL_GPL(md_do_sync); @@ -7751,7 +7767,7 @@ void md_check_recovery(struct mddev *mddev) if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return; if ( ! ( - (mddev->flags & ~ (1<<MD_CHANGE_PENDING)) || + (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || (mddev->external == 0 && mddev->safemode == 1) || @@ -7894,6 +7910,7 @@ void md_reap_sync_thread(struct mddev *mddev) /* resync has finished, collect result */ md_unregister_thread(&mddev->sync_thread); + wake_up(&resync_wait); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* success...*/ diff --git a/drivers/md/md.h b/drivers/md/md.h index c96456c16700..2f5cc8a7ef3e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -501,7 +501,7 @@ extern struct attribute_group md_bitmap_group; static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name) { if (sd) - return sysfs_get_dirent(sd, NULL, name); + return sysfs_get_dirent(sd, name); return sd; } static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd) diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 172147eb1d40..af96e24ec328 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -509,15 +509,18 @@ static int grow_add_tail_block(struct resize *resize) static int grow_needs_more_blocks(struct resize *resize) { int r; + unsigned old_nr_blocks = resize->old_nr_full_blocks; if (resize->old_nr_entries_in_last_block > 0) { + old_nr_blocks++; + r = grow_extend_tail_block(resize, resize->max_entries); if (r) return r; } r = insert_full_ablocks(resize->info, resize->size_of_block, - resize->old_nr_full_blocks, + old_nr_blocks, resize->new_nr_full_blocks, resize->max_entries, resize->value, &resize->root); diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index e735a6d5a793..cfbf9617e465 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -140,26 +140,10 @@ static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b) static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b) { - int r; - uint32_t old_count; enum allocation_event ev; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - r = sm_ll_dec(&smd->ll, b, &ev); - if (!r && (ev == SM_FREE)) { - /* - * It's only free if it's also free in the last - * transaction. - */ - r = sm_ll_lookup(&smd->old_ll, b, &old_count); - if (r) - return r; - - if (!old_count) - smd->nr_allocated_this_transaction--; - } - - return r; + return sm_ll_dec(&smd->ll, b, &ev); } static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index af6681b19776..1e5a540995e9 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -66,7 +66,8 @@ */ static int max_queued_requests = 1024; -static void allow_barrier(struct r1conf *conf); +static void allow_barrier(struct r1conf *conf, sector_t start_next_window, + sector_t bi_sector); static void lower_barrier(struct r1conf *conf); static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) @@ -84,10 +85,12 @@ static void r1bio_pool_free(void *r1_bio, void *data) } #define RESYNC_BLOCK_SIZE (64*1024) -//#define RESYNC_BLOCK_SIZE PAGE_SIZE +#define RESYNC_DEPTH 32 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) -#define RESYNC_WINDOW (2048*1024) +#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) +#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) +#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) { @@ -225,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio) struct bio *bio = r1_bio->master_bio; int done; struct r1conf *conf = r1_bio->mddev->private; + sector_t start_next_window = r1_bio->start_next_window; + sector_t bi_sector = bio->bi_sector; if (bio->bi_phys_segments) { unsigned long flags; @@ -232,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio) bio->bi_phys_segments--; done = (bio->bi_phys_segments == 0); spin_unlock_irqrestore(&conf->device_lock, flags); + /* + * make_request() might be waiting for + * bi_phys_segments to decrease + */ + wake_up(&conf->wait_barrier); } else done = 1; @@ -243,7 +253,7 @@ static void call_bio_endio(struct r1bio *r1_bio) * Wake up any possible resync thread that waits for the device * to go idle. */ - allow_barrier(conf); + allow_barrier(conf, start_next_window, bi_sector); } } @@ -814,8 +824,6 @@ static void flush_pending_writes(struct r1conf *conf) * there is no normal IO happeing. It must arrange to call * lower_barrier when the particular background IO completes. */ -#define RESYNC_DEPTH 32 - static void raise_barrier(struct r1conf *conf) { spin_lock_irq(&conf->resync_lock); @@ -827,9 +835,19 @@ static void raise_barrier(struct r1conf *conf) /* block any new IO from starting */ conf->barrier++; - /* Now wait for all pending IO to complete */ + /* For these conditions we must wait: + * A: while the array is in frozen state + * B: while barrier >= RESYNC_DEPTH, meaning resync reach + * the max count which allowed. + * C: next_resync + RESYNC_SECTORS > start_next_window, meaning + * next resync will reach to the window which normal bios are + * handling. + */ wait_event_lock_irq(conf->wait_barrier, - !conf->nr_pending && conf->barrier < RESYNC_DEPTH, + !conf->array_frozen && + conf->barrier < RESYNC_DEPTH && + (conf->start_next_window >= + conf->next_resync + RESYNC_SECTORS), conf->resync_lock); spin_unlock_irq(&conf->resync_lock); @@ -845,10 +863,33 @@ static void lower_barrier(struct r1conf *conf) wake_up(&conf->wait_barrier); } -static void wait_barrier(struct r1conf *conf) +static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) { + bool wait = false; + + if (conf->array_frozen || !bio) + wait = true; + else if (conf->barrier && bio_data_dir(bio) == WRITE) { + if (conf->next_resync < RESYNC_WINDOW_SECTORS) + wait = true; + else if ((conf->next_resync - RESYNC_WINDOW_SECTORS + >= bio_end_sector(bio)) || + (conf->next_resync + NEXT_NORMALIO_DISTANCE + <= bio->bi_sector)) + wait = false; + else + wait = true; + } + + return wait; +} + +static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) +{ + sector_t sector = 0; + spin_lock_irq(&conf->resync_lock); - if (conf->barrier) { + if (need_to_wait_for_sync(conf, bio)) { conf->nr_waiting++; /* Wait for the barrier to drop. * However if there are already pending @@ -860,22 +901,67 @@ static void wait_barrier(struct r1conf *conf) * count down. */ wait_event_lock_irq(conf->wait_barrier, - !conf->barrier || - (conf->nr_pending && + !conf->array_frozen && + (!conf->barrier || + ((conf->start_next_window < + conf->next_resync + RESYNC_SECTORS) && current->bio_list && - !bio_list_empty(current->bio_list)), + !bio_list_empty(current->bio_list))), conf->resync_lock); conf->nr_waiting--; } + + if (bio && bio_data_dir(bio) == WRITE) { + if (conf->next_resync + NEXT_NORMALIO_DISTANCE + <= bio->bi_sector) { + if (conf->start_next_window == MaxSector) + conf->start_next_window = + conf->next_resync + + NEXT_NORMALIO_DISTANCE; + + if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) + <= bio->bi_sector) + conf->next_window_requests++; + else + conf->current_window_requests++; + } + if (bio->bi_sector >= conf->start_next_window) + sector = conf->start_next_window; + } + conf->nr_pending++; spin_unlock_irq(&conf->resync_lock); + return sector; } -static void allow_barrier(struct r1conf *conf) +static void allow_barrier(struct r1conf *conf, sector_t start_next_window, + sector_t bi_sector) { unsigned long flags; + spin_lock_irqsave(&conf->resync_lock, flags); conf->nr_pending--; + if (start_next_window) { + if (start_next_window == conf->start_next_window) { + if (conf->start_next_window + NEXT_NORMALIO_DISTANCE + <= bi_sector) + conf->next_window_requests--; + else + conf->current_window_requests--; + } else + conf->current_window_requests--; + + if (!conf->current_window_requests) { + if (conf->next_window_requests) { + conf->current_window_requests = + conf->next_window_requests; + conf->next_window_requests = 0; + conf->start_next_window += + NEXT_NORMALIO_DISTANCE; + } else + conf->start_next_window = MaxSector; + } + } spin_unlock_irqrestore(&conf->resync_lock, flags); wake_up(&conf->wait_barrier); } @@ -884,8 +970,7 @@ static void freeze_array(struct r1conf *conf, int extra) { /* stop syncio and normal IO and wait for everything to * go quite. - * We increment barrier and nr_waiting, and then - * wait until nr_pending match nr_queued+extra + * We wait until nr_pending match nr_queued+extra * This is called in the context of one normal IO request * that has failed. Thus any sync request that might be pending * will be blocked by nr_pending, and we need to wait for @@ -895,8 +980,7 @@ static void freeze_array(struct r1conf *conf, int extra) * we continue. */ spin_lock_irq(&conf->resync_lock); - conf->barrier++; - conf->nr_waiting++; + conf->array_frozen = 1; wait_event_lock_irq_cmd(conf->wait_barrier, conf->nr_pending == conf->nr_queued+extra, conf->resync_lock, @@ -907,8 +991,7 @@ static void unfreeze_array(struct r1conf *conf) { /* reverse the effect of the freeze */ spin_lock_irq(&conf->resync_lock); - conf->barrier--; - conf->nr_waiting--; + conf->array_frozen = 0; wake_up(&conf->wait_barrier); spin_unlock_irq(&conf->resync_lock); } @@ -1013,6 +1096,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) int first_clone; int sectors_handled; int max_sectors; + sector_t start_next_window; /* * Register the new request and wait if the reconstruction @@ -1042,7 +1126,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) finish_wait(&conf->wait_barrier, &w); } - wait_barrier(conf); + start_next_window = wait_barrier(conf, bio); bitmap = mddev->bitmap; @@ -1163,6 +1247,7 @@ read_again: disks = conf->raid_disks * 2; retry_write: + r1_bio->start_next_window = start_next_window; blocked_rdev = NULL; rcu_read_lock(); max_sectors = r1_bio->sectors; @@ -1231,14 +1316,24 @@ read_again: if (unlikely(blocked_rdev)) { /* Wait for this device to become unblocked */ int j; + sector_t old = start_next_window; for (j = 0; j < i; j++) if (r1_bio->bios[j]) rdev_dec_pending(conf->mirrors[j].rdev, mddev); r1_bio->state = 0; - allow_barrier(conf); + allow_barrier(conf, start_next_window, bio->bi_sector); md_wait_for_blocked_rdev(blocked_rdev, mddev); - wait_barrier(conf); + start_next_window = wait_barrier(conf, bio); + /* + * We must make sure the multi r1bios of bio have + * the same value of bi_phys_segments + */ + if (bio->bi_phys_segments && old && + old != start_next_window) + /* Wait for the former r1bio(s) to complete */ + wait_event(conf->wait_barrier, + bio->bi_phys_segments == 1); goto retry_write; } @@ -1438,11 +1533,14 @@ static void print_conf(struct r1conf *conf) static void close_sync(struct r1conf *conf) { - wait_barrier(conf); - allow_barrier(conf); + wait_barrier(conf, NULL); + allow_barrier(conf, 0, 0); mempool_destroy(conf->r1buf_pool); conf->r1buf_pool = NULL; + + conf->next_resync = 0; + conf->start_next_window = MaxSector; } static int raid1_spare_active(struct mddev *mddev) @@ -2714,6 +2812,9 @@ static struct r1conf *setup_conf(struct mddev *mddev) conf->pending_count = 0; conf->recovery_disabled = mddev->recovery_disabled - 1; + conf->start_next_window = MaxSector; + conf->current_window_requests = conf->next_window_requests = 0; + err = -EIO; for (i = 0; i < conf->raid_disks * 2; i++) { @@ -2871,8 +2972,8 @@ static int stop(struct mddev *mddev) atomic_read(&bitmap->behind_writes) == 0); } - raise_barrier(conf); - lower_barrier(conf); + freeze_array(conf, 0); + unfreeze_array(conf); md_unregister_thread(&mddev->thread); if (conf->r1bio_pool) @@ -3031,10 +3132,10 @@ static void raid1_quiesce(struct mddev *mddev, int state) wake_up(&conf->wait_barrier); break; case 1: - raise_barrier(conf); + freeze_array(conf, 0); break; case 0: - lower_barrier(conf); + unfreeze_array(conf); break; } } @@ -3051,7 +3152,8 @@ static void *raid1_takeover(struct mddev *mddev) mddev->new_chunk_sectors = 0; conf = setup_conf(mddev); if (!IS_ERR(conf)) - conf->barrier = 1; + /* Array must appear to be quiesced */ + conf->array_frozen = 1; return conf; } return ERR_PTR(-EINVAL); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 0ff3715fb7eb..9bebca7bff2f 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -41,6 +41,19 @@ struct r1conf { */ sector_t next_resync; + /* When raid1 starts resync, we divide array into four partitions + * |---------|--------------|---------------------|-------------| + * next_resync start_next_window end_window + * start_next_window = next_resync + NEXT_NORMALIO_DISTANCE + * end_window = start_next_window + NEXT_NORMALIO_DISTANCE + * current_window_requests means the count of normalIO between + * start_next_window and end_window. + * next_window_requests means the count of normalIO after end_window. + * */ + sector_t start_next_window; + int current_window_requests; + int next_window_requests; + spinlock_t device_lock; /* list of 'struct r1bio' that need to be processed by raid1d, @@ -65,6 +78,7 @@ struct r1conf { int nr_waiting; int nr_queued; int barrier; + int array_frozen; /* Set to 1 if a full sync is needed, (fresh device added). * Cleared when a sync completes. @@ -111,6 +125,7 @@ struct r1bio { * in this BehindIO request */ sector_t sector; + sector_t start_next_window; int sectors; unsigned long state; struct mddev *mddev; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7c3508abb5e1..c504e8389e69 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4384,7 +4384,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, mddev->flags == 0 || - kthread_should_stop()); + test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + allow_barrier(conf); + return sectors_done; + } conf->reshape_safe = mddev->reshape_position; allow_barrier(conf); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f8b906843926..cc055da02e2a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -85,6 +85,42 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) return &conf->stripe_hashtbl[hash]; } +static inline int stripe_hash_locks_hash(sector_t sect) +{ + return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; +} + +static inline void lock_device_hash_lock(struct r5conf *conf, int hash) +{ + spin_lock_irq(conf->hash_locks + hash); + spin_lock(&conf->device_lock); +} + +static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) +{ + spin_unlock(&conf->device_lock); + spin_unlock_irq(conf->hash_locks + hash); +} + +static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) +{ + int i; + local_irq_disable(); + spin_lock(conf->hash_locks); + for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) + spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); + spin_lock(&conf->device_lock); +} + +static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) +{ + int i; + spin_unlock(&conf->device_lock); + for (i = NR_STRIPE_HASH_LOCKS; i; i--) + spin_unlock(conf->hash_locks + i - 1); + local_irq_enable(); +} + /* bio's attached to a stripe+device for I/O are linked together in bi_sector * order without overlap. There may be several bio's per stripe+device, and * a bio could span several devices. @@ -249,7 +285,8 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh) } } -static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) +static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, + struct list_head *temp_inactive_list) { BUG_ON(!list_empty(&sh->lru)); BUG_ON(atomic_read(&conf->active_stripes)==0); @@ -278,37 +315,68 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) < IO_THRESHOLD) md_wakeup_thread(conf->mddev->thread); atomic_dec(&conf->active_stripes); - if (!test_bit(STRIPE_EXPANDING, &sh->state)) { - list_add_tail(&sh->lru, &conf->inactive_list); - wake_up(&conf->wait_for_stripe); - if (conf->retry_read_aligned) - md_wakeup_thread(conf->mddev->thread); - } + if (!test_bit(STRIPE_EXPANDING, &sh->state)) + list_add_tail(&sh->lru, temp_inactive_list); } } -static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) +static void __release_stripe(struct r5conf *conf, struct stripe_head *sh, + struct list_head *temp_inactive_list) { if (atomic_dec_and_test(&sh->count)) - do_release_stripe(conf, sh); + do_release_stripe(conf, sh, temp_inactive_list); } -static struct llist_node *llist_reverse_order(struct llist_node *head) +/* + * @hash could be NR_STRIPE_HASH_LOCKS, then we have a list of inactive_list + * + * Be careful: Only one task can add/delete stripes from temp_inactive_list at + * given time. Adding stripes only takes device lock, while deleting stripes + * only takes hash lock. + */ +static void release_inactive_stripe_list(struct r5conf *conf, + struct list_head *temp_inactive_list, + int hash) { - struct llist_node *new_head = NULL; + int size; + bool do_wakeup = false; + unsigned long flags; - while (head) { - struct llist_node *tmp = head; - head = head->next; - tmp->next = new_head; - new_head = tmp; + if (hash == NR_STRIPE_HASH_LOCKS) { + size = NR_STRIPE_HASH_LOCKS; + hash = NR_STRIPE_HASH_LOCKS - 1; + } else + size = 1; + while (size) { + struct list_head *list = &temp_inactive_list[size - 1]; + + /* + * We don't hold any lock here yet, get_active_stripe() might + * remove stripes from the list + */ + if (!list_empty_careful(list)) { + spin_lock_irqsave(conf->hash_locks + hash, flags); + if (list_empty(conf->inactive_list + hash) && + !list_empty(list)) + atomic_dec(&conf->empty_inactive_list_nr); + list_splice_tail_init(list, conf->inactive_list + hash); + do_wakeup = true; + spin_unlock_irqrestore(conf->hash_locks + hash, flags); + } + size--; + hash--; } - return new_head; + if (do_wakeup) { + wake_up(&conf->wait_for_stripe); + if (conf->retry_read_aligned) + md_wakeup_thread(conf->mddev->thread); + } } /* should hold conf->device_lock already */ -static int release_stripe_list(struct r5conf *conf) +static int release_stripe_list(struct r5conf *conf, + struct list_head *temp_inactive_list) { struct stripe_head *sh; int count = 0; @@ -317,6 +385,8 @@ static int release_stripe_list(struct r5conf *conf) head = llist_del_all(&conf->released_stripes); head = llist_reverse_order(head); while (head) { + int hash; + sh = llist_entry(head, struct stripe_head, release_list); head = llist_next(head); /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ @@ -327,7 +397,8 @@ static int release_stripe_list(struct r5conf *conf) * again, the count is always > 1. This is true for * STRIPE_ON_UNPLUG_LIST bit too. */ - __release_stripe(conf, sh); + hash = sh->hash_lock_index; + __release_stripe(conf, sh, &temp_inactive_list[hash]); count++; } @@ -338,9 +409,12 @@ static void release_stripe(struct stripe_head *sh) { struct r5conf *conf = sh->raid_conf; unsigned long flags; + struct list_head list; + int hash; bool wakeup; - if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) + if (unlikely(!conf->mddev->thread) || + test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) goto slow_path; wakeup = llist_add(&sh->release_list, &conf->released_stripes); if (wakeup) @@ -350,8 +424,11 @@ slow_path: local_irq_save(flags); /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { - do_release_stripe(conf, sh); + INIT_LIST_HEAD(&list); + hash = sh->hash_lock_index; + do_release_stripe(conf, sh, &list); spin_unlock(&conf->device_lock); + release_inactive_stripe_list(conf, &list, hash); } local_irq_restore(flags); } @@ -376,18 +453,21 @@ static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh) /* find an idle stripe, make sure it is unhashed, and return it. */ -static struct stripe_head *get_free_stripe(struct r5conf *conf) +static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash) { struct stripe_head *sh = NULL; struct list_head *first; - if (list_empty(&conf->inactive_list)) + if (list_empty(conf->inactive_list + hash)) goto out; - first = conf->inactive_list.next; + first = (conf->inactive_list + hash)->next; sh = list_entry(first, struct stripe_head, lru); list_del_init(first); remove_hash(sh); atomic_inc(&conf->active_stripes); + BUG_ON(hash != sh->hash_lock_index); + if (list_empty(conf->inactive_list + hash)) + atomic_inc(&conf->empty_inactive_list_nr); out: return sh; } @@ -430,7 +510,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) { struct r5conf *conf = sh->raid_conf; - int i; + int i, seq; BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); @@ -440,7 +520,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) (unsigned long long)sh->sector); remove_hash(sh); - +retry: + seq = read_seqcount_begin(&conf->gen_lock); sh->generation = conf->generation - previous; sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; sh->sector = sector; @@ -462,6 +543,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) dev->flags = 0; raid5_build_block(sh, i, previous); } + if (read_seqcount_retry(&conf->gen_lock, seq)) + goto retry; insert_hash(conf, sh); sh->cpu = smp_processor_id(); } @@ -566,57 +649,59 @@ get_active_stripe(struct r5conf *conf, sector_t sector, int previous, int noblock, int noquiesce) { struct stripe_head *sh; + int hash = stripe_hash_locks_hash(sector); pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); - spin_lock_irq(&conf->device_lock); + spin_lock_irq(conf->hash_locks + hash); do { wait_event_lock_irq(conf->wait_for_stripe, conf->quiesce == 0 || noquiesce, - conf->device_lock); + *(conf->hash_locks + hash)); sh = __find_stripe(conf, sector, conf->generation - previous); if (!sh) { if (!conf->inactive_blocked) - sh = get_free_stripe(conf); + sh = get_free_stripe(conf, hash); if (noblock && sh == NULL) break; if (!sh) { conf->inactive_blocked = 1; - wait_event_lock_irq(conf->wait_for_stripe, - !list_empty(&conf->inactive_list) && - (atomic_read(&conf->active_stripes) - < (conf->max_nr_stripes *3/4) - || !conf->inactive_blocked), - conf->device_lock); + wait_event_lock_irq( + conf->wait_for_stripe, + !list_empty(conf->inactive_list + hash) && + (atomic_read(&conf->active_stripes) + < (conf->max_nr_stripes * 3 / 4) + || !conf->inactive_blocked), + *(conf->hash_locks + hash)); conf->inactive_blocked = 0; } else init_stripe(sh, sector, previous); } else { + spin_lock(&conf->device_lock); if (atomic_read(&sh->count)) { BUG_ON(!list_empty(&sh->lru) && !test_bit(STRIPE_EXPANDING, &sh->state) && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) - && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); + ); } else { if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); - if (list_empty(&sh->lru) && - !test_bit(STRIPE_EXPANDING, &sh->state)) - BUG(); + BUG_ON(list_empty(&sh->lru)); list_del_init(&sh->lru); if (sh->group) { sh->group->stripes_cnt--; sh->group = NULL; } } + spin_unlock(&conf->device_lock); } } while (sh == NULL); if (sh) atomic_inc(&sh->count); - spin_unlock_irq(&conf->device_lock); + spin_unlock_irq(conf->hash_locks + hash); return sh; } @@ -772,7 +857,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) bi->bi_sector = (sh->sector + rdev->data_offset); if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) - bi->bi_rw |= REQ_FLUSH; + bi->bi_rw |= REQ_NOMERGE; bi->bi_vcnt = 1; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; @@ -1596,7 +1681,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) put_cpu(); } -static int grow_one_stripe(struct r5conf *conf) +static int grow_one_stripe(struct r5conf *conf, int hash) { struct stripe_head *sh; sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL); @@ -1612,6 +1697,7 @@ static int grow_one_stripe(struct r5conf *conf) kmem_cache_free(conf->slab_cache, sh); return 0; } + sh->hash_lock_index = hash; /* we just created an active stripe so... */ atomic_set(&sh->count, 1); atomic_inc(&conf->active_stripes); @@ -1624,6 +1710,7 @@ static int grow_stripes(struct r5conf *conf, int num) { struct kmem_cache *sc; int devs = max(conf->raid_disks, conf->previous_raid_disks); + int hash; if (conf->mddev->gendisk) sprintf(conf->cache_name[0], @@ -1641,9 +1728,13 @@ static int grow_stripes(struct r5conf *conf, int num) return 1; conf->slab_cache = sc; conf->pool_size = devs; - while (num--) - if (!grow_one_stripe(conf)) + hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; + while (num--) { + if (!grow_one_stripe(conf, hash)) return 1; + conf->max_nr_stripes++; + hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; + } return 0; } @@ -1701,6 +1792,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) int err; struct kmem_cache *sc; int i; + int hash, cnt; if (newsize <= conf->pool_size) return 0; /* never bother to shrink */ @@ -1740,19 +1832,29 @@ static int resize_stripes(struct r5conf *conf, int newsize) * OK, we have enough stripes, start collecting inactive * stripes and copying them over */ + hash = 0; + cnt = 0; list_for_each_entry(nsh, &newstripes, lru) { - spin_lock_irq(&conf->device_lock); - wait_event_lock_irq(conf->wait_for_stripe, - !list_empty(&conf->inactive_list), - conf->device_lock); - osh = get_free_stripe(conf); - spin_unlock_irq(&conf->device_lock); + lock_device_hash_lock(conf, hash); + wait_event_cmd(conf->wait_for_stripe, + !list_empty(conf->inactive_list + hash), + unlock_device_hash_lock(conf, hash), + lock_device_hash_lock(conf, hash)); + osh = get_free_stripe(conf, hash); + unlock_device_hash_lock(conf, hash); atomic_set(&nsh->count, 1); for(i=0; i<conf->pool_size; i++) nsh->dev[i].page = osh->dev[i].page; for( ; i<newsize; i++) nsh->dev[i].page = NULL; + nsh->hash_lock_index = hash; kmem_cache_free(conf->slab_cache, osh); + cnt++; + if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + + !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { + hash++; + cnt = 0; + } } kmem_cache_destroy(conf->slab_cache); @@ -1811,13 +1913,13 @@ static int resize_stripes(struct r5conf *conf, int newsize) return err; } -static int drop_one_stripe(struct r5conf *conf) +static int drop_one_stripe(struct r5conf *conf, int hash) { struct stripe_head *sh; - spin_lock_irq(&conf->device_lock); - sh = get_free_stripe(conf); - spin_unlock_irq(&conf->device_lock); + spin_lock_irq(conf->hash_locks + hash); + sh = get_free_stripe(conf, hash); + spin_unlock_irq(conf->hash_locks + hash); if (!sh) return 0; BUG_ON(atomic_read(&sh->count)); @@ -1829,8 +1931,10 @@ static int drop_one_stripe(struct r5conf *conf) static void shrink_stripes(struct r5conf *conf) { - while (drop_one_stripe(conf)) - ; + int hash; + for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++) + while (drop_one_stripe(conf, hash)) + ; if (conf->slab_cache) kmem_cache_destroy(conf->slab_cache); @@ -1935,6 +2039,9 @@ static void raid5_end_read_request(struct bio * bi, int error) mdname(conf->mddev), bdn); else retry = 1; + if (set_bad && test_bit(In_sync, &rdev->flags) + && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) + retry = 1; if (retry) if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { set_bit(R5_ReadError, &sh->dev[i].flags); @@ -3914,7 +4021,8 @@ static void raid5_activate_delayed(struct r5conf *conf) } } -static void activate_bit_delay(struct r5conf *conf) +static void activate_bit_delay(struct r5conf *conf, + struct list_head *temp_inactive_list) { /* device_lock is held */ struct list_head head; @@ -3922,9 +4030,11 @@ static void activate_bit_delay(struct r5conf *conf) list_del_init(&conf->bitmap_list); while (!list_empty(&head)) { struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); + int hash; list_del_init(&sh->lru); atomic_inc(&sh->count); - __release_stripe(conf, sh); + hash = sh->hash_lock_index; + __release_stripe(conf, sh, &temp_inactive_list[hash]); } } @@ -3940,7 +4050,7 @@ int md_raid5_congested(struct mddev *mddev, int bits) return 1; if (conf->quiesce) return 1; - if (list_empty_careful(&conf->inactive_list)) + if (atomic_read(&conf->empty_inactive_list_nr)) return 1; return 0; @@ -4270,6 +4380,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) struct raid5_plug_cb { struct blk_plug_cb cb; struct list_head list; + struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; }; static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) @@ -4280,6 +4391,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) struct mddev *mddev = cb->cb.data; struct r5conf *conf = mddev->private; int cnt = 0; + int hash; if (cb->list.next && !list_empty(&cb->list)) { spin_lock_irq(&conf->device_lock); @@ -4297,11 +4409,14 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) * STRIPE_ON_RELEASE_LIST could be set here. In that * case, the count is always > 1 here */ - __release_stripe(conf, sh); + hash = sh->hash_lock_index; + __release_stripe(conf, sh, &cb->temp_inactive_list[hash]); cnt++; } spin_unlock_irq(&conf->device_lock); } + release_inactive_stripe_list(conf, cb->temp_inactive_list, + NR_STRIPE_HASH_LOCKS); if (mddev->queue) trace_block_unplug(mddev->queue, cnt, !from_schedule); kfree(cb); @@ -4322,8 +4437,12 @@ static void release_stripe_plug(struct mddev *mddev, cb = container_of(blk_cb, struct raid5_plug_cb, cb); - if (cb->list.next == NULL) + if (cb->list.next == NULL) { + int i; INIT_LIST_HEAD(&cb->list); + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + INIT_LIST_HEAD(cb->temp_inactive_list + i); + } if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) list_add_tail(&sh->lru, &cb->list); @@ -4706,14 +4825,19 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, - atomic_read(&conf->reshape_stripes)==0); + atomic_read(&conf->reshape_stripes)==0 + || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (atomic_read(&conf->reshape_stripes) != 0) + return 0; mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; set_bit(MD_CHANGE_DEVS, &mddev->flags); md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, mddev->flags == 0 || - kthread_should_stop()); + test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + return 0; spin_lock_irq(&conf->device_lock); conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); @@ -4796,7 +4920,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk >= mddev->resync_max - mddev->curr_resync_completed) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, - atomic_read(&conf->reshape_stripes) == 0); + atomic_read(&conf->reshape_stripes) == 0 + || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (atomic_read(&conf->reshape_stripes) != 0) + goto ret; mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; conf->reshape_checkpoint = jiffies; @@ -4804,13 +4931,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_DEVS, &mddev->flags) - || kthread_should_stop()); + || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + goto ret; spin_lock_irq(&conf->device_lock); conf->reshape_safe = mddev->reshape_position; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } +ret: return reshape_sectors; } @@ -4968,27 +5098,45 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) } static int handle_active_stripes(struct r5conf *conf, int group, - struct r5worker *worker) + struct r5worker *worker, + struct list_head *temp_inactive_list) { struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; - int i, batch_size = 0; + int i, batch_size = 0, hash; + bool release_inactive = false; while (batch_size < MAX_STRIPE_BATCH && (sh = __get_priority_stripe(conf, group)) != NULL) batch[batch_size++] = sh; - if (batch_size == 0) - return batch_size; + if (batch_size == 0) { + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + if (!list_empty(temp_inactive_list + i)) + break; + if (i == NR_STRIPE_HASH_LOCKS) + return batch_size; + release_inactive = true; + } spin_unlock_irq(&conf->device_lock); + release_inactive_stripe_list(conf, temp_inactive_list, + NR_STRIPE_HASH_LOCKS); + + if (release_inactive) { + spin_lock_irq(&conf->device_lock); + return 0; + } + for (i = 0; i < batch_size; i++) handle_stripe(batch[i]); cond_resched(); spin_lock_irq(&conf->device_lock); - for (i = 0; i < batch_size; i++) - __release_stripe(conf, batch[i]); + for (i = 0; i < batch_size; i++) { + hash = batch[i]->hash_lock_index; + __release_stripe(conf, batch[i], &temp_inactive_list[hash]); + } return batch_size; } @@ -5009,9 +5157,10 @@ static void raid5_do_work(struct work_struct *work) while (1) { int batch_size, released; - released = release_stripe_list(conf); + released = release_stripe_list(conf, worker->temp_inactive_list); - batch_size = handle_active_stripes(conf, group_id, worker); + batch_size = handle_active_stripes(conf, group_id, worker, + worker->temp_inactive_list); worker->working = false; if (!batch_size && !released) break; @@ -5050,7 +5199,7 @@ static void raid5d(struct md_thread *thread) struct bio *bio; int batch_size, released; - released = release_stripe_list(conf); + released = release_stripe_list(conf, conf->temp_inactive_list); if ( !list_empty(&conf->bitmap_list)) { @@ -5060,7 +5209,7 @@ static void raid5d(struct md_thread *thread) bitmap_unplug(mddev->bitmap); spin_lock_irq(&conf->device_lock); conf->seq_write = conf->seq_flush; - activate_bit_delay(conf); + activate_bit_delay(conf, conf->temp_inactive_list); } raid5_activate_delayed(conf); @@ -5074,7 +5223,8 @@ static void raid5d(struct md_thread *thread) handled++; } - batch_size = handle_active_stripes(conf, ANY_GROUP, NULL); + batch_size = handle_active_stripes(conf, ANY_GROUP, NULL, + conf->temp_inactive_list); if (!batch_size && !released) break; handled += batch_size; @@ -5110,22 +5260,29 @@ raid5_set_cache_size(struct mddev *mddev, int size) { struct r5conf *conf = mddev->private; int err; + int hash; if (size <= 16 || size > 32768) return -EINVAL; + hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS; while (size < conf->max_nr_stripes) { - if (drop_one_stripe(conf)) + if (drop_one_stripe(conf, hash)) conf->max_nr_stripes--; else break; + hash--; + if (hash < 0) + hash = NR_STRIPE_HASH_LOCKS - 1; } err = md_allow_write(mddev); if (err) return err; + hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS; while (size > conf->max_nr_stripes) { - if (grow_one_stripe(conf)) + if (grow_one_stripe(conf, hash)) conf->max_nr_stripes++; else break; + hash = (hash + 1) % NR_STRIPE_HASH_LOCKS; } return 0; } @@ -5213,15 +5370,18 @@ raid5_show_group_thread_cnt(struct mddev *mddev, char *page) return 0; } -static int alloc_thread_groups(struct r5conf *conf, int cnt); +static int alloc_thread_groups(struct r5conf *conf, int cnt, + int *group_cnt, + int *worker_cnt_per_group, + struct r5worker_group **worker_groups); static ssize_t raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) { struct r5conf *conf = mddev->private; unsigned long new; int err; - struct r5worker_group *old_groups; - int old_group_cnt; + struct r5worker_group *new_groups, *old_groups; + int group_cnt, worker_cnt_per_group; if (len >= PAGE_SIZE) return -EINVAL; @@ -5237,14 +5397,19 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) mddev_suspend(mddev); old_groups = conf->worker_groups; - old_group_cnt = conf->worker_cnt_per_group; + if (old_groups) + flush_workqueue(raid5_wq); + + err = alloc_thread_groups(conf, new, + &group_cnt, &worker_cnt_per_group, + &new_groups); + if (!err) { + spin_lock_irq(&conf->device_lock); + conf->group_cnt = group_cnt; + conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_groups = new_groups; + spin_unlock_irq(&conf->device_lock); - conf->worker_groups = NULL; - err = alloc_thread_groups(conf, new); - if (err) { - conf->worker_groups = old_groups; - conf->worker_cnt_per_group = old_group_cnt; - } else { if (old_groups) kfree(old_groups[0].workers); kfree(old_groups); @@ -5274,40 +5439,47 @@ static struct attribute_group raid5_attrs_group = { .attrs = raid5_attrs, }; -static int alloc_thread_groups(struct r5conf *conf, int cnt) +static int alloc_thread_groups(struct r5conf *conf, int cnt, + int *group_cnt, + int *worker_cnt_per_group, + struct r5worker_group **worker_groups) { - int i, j; + int i, j, k; ssize_t size; struct r5worker *workers; - conf->worker_cnt_per_group = cnt; + *worker_cnt_per_group = cnt; if (cnt == 0) { - conf->worker_groups = NULL; + *group_cnt = 0; + *worker_groups = NULL; return 0; } - conf->group_cnt = num_possible_nodes(); + *group_cnt = num_possible_nodes(); size = sizeof(struct r5worker) * cnt; - workers = kzalloc(size * conf->group_cnt, GFP_NOIO); - conf->worker_groups = kzalloc(sizeof(struct r5worker_group) * - conf->group_cnt, GFP_NOIO); - if (!conf->worker_groups || !workers) { + workers = kzalloc(size * *group_cnt, GFP_NOIO); + *worker_groups = kzalloc(sizeof(struct r5worker_group) * + *group_cnt, GFP_NOIO); + if (!*worker_groups || !workers) { kfree(workers); - kfree(conf->worker_groups); - conf->worker_groups = NULL; + kfree(*worker_groups); return -ENOMEM; } - for (i = 0; i < conf->group_cnt; i++) { + for (i = 0; i < *group_cnt; i++) { struct r5worker_group *group; - group = &conf->worker_groups[i]; + group = &(*worker_groups)[i]; INIT_LIST_HEAD(&group->handle_list); group->conf = conf; group->workers = workers + i * cnt; for (j = 0; j < cnt; j++) { - group->workers[j].group = group; - INIT_WORK(&group->workers[j].work, raid5_do_work); + struct r5worker *worker = group->workers + j; + worker->group = group; + INIT_WORK(&worker->work, raid5_do_work); + + for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++) + INIT_LIST_HEAD(worker->temp_inactive_list + k); } } @@ -5458,6 +5630,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) struct md_rdev *rdev; struct disk_info *disk; char pers_name[6]; + int i; + int group_cnt, worker_cnt_per_group; + struct r5worker_group *new_group; if (mddev->new_level != 5 && mddev->new_level != 4 @@ -5492,7 +5667,12 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (conf == NULL) goto abort; /* Don't enable multi-threading by default*/ - if (alloc_thread_groups(conf, 0)) + if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, + &new_group)) { + conf->group_cnt = group_cnt; + conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_groups = new_group; + } else goto abort; spin_lock_init(&conf->device_lock); seqcount_init(&conf->gen_lock); @@ -5502,7 +5682,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) INIT_LIST_HEAD(&conf->hold_list); INIT_LIST_HEAD(&conf->delayed_list); INIT_LIST_HEAD(&conf->bitmap_list); - INIT_LIST_HEAD(&conf->inactive_list); init_llist_head(&conf->released_stripes); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); @@ -5528,6 +5707,21 @@ static struct r5conf *setup_conf(struct mddev *mddev) if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) goto abort; + /* We init hash_locks[0] separately to that it can be used + * as the reference lock in the spin_lock_nest_lock() call + * in lock_all_device_hash_locks_irq in order to convince + * lockdep that we know what we are doing. + */ + spin_lock_init(conf->hash_locks); + for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) + spin_lock_init(conf->hash_locks + i); + + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + INIT_LIST_HEAD(conf->inactive_list + i); + + for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) + INIT_LIST_HEAD(conf->temp_inactive_list + i); + conf->level = mddev->new_level; if (raid5_alloc_percpu(conf) != 0) goto abort; @@ -5568,7 +5762,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) else conf->max_degraded = 1; conf->algorithm = mddev->new_layout; - conf->max_nr_stripes = NR_STRIPES; conf->reshape_progress = mddev->reshape_position; if (conf->reshape_progress != MaxSector) { conf->prev_chunk_sectors = mddev->chunk_sectors; @@ -5577,7 +5770,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; - if (grow_stripes(conf, conf->max_nr_stripes)) { + atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); + if (grow_stripes(conf, NR_STRIPES)) { printk(KERN_ERR "md/raid:%s: couldn't allocate %dkB for buffers\n", mdname(mddev), memory); @@ -6383,12 +6577,18 @@ static int raid5_start_reshape(struct mddev *mddev) if (!mddev->sync_thread) { mddev->recovery = 0; spin_lock_irq(&conf->device_lock); + write_seqcount_begin(&conf->gen_lock); mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; + mddev->new_chunk_sectors = + conf->chunk_sectors = conf->prev_chunk_sectors; + mddev->new_layout = conf->algorithm = conf->prev_algo; rdev_for_each(rdev, mddev) rdev->new_data_offset = rdev->data_offset; smp_wmb(); + conf->generation --; conf->reshape_progress = MaxSector; mddev->reshape_position = MaxSector; + write_seqcount_end(&conf->gen_lock); spin_unlock_irq(&conf->device_lock); return -EAGAIN; } @@ -6476,27 +6676,28 @@ static void raid5_quiesce(struct mddev *mddev, int state) break; case 1: /* stop all writes */ - spin_lock_irq(&conf->device_lock); + lock_all_device_hash_locks_irq(conf); /* '2' tells resync/reshape to pause so that all * active stripes can drain */ conf->quiesce = 2; - wait_event_lock_irq(conf->wait_for_stripe, + wait_event_cmd(conf->wait_for_stripe, atomic_read(&conf->active_stripes) == 0 && atomic_read(&conf->active_aligned_reads) == 0, - conf->device_lock); + unlock_all_device_hash_locks_irq(conf), + lock_all_device_hash_locks_irq(conf)); conf->quiesce = 1; - spin_unlock_irq(&conf->device_lock); + unlock_all_device_hash_locks_irq(conf); /* allow reshape to continue */ wake_up(&conf->wait_for_overlap); break; case 0: /* re-enable writes */ - spin_lock_irq(&conf->device_lock); + lock_all_device_hash_locks_irq(conf); conf->quiesce = 0; wake_up(&conf->wait_for_stripe); wake_up(&conf->wait_for_overlap); - spin_unlock_irq(&conf->device_lock); + unlock_all_device_hash_locks_irq(conf); break; } } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 2113ffa82c7a..01ad8ae8f578 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -49,7 +49,7 @@ * can't distinguish between a clean block that has been generated * from parity calculations, and a clean block that has been * successfully written to the spare ( or to parity when resyncing). - * To distingush these states we have a stripe bit STRIPE_INSYNC that + * To distinguish these states we have a stripe bit STRIPE_INSYNC that * is set whenever a write is scheduled to the spare, or to the parity * disc if there is no spare. A sync request clears this bit, and * when we find it set with no buffers locked, we know the sync is @@ -205,6 +205,7 @@ struct stripe_head { short pd_idx; /* parity disk index */ short qd_idx; /* 'Q' disk index for raid6 */ short ddf_layout;/* use DDF ordering to calculate Q */ + short hash_lock_index; unsigned long state; /* state flags */ atomic_t count; /* nr of active thread/requests */ int bm_seq; /* sequence number for bitmap flushes */ @@ -367,9 +368,18 @@ struct disk_info { struct md_rdev *rdev, *replacement; }; +/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64. + * This is because we sometimes take all the spinlocks + * and creating that much locking depth can cause + * problems. + */ +#define NR_STRIPE_HASH_LOCKS 8 +#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1) + struct r5worker { struct work_struct work; struct r5worker_group *group; + struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; bool working; }; @@ -382,6 +392,8 @@ struct r5worker_group { struct r5conf { struct hlist_head *stripe_hashtbl; + /* only protect corresponding hash list and inactive_list */ + spinlock_t hash_locks[NR_STRIPE_HASH_LOCKS]; struct mddev *mddev; int chunk_sectors; int level, algorithm; @@ -462,7 +474,8 @@ struct r5conf { * Free stripes pool */ atomic_t active_stripes; - struct list_head inactive_list; + struct list_head inactive_list[NR_STRIPE_HASH_LOCKS]; + atomic_t empty_inactive_list_nr; struct llist_head released_stripes; wait_queue_head_t wait_for_stripe; wait_queue_head_t wait_for_overlap; @@ -477,6 +490,7 @@ struct r5conf { * the new thread here until we fully activate the array. */ struct md_thread *thread; + struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; struct r5worker_group *worker_groups; int group_cnt; int worker_cnt_per_group; |