diff options
Diffstat (limited to 'drivers/md')
27 files changed, 1659 insertions, 595 deletions
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c index f752d12081ff..be065300e93c 100644 --- a/drivers/md/dm-bio-prison.c +++ b/drivers/md/dm-bio-prison.c @@ -14,68 +14,38 @@ /*----------------------------------------------------------------*/ -struct bucket { - spinlock_t lock; - struct hlist_head cells; -}; +#define MIN_CELLS 1024 struct dm_bio_prison { + spinlock_t lock; mempool_t *cell_pool; - - unsigned nr_buckets; - unsigned hash_mask; - struct bucket *buckets; + struct rb_root cells; }; -/*----------------------------------------------------------------*/ - -static uint32_t calc_nr_buckets(unsigned nr_cells) -{ - uint32_t n = 128; - - nr_cells /= 4; - nr_cells = min(nr_cells, 8192u); - - while (n < nr_cells) - n <<= 1; - - return n; -} - static struct kmem_cache *_cell_cache; -static void init_bucket(struct bucket *b) -{ - spin_lock_init(&b->lock); - INIT_HLIST_HEAD(&b->cells); -} +/*----------------------------------------------------------------*/ /* * @nr_cells should be the number of cells you want in use _concurrently_. * Don't confuse it with the number of distinct keys. */ -struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells) +struct dm_bio_prison *dm_bio_prison_create(void) { - unsigned i; - uint32_t nr_buckets = calc_nr_buckets(nr_cells); - size_t len = sizeof(struct dm_bio_prison) + - (sizeof(struct bucket) * nr_buckets); - struct dm_bio_prison *prison = kmalloc(len, GFP_KERNEL); + struct dm_bio_prison *prison = kmalloc(sizeof(*prison), GFP_KERNEL); if (!prison) return NULL; - prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache); + spin_lock_init(&prison->lock); + + prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache); if (!prison->cell_pool) { kfree(prison); return NULL; } - prison->nr_buckets = nr_buckets; - prison->hash_mask = nr_buckets - 1; - prison->buckets = (struct bucket *) (prison + 1); - for (i = 0; i < nr_buckets; i++) - init_bucket(prison->buckets + i); + prison->cells = RB_ROOT; return prison; } @@ -101,68 +71,73 @@ void dm_bio_prison_free_cell(struct dm_bio_prison *prison, } EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell); -static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key) +static void __setup_new_cell(struct dm_cell_key *key, + struct bio *holder, + struct dm_bio_prison_cell *cell) { - const unsigned long BIG_PRIME = 4294967291UL; - uint64_t hash = key->block * BIG_PRIME; - - return (uint32_t) (hash & prison->hash_mask); + memcpy(&cell->key, key, sizeof(cell->key)); + cell->holder = holder; + bio_list_init(&cell->bios); } -static int keys_equal(struct dm_cell_key *lhs, struct dm_cell_key *rhs) +static int cmp_keys(struct dm_cell_key *lhs, + struct dm_cell_key *rhs) { - return (lhs->virtual == rhs->virtual) && - (lhs->dev == rhs->dev) && - (lhs->block == rhs->block); -} + if (lhs->virtual < rhs->virtual) + return -1; -static struct bucket *get_bucket(struct dm_bio_prison *prison, - struct dm_cell_key *key) -{ - return prison->buckets + hash_key(prison, key); -} + if (lhs->virtual > rhs->virtual) + return 1; -static struct dm_bio_prison_cell *__search_bucket(struct bucket *b, - struct dm_cell_key *key) -{ - struct dm_bio_prison_cell *cell; + if (lhs->dev < rhs->dev) + return -1; - hlist_for_each_entry(cell, &b->cells, list) - if (keys_equal(&cell->key, key)) - return cell; + if (lhs->dev > rhs->dev) + return 1; - return NULL; -} + if (lhs->block_end <= rhs->block_begin) + return -1; -static void __setup_new_cell(struct bucket *b, - struct dm_cell_key *key, - struct bio *holder, - struct dm_bio_prison_cell *cell) -{ - memcpy(&cell->key, key, sizeof(cell->key)); - cell->holder = holder; - bio_list_init(&cell->bios); - hlist_add_head(&cell->list, &b->cells); + if (lhs->block_begin >= rhs->block_end) + return 1; + + return 0; } -static int __bio_detain(struct bucket *b, +static int __bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key, struct bio *inmate, struct dm_bio_prison_cell *cell_prealloc, struct dm_bio_prison_cell **cell_result) { - struct dm_bio_prison_cell *cell; - - cell = __search_bucket(b, key); - if (cell) { - if (inmate) - bio_list_add(&cell->bios, inmate); - *cell_result = cell; - return 1; + int r; + struct rb_node **new = &prison->cells.rb_node, *parent = NULL; + + while (*new) { + struct dm_bio_prison_cell *cell = + container_of(*new, struct dm_bio_prison_cell, node); + + r = cmp_keys(key, &cell->key); + + parent = *new; + if (r < 0) + new = &((*new)->rb_left); + else if (r > 0) + new = &((*new)->rb_right); + else { + if (inmate) + bio_list_add(&cell->bios, inmate); + *cell_result = cell; + return 1; + } } - __setup_new_cell(b, key, inmate, cell_prealloc); + __setup_new_cell(key, inmate, cell_prealloc); *cell_result = cell_prealloc; + + rb_link_node(&cell_prealloc->node, parent, new); + rb_insert_color(&cell_prealloc->node, &prison->cells); + return 0; } @@ -174,11 +149,10 @@ static int bio_detain(struct dm_bio_prison *prison, { int r; unsigned long flags; - struct bucket *b = get_bucket(prison, key); - spin_lock_irqsave(&b->lock, flags); - r = __bio_detain(b, key, inmate, cell_prealloc, cell_result); - spin_unlock_irqrestore(&b->lock, flags); + spin_lock_irqsave(&prison->lock, flags); + r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result); + spin_unlock_irqrestore(&prison->lock, flags); return r; } @@ -205,10 +179,11 @@ EXPORT_SYMBOL_GPL(dm_get_cell); /* * @inmates must have been initialised prior to this call */ -static void __cell_release(struct dm_bio_prison_cell *cell, +static void __cell_release(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, struct bio_list *inmates) { - hlist_del(&cell->list); + rb_erase(&cell->node, &prison->cells); if (inmates) { if (cell->holder) @@ -222,21 +197,21 @@ void dm_cell_release(struct dm_bio_prison *prison, struct bio_list *bios) { unsigned long flags; - struct bucket *b = get_bucket(prison, &cell->key); - spin_lock_irqsave(&b->lock, flags); - __cell_release(cell, bios); - spin_unlock_irqrestore(&b->lock, flags); + spin_lock_irqsave(&prison->lock, flags); + __cell_release(prison, cell, bios); + spin_unlock_irqrestore(&prison->lock, flags); } EXPORT_SYMBOL_GPL(dm_cell_release); /* * Sometimes we don't want the holder, just the additional bios. */ -static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, +static void __cell_release_no_holder(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, struct bio_list *inmates) { - hlist_del(&cell->list); + rb_erase(&cell->node, &prison->cells); bio_list_merge(inmates, &cell->bios); } @@ -245,11 +220,10 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison, struct bio_list *inmates) { unsigned long flags; - struct bucket *b = get_bucket(prison, &cell->key); - spin_lock_irqsave(&b->lock, flags); - __cell_release_no_holder(cell, inmates); - spin_unlock_irqrestore(&b->lock, flags); + spin_lock_irqsave(&prison->lock, flags); + __cell_release_no_holder(prison, cell, inmates); + spin_unlock_irqrestore(&prison->lock, flags); } EXPORT_SYMBOL_GPL(dm_cell_release_no_holder); @@ -267,6 +241,20 @@ void dm_cell_error(struct dm_bio_prison *prison, } EXPORT_SYMBOL_GPL(dm_cell_error); +void dm_cell_visit_release(struct dm_bio_prison *prison, + void (*visit_fn)(void *, struct dm_bio_prison_cell *), + void *context, + struct dm_bio_prison_cell *cell) +{ + unsigned long flags; + + spin_lock_irqsave(&prison->lock, flags); + visit_fn(context, cell); + rb_erase(&cell->node, &prison->cells); + spin_unlock_irqrestore(&prison->lock, flags); +} +EXPORT_SYMBOL_GPL(dm_cell_visit_release); + /*----------------------------------------------------------------*/ #define DEFERRED_SET_SIZE 64 diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h index 6805a142b750..74cf01144b1f 100644 --- a/drivers/md/dm-bio-prison.h +++ b/drivers/md/dm-bio-prison.h @@ -10,8 +10,8 @@ #include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */ #include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */ -#include <linux/list.h> #include <linux/bio.h> +#include <linux/rbtree.h> /*----------------------------------------------------------------*/ @@ -23,11 +23,14 @@ */ struct dm_bio_prison; -/* FIXME: this needs to be more abstract */ +/* + * Keys define a range of blocks within either a virtual or physical + * device. + */ struct dm_cell_key { int virtual; dm_thin_id dev; - dm_block_t block; + dm_block_t block_begin, block_end; }; /* @@ -35,13 +38,15 @@ struct dm_cell_key { * themselves. */ struct dm_bio_prison_cell { - struct hlist_node list; + struct list_head user_list; /* for client use */ + struct rb_node node; + struct dm_cell_key key; struct bio *holder; struct bio_list bios; }; -struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells); +struct dm_bio_prison *dm_bio_prison_create(void); void dm_bio_prison_destroy(struct dm_bio_prison *prison); /* @@ -57,7 +62,7 @@ void dm_bio_prison_free_cell(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell); /* - * Creates, or retrieves a cell for the given key. + * Creates, or retrieves a cell that overlaps the given key. * * Returns 1 if pre-existing cell returned, zero if new cell created using * @cell_prealloc. @@ -68,7 +73,8 @@ int dm_get_cell(struct dm_bio_prison *prison, struct dm_bio_prison_cell **cell_result); /* - * An atomic op that combines retrieving a cell, and adding a bio to it. + * An atomic op that combines retrieving or creating a cell, and adding a + * bio to it. * * Returns 1 if the cell was already held, 0 if @inmate is the new holder. */ @@ -87,6 +93,14 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison, void dm_cell_error(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell, int error); +/* + * Visits the cell and then releases. Guarantees no new inmates are + * inserted between the visit and release. + */ +void dm_cell_visit_release(struct dm_bio_prison *prison, + void (*visit_fn)(void *, struct dm_bio_prison_cell *), + void *context, struct dm_bio_prison_cell *cell); + /*----------------------------------------------------------------*/ /* diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 825ca1f87639..c33b49792b87 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -14,6 +14,7 @@ #include <linux/vmalloc.h> #include <linux/shrinker.h> #include <linux/module.h> +#include <linux/rbtree.h> #define DM_MSG_PREFIX "bufio" @@ -34,26 +35,23 @@ /* * Check buffer ages in this interval (seconds) */ -#define DM_BUFIO_WORK_TIMER_SECS 10 +#define DM_BUFIO_WORK_TIMER_SECS 30 /* * Free buffers when they are older than this (seconds) */ -#define DM_BUFIO_DEFAULT_AGE_SECS 60 +#define DM_BUFIO_DEFAULT_AGE_SECS 300 /* - * The number of bvec entries that are embedded directly in the buffer. - * If the chunk size is larger, dm-io is used to do the io. + * The nr of bytes of cached data to keep around. */ -#define DM_BUFIO_INLINE_VECS 16 +#define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024) /* - * Buffer hash + * The number of bvec entries that are embedded directly in the buffer. + * If the chunk size is larger, dm-io is used to do the io. */ -#define DM_BUFIO_HASH_BITS 20 -#define DM_BUFIO_HASH(block) \ - ((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \ - ((1 << DM_BUFIO_HASH_BITS) - 1)) +#define DM_BUFIO_INLINE_VECS 16 /* * Don't try to use kmem_cache_alloc for blocks larger than this. @@ -106,7 +104,7 @@ struct dm_bufio_client { unsigned minimum_buffers; - struct hlist_head *cache_hash; + struct rb_root buffer_tree; wait_queue_head_t free_buffer_wait; int async_write_error; @@ -135,7 +133,7 @@ enum data_mode { }; struct dm_buffer { - struct hlist_node hash_list; + struct rb_node node; struct list_head lru_list; sector_t block; void *data; @@ -223,6 +221,7 @@ static DEFINE_SPINLOCK(param_spinlock); * Buffers are freed after this timeout */ static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS; +static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES; static unsigned long dm_bufio_peak_allocated; static unsigned long dm_bufio_allocated_kmem_cache; @@ -253,6 +252,53 @@ static LIST_HEAD(dm_bufio_all_clients); */ static DEFINE_MUTEX(dm_bufio_clients_lock); +/*---------------------------------------------------------------- + * A red/black tree acts as an index for all the buffers. + *--------------------------------------------------------------*/ +static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) +{ + struct rb_node *n = c->buffer_tree.rb_node; + struct dm_buffer *b; + + while (n) { + b = container_of(n, struct dm_buffer, node); + + if (b->block == block) + return b; + + n = (b->block < block) ? n->rb_left : n->rb_right; + } + + return NULL; +} + +static void __insert(struct dm_bufio_client *c, struct dm_buffer *b) +{ + struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL; + struct dm_buffer *found; + + while (*new) { + found = container_of(*new, struct dm_buffer, node); + + if (found->block == b->block) { + BUG_ON(found != b); + return; + } + + parent = *new; + new = (found->block < b->block) ? + &((*new)->rb_left) : &((*new)->rb_right); + } + + rb_link_node(&b->node, parent, new); + rb_insert_color(&b->node, &c->buffer_tree); +} + +static void __remove(struct dm_bufio_client *c, struct dm_buffer *b) +{ + rb_erase(&b->node, &c->buffer_tree); +} + /*----------------------------------------------------------------*/ static void adjust_total_allocated(enum data_mode data_mode, long diff) @@ -434,7 +480,7 @@ static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty) b->block = block; b->list_mode = dirty; list_add(&b->lru_list, &c->lru[dirty]); - hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]); + __insert(b->c, b); b->last_accessed = jiffies; } @@ -448,7 +494,7 @@ static void __unlink_buffer(struct dm_buffer *b) BUG_ON(!c->n_buffers[b->list_mode]); c->n_buffers[b->list_mode]--; - hlist_del(&b->hash_list); + __remove(b->c, b); list_del(&b->lru_list); } @@ -532,6 +578,19 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block, end_io(&b->bio, r); } +static void inline_endio(struct bio *bio, int error) +{ + bio_end_io_t *end_fn = bio->bi_private; + + /* + * Reset the bio to free any attached resources + * (e.g. bio integrity profiles). + */ + bio_reset(bio); + + end_fn(bio, error); +} + static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, bio_end_io_t *end_io) { @@ -543,7 +602,12 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS; b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits; b->bio.bi_bdev = b->c->bdev; - b->bio.bi_end_io = end_io; + b->bio.bi_end_io = inline_endio; + /* + * Use of .bi_private isn't a problem here because + * the dm_buffer's inline bio is local to bufio. + */ + b->bio.bi_private = end_io; /* * We assume that if len >= PAGE_SIZE ptr is page-aligned. @@ -887,23 +951,6 @@ static void __check_watermark(struct dm_bufio_client *c, __write_dirty_buffers_async(c, 1, write_list); } -/* - * Find a buffer in the hash. - */ -static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) -{ - struct dm_buffer *b; - - hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)], - hash_list) { - dm_bufio_cond_resched(); - if (b->block == block) - return b; - } - - return NULL; -} - /*---------------------------------------------------------------- * Getting a buffer *--------------------------------------------------------------*/ @@ -1433,45 +1480,52 @@ static void drop_buffers(struct dm_bufio_client *c) } /* - * Test if the buffer is unused and too old, and commit it. - * At if noio is set, we must not do any I/O because we hold - * dm_bufio_clients_lock and we would risk deadlock if the I/O gets rerouted to - * different bufio client. + * We may not be able to evict this buffer if IO pending or the client + * is still using it. Caller is expected to know buffer is too old. + * + * And if GFP_NOFS is used, we must not do any I/O because we hold + * dm_bufio_clients_lock and we would risk deadlock if the I/O gets + * rerouted to different bufio client. */ -static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp, - unsigned long max_jiffies) +static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp) { - if (jiffies - b->last_accessed < max_jiffies) - return 0; - - if (!(gfp & __GFP_IO)) { + if (!(gfp & __GFP_FS)) { if (test_bit(B_READING, &b->state) || test_bit(B_WRITING, &b->state) || test_bit(B_DIRTY, &b->state)) - return 0; + return false; } if (b->hold_count) - return 0; + return false; __make_buffer_clean(b); __unlink_buffer(b); __free_buffer_wake(b); - return 1; + return true; } -static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, - gfp_t gfp_mask) +static unsigned get_retain_buffers(struct dm_bufio_client *c) +{ + unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes); + return retain_bytes / c->block_size; +} + +static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, + gfp_t gfp_mask) { int l; struct dm_buffer *b, *tmp; - long freed = 0; + unsigned long freed = 0; + unsigned long count = nr_to_scan; + unsigned retain_target = get_retain_buffers(c); for (l = 0; l < LIST_SIZE; l++) { list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) { - freed += __cleanup_old_buffer(b, gfp_mask, 0); - if (!--nr_to_scan) + if (__try_evict_buffer(b, gfp_mask)) + freed++; + if (!--nr_to_scan || ((count - freed) <= retain_target)) return freed; dm_bufio_cond_resched(); } @@ -1486,7 +1540,7 @@ dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) unsigned long freed; c = container_of(shrink, struct dm_bufio_client, shrinker); - if (sc->gfp_mask & __GFP_IO) + if (sc->gfp_mask & __GFP_FS) dm_bufio_lock(c); else if (!dm_bufio_trylock(c)) return SHRINK_STOP; @@ -1503,7 +1557,7 @@ dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) unsigned long count; c = container_of(shrink, struct dm_bufio_client, shrinker); - if (sc->gfp_mask & __GFP_IO) + if (sc->gfp_mask & __GFP_FS) dm_bufio_lock(c); else if (!dm_bufio_trylock(c)) return 0; @@ -1533,11 +1587,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign r = -ENOMEM; goto bad_client; } - c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS); - if (!c->cache_hash) { - r = -ENOMEM; - goto bad_hash; - } + c->buffer_tree = RB_ROOT; c->bdev = bdev; c->block_size = block_size; @@ -1556,9 +1606,6 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign c->n_buffers[i] = 0; } - for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++) - INIT_HLIST_HEAD(&c->cache_hash[i]); - mutex_init(&c->lock); INIT_LIST_HEAD(&c->reserved_buffers); c->need_reserved_buffers = reserved_buffers; @@ -1632,8 +1679,6 @@ bad_cache: } dm_io_client_destroy(c->dm_io); bad_dm_io: - vfree(c->cache_hash); -bad_hash: kfree(c); bad_client: return ERR_PTR(r); @@ -1660,9 +1705,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) mutex_unlock(&dm_bufio_clients_lock); - for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++) - BUG_ON(!hlist_empty(&c->cache_hash[i])); - + BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree)); BUG_ON(c->need_reserved_buffers); while (!list_empty(&c->reserved_buffers)) { @@ -1680,36 +1723,60 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) BUG_ON(c->n_buffers[i]); dm_io_client_destroy(c->dm_io); - vfree(c->cache_hash); kfree(c); } EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); -static void cleanup_old_buffers(void) +static unsigned get_max_age_hz(void) { - unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age); - struct dm_bufio_client *c; + unsigned max_age = ACCESS_ONCE(dm_bufio_max_age); - if (max_age > ULONG_MAX / HZ) - max_age = ULONG_MAX / HZ; + if (max_age > UINT_MAX / HZ) + max_age = UINT_MAX / HZ; - mutex_lock(&dm_bufio_clients_lock); - list_for_each_entry(c, &dm_bufio_all_clients, client_list) { - if (!dm_bufio_trylock(c)) - continue; + return max_age * HZ; +} - while (!list_empty(&c->lru[LIST_CLEAN])) { - struct dm_buffer *b; - b = list_entry(c->lru[LIST_CLEAN].prev, - struct dm_buffer, lru_list); - if (!__cleanup_old_buffer(b, 0, max_age * HZ)) - break; - dm_bufio_cond_resched(); - } +static bool older_than(struct dm_buffer *b, unsigned long age_hz) +{ + return (jiffies - b->last_accessed) >= age_hz; +} + +static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) +{ + struct dm_buffer *b, *tmp; + unsigned retain_target = get_retain_buffers(c); + unsigned count; + + dm_bufio_lock(c); + + count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY]; + list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) { + if (count <= retain_target) + break; + + if (!older_than(b, age_hz)) + break; + + if (__try_evict_buffer(b, 0)) + count--; - dm_bufio_unlock(c); dm_bufio_cond_resched(); } + + dm_bufio_unlock(c); +} + +static void cleanup_old_buffers(void) +{ + unsigned long max_age_hz = get_max_age_hz(); + struct dm_bufio_client *c; + + mutex_lock(&dm_bufio_clients_lock); + + list_for_each_entry(c, &dm_bufio_all_clients, client_list) + __evict_old_buffers(c, max_age_hz); + mutex_unlock(&dm_bufio_clients_lock); } @@ -1834,6 +1901,9 @@ MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache"); module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds"); +module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory"); + module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory"); diff --git a/drivers/md/dm-cache-block-types.h b/drivers/md/dm-cache-block-types.h index aac0e2df06be..bed4ad4e1b7c 100644 --- a/drivers/md/dm-cache-block-types.h +++ b/drivers/md/dm-cache-block-types.h @@ -19,6 +19,7 @@ typedef dm_block_t __bitwise__ dm_oblock_t; typedef uint32_t __bitwise__ dm_cblock_t; +typedef dm_block_t __bitwise__ dm_dblock_t; static inline dm_oblock_t to_oblock(dm_block_t b) { @@ -40,4 +41,14 @@ static inline uint32_t from_cblock(dm_cblock_t b) return (__force uint32_t) b; } +static inline dm_dblock_t to_dblock(dm_block_t b) +{ + return (__force dm_dblock_t) b; +} + +static inline dm_block_t from_dblock(dm_dblock_t b) +{ + return (__force dm_block_t) b; +} + #endif /* DM_CACHE_BLOCK_TYPES_H */ diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 06709257adde..9fc616c2755e 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -109,7 +109,7 @@ struct dm_cache_metadata { dm_block_t discard_root; sector_t discard_block_size; - dm_oblock_t discard_nr_blocks; + dm_dblock_t discard_nr_blocks; sector_t data_block_size; dm_cblock_t cache_blocks; @@ -329,7 +329,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd) disk_super->hint_root = cpu_to_le64(cmd->hint_root); disk_super->discard_root = cpu_to_le64(cmd->discard_root); disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); - disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks)); + disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks)); disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE); disk_super->data_block_size = cpu_to_le32(cmd->data_block_size); disk_super->cache_blocks = cpu_to_le32(0); @@ -528,7 +528,7 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd, cmd->hint_root = le64_to_cpu(disk_super->hint_root); cmd->discard_root = le64_to_cpu(disk_super->discard_root); cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size); - cmd->discard_nr_blocks = to_oblock(le64_to_cpu(disk_super->discard_nr_blocks)); + cmd->discard_nr_blocks = to_dblock(le64_to_cpu(disk_super->discard_nr_blocks)); cmd->data_block_size = le32_to_cpu(disk_super->data_block_size); cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks)); strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name)); @@ -626,7 +626,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd, disk_super->hint_root = cpu_to_le64(cmd->hint_root); disk_super->discard_root = cpu_to_le64(cmd->discard_root); disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size); - disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks)); + disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks)); disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks)); strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name)); disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]); @@ -797,15 +797,15 @@ out: int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, sector_t discard_block_size, - dm_oblock_t new_nr_entries) + dm_dblock_t new_nr_entries) { int r; down_write(&cmd->root_lock); r = dm_bitset_resize(&cmd->discard_info, cmd->discard_root, - from_oblock(cmd->discard_nr_blocks), - from_oblock(new_nr_entries), + from_dblock(cmd->discard_nr_blocks), + from_dblock(new_nr_entries), false, &cmd->discard_root); if (!r) { cmd->discard_block_size = discard_block_size; @@ -818,28 +818,28 @@ int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, return r; } -static int __set_discard(struct dm_cache_metadata *cmd, dm_oblock_t b) +static int __set_discard(struct dm_cache_metadata *cmd, dm_dblock_t b) { return dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root, - from_oblock(b), &cmd->discard_root); + from_dblock(b), &cmd->discard_root); } -static int __clear_discard(struct dm_cache_metadata *cmd, dm_oblock_t b) +static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b) { return dm_bitset_clear_bit(&cmd->discard_info, cmd->discard_root, - from_oblock(b), &cmd->discard_root); + from_dblock(b), &cmd->discard_root); } -static int __is_discarded(struct dm_cache_metadata *cmd, dm_oblock_t b, +static int __is_discarded(struct dm_cache_metadata *cmd, dm_dblock_t b, bool *is_discarded) { return dm_bitset_test_bit(&cmd->discard_info, cmd->discard_root, - from_oblock(b), &cmd->discard_root, + from_dblock(b), &cmd->discard_root, is_discarded); } static int __discard(struct dm_cache_metadata *cmd, - dm_oblock_t dblock, bool discard) + dm_dblock_t dblock, bool discard) { int r; @@ -852,7 +852,7 @@ static int __discard(struct dm_cache_metadata *cmd, } int dm_cache_set_discard(struct dm_cache_metadata *cmd, - dm_oblock_t dblock, bool discard) + dm_dblock_t dblock, bool discard) { int r; @@ -870,8 +870,8 @@ static int __load_discards(struct dm_cache_metadata *cmd, dm_block_t b; bool discard; - for (b = 0; b < from_oblock(cmd->discard_nr_blocks); b++) { - dm_oblock_t dblock = to_oblock(b); + for (b = 0; b < from_dblock(cmd->discard_nr_blocks); b++) { + dm_dblock_t dblock = to_dblock(b); if (cmd->clean_when_opened) { r = __is_discarded(cmd, dblock, &discard); diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 7383c90ccdb8..4ecc403be283 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -70,14 +70,14 @@ dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd); int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, sector_t discard_block_size, - dm_oblock_t new_nr_entries); + dm_dblock_t new_nr_entries); typedef int (*load_discard_fn)(void *context, sector_t discard_block_size, - dm_oblock_t dblock, bool discarded); + dm_dblock_t dblock, bool discarded); int dm_cache_load_discards(struct dm_cache_metadata *cmd, load_discard_fn fn, void *context); -int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_oblock_t dblock, bool discard); +int dm_cache_set_discard(struct dm_cache_metadata *cmd, dm_dblock_t dblock, bool discard); int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock); int dm_cache_insert_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock, dm_oblock_t oblock); diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 0e385e40909e..13f547a4eeb6 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -181,24 +181,30 @@ static void queue_shift_down(struct queue *q) * Gives us the oldest entry of the lowest popoulated level. If the first * level is emptied then we shift down one level. */ -static struct list_head *queue_pop(struct queue *q) +static struct list_head *queue_peek(struct queue *q) { unsigned level; - struct list_head *r; for (level = 0; level < NR_QUEUE_LEVELS; level++) - if (!list_empty(q->qs + level)) { - r = q->qs[level].next; - list_del(r); + if (!list_empty(q->qs + level)) + return q->qs[level].next; - /* have we just emptied the bottom level? */ - if (level == 0 && list_empty(q->qs)) - queue_shift_down(q); + return NULL; +} - return r; - } +static struct list_head *queue_pop(struct queue *q) +{ + struct list_head *r = queue_peek(q); - return NULL; + if (r) { + list_del(r); + + /* have we just emptied the bottom level? */ + if (list_empty(q->qs)) + queue_shift_down(q); + } + + return r; } static struct list_head *list_pop(struct list_head *lh) @@ -383,13 +389,6 @@ struct mq_policy { unsigned generation; unsigned generation_period; /* in lookups (will probably change) */ - /* - * Entries in the pre_cache whose hit count passes the promotion - * threshold move to the cache proper. Working out the correct - * value for the promotion_threshold is crucial to this policy. - */ - unsigned promote_threshold; - unsigned discard_promote_adjustment; unsigned read_promote_adjustment; unsigned write_promote_adjustment; @@ -406,6 +405,7 @@ struct mq_policy { #define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1 #define DEFAULT_READ_PROMOTE_ADJUSTMENT 4 #define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8 +#define DISCOURAGE_DEMOTING_DIRTY_THRESHOLD 128 /*----------------------------------------------------------------*/ @@ -518,6 +518,12 @@ static struct entry *pop(struct mq_policy *mq, struct queue *q) return e; } +static struct entry *peek(struct queue *q) +{ + struct list_head *h = queue_peek(q); + return h ? container_of(h, struct entry, list) : NULL; +} + /* * Has this entry already been updated? */ @@ -570,10 +576,6 @@ static void check_generation(struct mq_policy *mq) break; } } - - mq->promote_threshold = nr ? total / nr : 1; - if (mq->promote_threshold * nr < total) - mq->promote_threshold++; } } @@ -641,6 +643,30 @@ static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock) } /* + * Entries in the pre_cache whose hit count passes the promotion + * threshold move to the cache proper. Working out the correct + * value for the promotion_threshold is crucial to this policy. + */ +static unsigned promote_threshold(struct mq_policy *mq) +{ + struct entry *e; + + if (any_free_cblocks(mq)) + return 0; + + e = peek(&mq->cache_clean); + if (e) + return e->hit_count; + + e = peek(&mq->cache_dirty); + if (e) + return e->hit_count + DISCOURAGE_DEMOTING_DIRTY_THRESHOLD; + + /* This should never happen */ + return 0; +} + +/* * We modify the basic promotion_threshold depending on the specific io. * * If the origin block has been discarded then there's no cost to copy it @@ -653,7 +679,7 @@ static unsigned adjusted_promote_threshold(struct mq_policy *mq, bool discarded_oblock, int data_dir) { if (data_dir == READ) - return mq->promote_threshold + mq->read_promote_adjustment; + return promote_threshold(mq) + mq->read_promote_adjustment; if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) { /* @@ -663,7 +689,7 @@ static unsigned adjusted_promote_threshold(struct mq_policy *mq, return mq->discard_promote_adjustment; } - return mq->promote_threshold + mq->write_promote_adjustment; + return promote_threshold(mq) + mq->write_promote_adjustment; } static bool should_promote(struct mq_policy *mq, struct entry *e, @@ -839,7 +865,8 @@ static int map(struct mq_policy *mq, dm_oblock_t oblock, if (e && in_cache(mq, e)) r = cache_entry_found(mq, e, result); - else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) + else if (mq->tracker.thresholds[PATTERN_SEQUENTIAL] && + iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) result->op = POLICY_MISS; else if (e) @@ -1230,7 +1257,6 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, mq->tick = 0; mq->hit_count = 0; mq->generation = 0; - mq->promote_threshold = 0; mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT; mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT; mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT; @@ -1265,7 +1291,7 @@ bad_pre_cache_init: static struct dm_cache_policy_type mq_policy_type = { .name = "mq", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create @@ -1273,7 +1299,7 @@ static struct dm_cache_policy_type mq_policy_type = { static struct dm_cache_policy_type default_policy_type = { .name = "default", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create, diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 7130505c2425..1e96d7889f51 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -95,7 +95,6 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio) /*----------------------------------------------------------------*/ -#define PRISON_CELLS 1024 #define MIGRATION_POOL_SIZE 128 #define COMMIT_PERIOD HZ #define MIGRATION_COUNT_WINDOW 10 @@ -237,8 +236,9 @@ struct cache { /* * origin_blocks entries, discarded if set. */ - dm_oblock_t discard_nr_blocks; + dm_dblock_t discard_nr_blocks; unsigned long *discard_bitset; + uint32_t discard_block_size; /* a power of 2 times sectors per block */ /* * Rather than reconstructing the table line for the status we just @@ -310,6 +310,7 @@ struct dm_cache_migration { dm_cblock_t cblock; bool err:1; + bool discard:1; bool writeback:1; bool demote:1; bool promote:1; @@ -433,11 +434,12 @@ static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cel /*----------------------------------------------------------------*/ -static void build_key(dm_oblock_t oblock, struct dm_cell_key *key) +static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key) { key->virtual = 0; key->dev = 0; - key->block = from_oblock(oblock); + key->block_begin = from_oblock(begin); + key->block_end = from_oblock(end); } /* @@ -447,15 +449,15 @@ static void build_key(dm_oblock_t oblock, struct dm_cell_key *key) */ typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell); -static int bio_detain(struct cache *cache, dm_oblock_t oblock, - struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, - cell_free_fn free_fn, void *free_context, - struct dm_bio_prison_cell **cell_result) +static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end, + struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, + cell_free_fn free_fn, void *free_context, + struct dm_bio_prison_cell **cell_result) { int r; struct dm_cell_key key; - build_key(oblock, &key); + build_key(oblock_begin, oblock_end, &key); r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); if (r) free_fn(free_context, cell_prealloc); @@ -463,6 +465,16 @@ static int bio_detain(struct cache *cache, dm_oblock_t oblock, return r; } +static int bio_detain(struct cache *cache, dm_oblock_t oblock, + struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, + cell_free_fn free_fn, void *free_context, + struct dm_bio_prison_cell **cell_result) +{ + dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); + return bio_detain_range(cache, oblock, end, bio, + cell_prealloc, free_fn, free_context, cell_result); +} + static int get_cell(struct cache *cache, dm_oblock_t oblock, struct prealloc *structs, @@ -474,7 +486,7 @@ static int get_cell(struct cache *cache, cell_prealloc = prealloc_get_cell(structs); - build_key(oblock, &key); + build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key); r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); if (r) prealloc_put_cell(structs, cell_prealloc); @@ -524,33 +536,57 @@ static dm_block_t block_div(dm_block_t b, uint32_t n) return b; } -static void set_discard(struct cache *cache, dm_oblock_t b) +static dm_block_t oblocks_per_dblock(struct cache *cache) +{ + dm_block_t oblocks = cache->discard_block_size; + + if (block_size_is_power_of_two(cache)) + oblocks >>= cache->sectors_per_block_shift; + else + oblocks = block_div(oblocks, cache->sectors_per_block); + + return oblocks; +} + +static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) +{ + return to_dblock(block_div(from_oblock(oblock), + oblocks_per_dblock(cache))); +} + +static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock) +{ + return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache)); +} + +static void set_discard(struct cache *cache, dm_dblock_t b) { unsigned long flags; + BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); atomic_inc(&cache->stats.discard_count); spin_lock_irqsave(&cache->lock, flags); - set_bit(from_oblock(b), cache->discard_bitset); + set_bit(from_dblock(b), cache->discard_bitset); spin_unlock_irqrestore(&cache->lock, flags); } -static void clear_discard(struct cache *cache, dm_oblock_t b) +static void clear_discard(struct cache *cache, dm_dblock_t b) { unsigned long flags; spin_lock_irqsave(&cache->lock, flags); - clear_bit(from_oblock(b), cache->discard_bitset); + clear_bit(from_dblock(b), cache->discard_bitset); spin_unlock_irqrestore(&cache->lock, flags); } -static bool is_discarded(struct cache *cache, dm_oblock_t b) +static bool is_discarded(struct cache *cache, dm_dblock_t b) { int r; unsigned long flags; spin_lock_irqsave(&cache->lock, flags); - r = test_bit(from_oblock(b), cache->discard_bitset); + r = test_bit(from_dblock(b), cache->discard_bitset); spin_unlock_irqrestore(&cache->lock, flags); return r; @@ -562,7 +598,8 @@ static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) unsigned long flags; spin_lock_irqsave(&cache->lock, flags); - r = test_bit(from_oblock(b), cache->discard_bitset); + r = test_bit(from_dblock(oblock_to_dblock(cache, b)), + cache->discard_bitset); spin_unlock_irqrestore(&cache->lock, flags); return r; @@ -687,7 +724,7 @@ static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, check_if_tick_bio_needed(cache, bio); remap_to_origin(cache, bio); if (bio_data_dir(bio) == WRITE) - clear_discard(cache, oblock); + clear_discard(cache, oblock_to_dblock(cache, oblock)); } static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, @@ -697,7 +734,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, remap_to_cache(cache, bio, cblock); if (bio_data_dir(bio) == WRITE) { set_dirty(cache, oblock, cblock); - clear_discard(cache, oblock); + clear_discard(cache, oblock_to_dblock(cache, oblock)); } } @@ -951,10 +988,14 @@ static void migration_success_post_commit(struct dm_cache_migration *mg) } } else { - clear_dirty(cache, mg->new_oblock, mg->cblock); - if (mg->requeue_holder) + if (mg->requeue_holder) { + clear_dirty(cache, mg->new_oblock, mg->cblock); cell_defer(cache, mg->new_ocell, true); - else { + } else { + /* + * The block was promoted via an overwrite, so it's dirty. + */ + set_dirty(cache, mg->new_oblock, mg->cblock); bio_endio(mg->new_ocell->holder, 0); cell_defer(cache, mg->new_ocell, false); } @@ -978,7 +1019,7 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) wake_worker(cache); } -static void issue_copy_real(struct dm_cache_migration *mg) +static void issue_copy(struct dm_cache_migration *mg) { int r; struct dm_io_region o_region, c_region; @@ -1057,11 +1098,46 @@ static void avoid_copy(struct dm_cache_migration *mg) migration_success_pre_commit(mg); } -static void issue_copy(struct dm_cache_migration *mg) +static void calc_discard_block_range(struct cache *cache, struct bio *bio, + dm_dblock_t *b, dm_dblock_t *e) +{ + sector_t sb = bio->bi_iter.bi_sector; + sector_t se = bio_end_sector(bio); + + *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); + + if (se - sb < cache->discard_block_size) + *e = *b; + else + *e = to_dblock(block_div(se, cache->discard_block_size)); +} + +static void issue_discard(struct dm_cache_migration *mg) +{ + dm_dblock_t b, e; + struct bio *bio = mg->new_ocell->holder; + + calc_discard_block_range(mg->cache, bio, &b, &e); + while (b != e) { + set_discard(mg->cache, b); + b = to_dblock(from_dblock(b) + 1); + } + + bio_endio(bio, 0); + cell_defer(mg->cache, mg->new_ocell, false); + free_migration(mg); +} + +static void issue_copy_or_discard(struct dm_cache_migration *mg) { bool avoid; struct cache *cache = mg->cache; + if (mg->discard) { + issue_discard(mg); + return; + } + if (mg->writeback || mg->demote) avoid = !is_dirty(cache, mg->cblock) || is_discarded_oblock(cache, mg->old_oblock); @@ -1070,13 +1146,14 @@ static void issue_copy(struct dm_cache_migration *mg) avoid = is_discarded_oblock(cache, mg->new_oblock); - if (!avoid && bio_writes_complete_block(cache, bio)) { + if (writeback_mode(&cache->features) && + !avoid && bio_writes_complete_block(cache, bio)) { issue_overwrite(mg, bio); return; } } - avoid ? avoid_copy(mg) : issue_copy_real(mg); + avoid ? avoid_copy(mg) : issue_copy(mg); } static void complete_migration(struct dm_cache_migration *mg) @@ -1161,6 +1238,7 @@ static void promote(struct cache *cache, struct prealloc *structs, struct dm_cache_migration *mg = prealloc_get_migration(structs); mg->err = false; + mg->discard = false; mg->writeback = false; mg->demote = false; mg->promote = true; @@ -1184,6 +1262,7 @@ static void writeback(struct cache *cache, struct prealloc *structs, struct dm_cache_migration *mg = prealloc_get_migration(structs); mg->err = false; + mg->discard = false; mg->writeback = true; mg->demote = false; mg->promote = false; @@ -1209,6 +1288,7 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs, struct dm_cache_migration *mg = prealloc_get_migration(structs); mg->err = false; + mg->discard = false; mg->writeback = false; mg->demote = true; mg->promote = true; @@ -1237,6 +1317,7 @@ static void invalidate(struct cache *cache, struct prealloc *structs, struct dm_cache_migration *mg = prealloc_get_migration(structs); mg->err = false; + mg->discard = false; mg->writeback = false; mg->demote = true; mg->promote = false; @@ -1253,6 +1334,26 @@ static void invalidate(struct cache *cache, struct prealloc *structs, quiesce_migration(mg); } +static void discard(struct cache *cache, struct prealloc *structs, + struct dm_bio_prison_cell *cell) +{ + struct dm_cache_migration *mg = prealloc_get_migration(structs); + + mg->err = false; + mg->discard = true; + mg->writeback = false; + mg->demote = false; + mg->promote = false; + mg->requeue_holder = false; + mg->invalidate = false; + mg->cache = cache; + mg->old_ocell = NULL; + mg->new_ocell = cell; + mg->start_jiffies = jiffies; + + quiesce_migration(mg); +} + /*---------------------------------------------------------------- * bio processing *--------------------------------------------------------------*/ @@ -1286,31 +1387,27 @@ static void process_flush_bio(struct cache *cache, struct bio *bio) issue(cache, bio); } -/* - * People generally discard large parts of a device, eg, the whole device - * when formatting. Splitting these large discards up into cache block - * sized ios and then quiescing (always neccessary for discard) takes too - * long. - * - * We keep it simple, and allow any size of discard to come in, and just - * mark off blocks on the discard bitset. No passdown occurs! - * - * To implement passdown we need to change the bio_prison such that a cell - * can have a key that spans many blocks. - */ -static void process_discard_bio(struct cache *cache, struct bio *bio) +static void process_discard_bio(struct cache *cache, struct prealloc *structs, + struct bio *bio) { - dm_block_t start_block = dm_sector_div_up(bio->bi_iter.bi_sector, - cache->sectors_per_block); - dm_block_t end_block = bio_end_sector(bio); - dm_block_t b; + int r; + dm_dblock_t b, e; + struct dm_bio_prison_cell *cell_prealloc, *new_ocell; - end_block = block_div(end_block, cache->sectors_per_block); + calc_discard_block_range(cache, bio, &b, &e); + if (b == e) { + bio_endio(bio, 0); + return; + } - for (b = start_block; b < end_block; b++) - set_discard(cache, to_oblock(b)); + cell_prealloc = prealloc_get_cell(structs); + r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc, + (cell_free_fn) prealloc_put_cell, + structs, &new_ocell); + if (r > 0) + return; - bio_endio(bio, 0); + discard(cache, structs, new_ocell); } static bool spare_migration_bandwidth(struct cache *cache) @@ -1340,9 +1437,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs, dm_oblock_t block = get_bio_block(cache, bio); struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell; struct policy_result lookup_result; - bool discarded_block = is_discarded_oblock(cache, block); bool passthrough = passthrough_mode(&cache->features); - bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache)); + bool discarded_block, can_migrate; /* * Check to see if that block is currently migrating. @@ -1354,6 +1450,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs, if (r > 0) return; + discarded_block = is_discarded_oblock(cache, block); + can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache)); + r = policy_map(cache->policy, block, true, can_migrate, discarded_block, bio, &lookup_result); @@ -1500,7 +1599,7 @@ static void process_deferred_bios(struct cache *cache) if (bio->bi_rw & REQ_FLUSH) process_flush_bio(cache, bio); else if (bio->bi_rw & REQ_DISCARD) - process_discard_bio(cache, bio); + process_discard_bio(cache, &structs, bio); else process_bio(cache, &structs, bio); } @@ -1715,7 +1814,7 @@ static void do_worker(struct work_struct *ws) process_invalidation_requests(cache); } - process_migrations(cache, &cache->quiesced_migrations, issue_copy); + process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard); process_migrations(cache, &cache->completed_migrations, complete_migration); if (commit_if_needed(cache)) { @@ -2180,6 +2279,45 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca, return 0; } +/* + * We want the discard block size to be at least the size of the cache + * block size and have no more than 2^14 discard blocks across the origin. + */ +#define MAX_DISCARD_BLOCKS (1 << 14) + +static bool too_many_discard_blocks(sector_t discard_block_size, + sector_t origin_size) +{ + (void) sector_div(origin_size, discard_block_size); + + return origin_size > MAX_DISCARD_BLOCKS; +} + +static sector_t calculate_discard_block_size(sector_t cache_block_size, + sector_t origin_size) +{ + sector_t discard_block_size = cache_block_size; + + if (origin_size) + while (too_many_discard_blocks(discard_block_size, origin_size)) + discard_block_size *= 2; + + return discard_block_size; +} + +static void set_cache_size(struct cache *cache, dm_cblock_t size) +{ + dm_block_t nr_blocks = from_cblock(size); + + if (nr_blocks > (1 << 20) && cache->cache_size != size) + DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n" + "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n" + "Please consider increasing the cache block size to reduce the overall cache block count.", + (unsigned long long) nr_blocks); + + cache->cache_size = size; +} + #define DEFAULT_MIGRATION_THRESHOLD 2048 static int cache_create(struct cache_args *ca, struct cache **result) @@ -2204,8 +2342,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) ti->num_discard_bios = 1; ti->discards_supported = true; ti->discard_zeroes_data_unsupported = true; - /* Discard bios must be split on a block boundary */ - ti->split_discard_bios = true; + ti->split_discard_bios = false; cache->features = ca->features; ti->per_bio_data_size = get_per_bio_data_size(cache); @@ -2235,10 +2372,10 @@ static int cache_create(struct cache_args *ca, struct cache **result) cache->sectors_per_block_shift = -1; cache_size = block_div(cache_size, ca->block_size); - cache->cache_size = to_cblock(cache_size); + set_cache_size(cache, to_cblock(cache_size)); } else { cache->sectors_per_block_shift = __ffs(ca->block_size); - cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift); + set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift)); } r = create_cache_policy(cache, ca, error); @@ -2303,13 +2440,17 @@ static int cache_create(struct cache_args *ca, struct cache **result) } clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size)); - cache->discard_nr_blocks = cache->origin_blocks; - cache->discard_bitset = alloc_bitset(from_oblock(cache->discard_nr_blocks)); + cache->discard_block_size = + calculate_discard_block_size(cache->sectors_per_block, + cache->origin_sectors); + cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors, + cache->discard_block_size)); + cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks)); if (!cache->discard_bitset) { *error = "could not allocate discard bitset"; goto bad; } - clear_bitset(cache->discard_bitset, from_oblock(cache->discard_nr_blocks)); + clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); if (IS_ERR(cache->copier)) { @@ -2327,7 +2468,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) INIT_DELAYED_WORK(&cache->waker, do_waker); cache->last_commit_jiffies = jiffies; - cache->prison = dm_bio_prison_create(PRISON_CELLS); + cache->prison = dm_bio_prison_create(); if (!cache->prison) { *error = "could not create bio prison"; goto bad; @@ -2549,11 +2690,11 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso static int cache_map(struct dm_target *ti, struct bio *bio) { int r; - struct dm_bio_prison_cell *cell; + struct dm_bio_prison_cell *cell = NULL; struct cache *cache = ti->private; r = __cache_map(cache, bio, &cell); - if (r == DM_MAPIO_REMAPPED) { + if (r == DM_MAPIO_REMAPPED && cell) { inc_ds(cache, bio, cell); cell_defer(cache, cell, false); } @@ -2599,16 +2740,16 @@ static int write_discard_bitset(struct cache *cache) { unsigned i, r; - r = dm_cache_discard_bitset_resize(cache->cmd, cache->sectors_per_block, - cache->origin_blocks); + r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size, + cache->discard_nr_blocks); if (r) { DMERR("could not resize on-disk discard bitset"); return r; } - for (i = 0; i < from_oblock(cache->discard_nr_blocks); i++) { - r = dm_cache_set_discard(cache->cmd, to_oblock(i), - is_discarded(cache, to_oblock(i))); + for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) { + r = dm_cache_set_discard(cache->cmd, to_dblock(i), + is_discarded(cache, to_dblock(i))); if (r) return r; } @@ -2680,15 +2821,86 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, return 0; } +/* + * The discard block size in the on disk metadata is not + * neccessarily the same as we're currently using. So we have to + * be careful to only set the discarded attribute if we know it + * covers a complete block of the new size. + */ +struct discard_load_info { + struct cache *cache; + + /* + * These blocks are sized using the on disk dblock size, rather + * than the current one. + */ + dm_block_t block_size; + dm_block_t discard_begin, discard_end; +}; + +static void discard_load_info_init(struct cache *cache, + struct discard_load_info *li) +{ + li->cache = cache; + li->discard_begin = li->discard_end = 0; +} + +static void set_discard_range(struct discard_load_info *li) +{ + sector_t b, e; + + if (li->discard_begin == li->discard_end) + return; + + /* + * Convert to sectors. + */ + b = li->discard_begin * li->block_size; + e = li->discard_end * li->block_size; + + /* + * Then convert back to the current dblock size. + */ + b = dm_sector_div_up(b, li->cache->discard_block_size); + sector_div(e, li->cache->discard_block_size); + + /* + * The origin may have shrunk, so we need to check we're still in + * bounds. + */ + if (e > from_dblock(li->cache->discard_nr_blocks)) + e = from_dblock(li->cache->discard_nr_blocks); + + for (; b < e; b++) + set_discard(li->cache, to_dblock(b)); +} + static int load_discard(void *context, sector_t discard_block_size, - dm_oblock_t oblock, bool discard) + dm_dblock_t dblock, bool discard) { - struct cache *cache = context; + struct discard_load_info *li = context; - if (discard) - set_discard(cache, oblock); - else - clear_discard(cache, oblock); + li->block_size = discard_block_size; + + if (discard) { + if (from_dblock(dblock) == li->discard_end) + /* + * We're already in a discard range, just extend it. + */ + li->discard_end = li->discard_end + 1ULL; + + else { + /* + * Emit the old range and start a new one. + */ + set_discard_range(li); + li->discard_begin = from_dblock(dblock); + li->discard_end = li->discard_begin + 1ULL; + } + } else { + set_discard_range(li); + li->discard_begin = li->discard_end = 0; + } return 0; } @@ -2730,7 +2942,7 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) return r; } - cache->cache_size = new_size; + set_cache_size(cache, new_size); return 0; } @@ -2772,11 +2984,22 @@ static int cache_preresume(struct dm_target *ti) } if (!cache->loaded_discards) { - r = dm_cache_load_discards(cache->cmd, load_discard, cache); + struct discard_load_info li; + + /* + * The discard bitset could have been resized, or the + * discard block size changed. To be safe we start by + * setting every dblock to not discarded. + */ + clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); + + discard_load_info_init(cache, &li); + r = dm_cache_load_discards(cache->cmd, load_discard, &li); if (r) { DMERR("could not load origin discards"); return r; } + set_discard_range(&li); cache->loaded_discards = true; } @@ -3079,8 +3302,9 @@ static void set_discard_limits(struct cache *cache, struct queue_limits *limits) /* * FIXME: these limits may be incompatible with the cache device */ - limits->max_discard_sectors = cache->sectors_per_block; - limits->discard_granularity = cache->sectors_per_block << SECTOR_SHIFT; + limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024, + cache->origin_sectors); + limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT; } static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) @@ -3104,7 +3328,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 5, 0}, + .version = {1, 6, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index fc93b9330af4..08981be7baa1 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -705,7 +705,7 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc, for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++) crypto_xor(data + i * 8, buf, 8); out: - memset(buf, 0, sizeof(buf)); + memzero_explicit(buf, sizeof(buf)); return r; } diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 0be9381365d7..73f791bb9ea4 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -684,11 +684,14 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) int srcu_idx; param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | - DM_ACTIVE_PRESENT_FLAG); + DM_ACTIVE_PRESENT_FLAG | DM_INTERNAL_SUSPEND_FLAG); if (dm_suspended_md(md)) param->flags |= DM_SUSPEND_FLAG; + if (dm_suspended_internally_md(md)) + param->flags |= DM_INTERNAL_SUSPEND_FLAG; + if (dm_test_deferred_remove_flag(md)) param->flags |= DM_DEFERRED_REMOVE; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 4857fa4a5484..07c0fa0fa284 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -789,8 +789,7 @@ struct dm_raid_superblock { __le32 layout; __le32 stripe_sectors; - __u8 pad[452]; /* Round struct to 512 bytes. */ - /* Always set to 0 when writing. */ + /* Remainder of a logical block is zero-filled when writing (see super_sync()). */ } __packed; static int read_disk_sb(struct md_rdev *rdev, int size) @@ -827,7 +826,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev) test_bit(Faulty, &(rs->dev[i].rdev.flags))) failed_devices |= (1ULL << i); - memset(sb, 0, sizeof(*sb)); + memset(sb + 1, 0, rdev->sb_size - sizeof(*sb)); sb->magic = cpu_to_le32(DM_RAID_MAGIC); sb->features = cpu_to_le32(0); /* No features yet */ @@ -862,7 +861,11 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) uint64_t events_sb, events_refsb; rdev->sb_start = 0; - rdev->sb_size = sizeof(*sb); + rdev->sb_size = bdev_logical_block_size(rdev->meta_bdev); + if (rdev->sb_size < sizeof(*sb) || rdev->sb_size > PAGE_SIZE) { + DMERR("superblock size of a logical block is no longer valid"); + return -EINVAL; + } ret = read_disk_sb(rdev, rdev->sb_size); if (ret) @@ -1169,8 +1172,12 @@ static void configure_discard_support(struct dm_target *ti, struct raid_set *rs) raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6); for (i = 0; i < rs->md.raid_disks; i++) { - struct request_queue *q = bdev_get_queue(rs->dev[i].rdev.bdev); + struct request_queue *q; + + if (!rs->dev[i].rdev.bdev) + continue; + q = bdev_get_queue(rs->dev[i].rdev.bdev); if (!q || !blk_queue_discard(q)) return; diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 87f86c77b094..f478a4c96d2f 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -824,7 +824,7 @@ static int message_stats_create(struct mapped_device *md, return 1; id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data, - dm_internal_suspend, dm_internal_resume, md); + dm_internal_suspend_fast, dm_internal_resume_fast, md); if (id < 0) return id; diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index d1600d2aa2e2..f8b37d4c05d8 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -159,8 +159,10 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) sc->stripes_shift = __ffs(stripes); r = dm_set_target_max_io_len(ti, chunk_size); - if (r) + if (r) { + kfree(sc); return r; + } ti->num_flush_bios = stripes; ti->num_discard_bios = stripes; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index b2bd1ebf4562..3afae9e062f8 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1521,18 +1521,32 @@ fmode_t dm_table_get_mode(struct dm_table *t) } EXPORT_SYMBOL(dm_table_get_mode); -static void suspend_targets(struct dm_table *t, unsigned postsuspend) +enum suspend_mode { + PRESUSPEND, + PRESUSPEND_UNDO, + POSTSUSPEND, +}; + +static void suspend_targets(struct dm_table *t, enum suspend_mode mode) { int i = t->num_targets; struct dm_target *ti = t->targets; while (i--) { - if (postsuspend) { + switch (mode) { + case PRESUSPEND: + if (ti->type->presuspend) + ti->type->presuspend(ti); + break; + case PRESUSPEND_UNDO: + if (ti->type->presuspend_undo) + ti->type->presuspend_undo(ti); + break; + case POSTSUSPEND: if (ti->type->postsuspend) ti->type->postsuspend(ti); - } else if (ti->type->presuspend) - ti->type->presuspend(ti); - + break; + } ti++; } } @@ -1542,7 +1556,15 @@ void dm_table_presuspend_targets(struct dm_table *t) if (!t) return; - suspend_targets(t, 0); + suspend_targets(t, PRESUSPEND); +} + +void dm_table_presuspend_undo_targets(struct dm_table *t) +{ + if (!t) + return; + + suspend_targets(t, PRESUSPEND_UNDO); } void dm_table_postsuspend_targets(struct dm_table *t) @@ -1550,7 +1572,7 @@ void dm_table_postsuspend_targets(struct dm_table *t) if (!t) return; - suspend_targets(t, 1); + suspend_targets(t, POSTSUSPEND); } int dm_table_resume_targets(struct dm_table *t) diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index e9d33ad59df5..43adbb863f5a 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1384,42 +1384,38 @@ static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time) } int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, - int can_block, struct dm_thin_lookup_result *result) + int can_issue_io, struct dm_thin_lookup_result *result) { - int r = -EINVAL; - uint64_t block_time = 0; + int r; __le64 value; struct dm_pool_metadata *pmd = td->pmd; dm_block_t keys[2] = { td->id, block }; struct dm_btree_info *info; - if (can_block) { - down_read(&pmd->root_lock); - info = &pmd->info; - } else if (down_read_trylock(&pmd->root_lock)) - info = &pmd->nb_info; - else - return -EWOULDBLOCK; - if (pmd->fail_io) - goto out; + return -EINVAL; - r = dm_btree_lookup(info, pmd->root, keys, &value); - if (!r) - block_time = le64_to_cpu(value); + down_read(&pmd->root_lock); -out: - up_read(&pmd->root_lock); + if (can_issue_io) { + info = &pmd->info; + } else + info = &pmd->nb_info; + r = dm_btree_lookup(info, pmd->root, keys, &value); if (!r) { + uint64_t block_time = 0; dm_block_t exception_block; uint32_t exception_time; + + block_time = le64_to_cpu(value); unpack_block_time(block_time, &exception_block, &exception_time); result->block = exception_block; result->shared = __snapshotted_since(td, exception_time); } + up_read(&pmd->root_lock); return r; } @@ -1813,3 +1809,8 @@ bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd) return needs_check; } + +void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd) +{ + dm_tm_issue_prefetches(pmd->tm); +} diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index e3c857db195a..921d15ee56a0 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -139,12 +139,12 @@ struct dm_thin_lookup_result { /* * Returns: - * -EWOULDBLOCK iff @can_block is set and would block. + * -EWOULDBLOCK iff @can_issue_io is set and would issue IO * -ENODATA iff that mapping is not present. * 0 success */ int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, - int can_block, struct dm_thin_lookup_result *result); + int can_issue_io, struct dm_thin_lookup_result *result); /* * Obtain an unused block. @@ -213,6 +213,11 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd); bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd); +/* + * Issue any prefetches that may be useful. + */ +void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd); + /*----------------------------------------------------------------*/ #endif diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 4843801173fe..8735543eacdb 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -11,11 +11,13 @@ #include <linux/device-mapper.h> #include <linux/dm-io.h> #include <linux/dm-kcopyd.h> +#include <linux/log2.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/sort.h> #include <linux/rbtree.h> #define DM_MSG_PREFIX "thin" @@ -25,7 +27,6 @@ */ #define ENDIO_HOOK_POOL_SIZE 1024 #define MAPPING_POOL_SIZE 1024 -#define PRISON_CELLS 1024 #define COMMIT_PERIOD HZ #define NO_SPACE_TIMEOUT_SECS 60 @@ -114,7 +115,8 @@ static void build_data_key(struct dm_thin_device *td, { key->virtual = 0; key->dev = dm_thin_dev_id(td); - key->block = b; + key->block_begin = b; + key->block_end = b + 1ULL; } static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, @@ -122,7 +124,55 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, { key->virtual = 1; key->dev = dm_thin_dev_id(td); - key->block = b; + key->block_begin = b; + key->block_end = b + 1ULL; +} + +/*----------------------------------------------------------------*/ + +#define THROTTLE_THRESHOLD (1 * HZ) + +struct throttle { + struct rw_semaphore lock; + unsigned long threshold; + bool throttle_applied; +}; + +static void throttle_init(struct throttle *t) +{ + init_rwsem(&t->lock); + t->throttle_applied = false; +} + +static void throttle_work_start(struct throttle *t) +{ + t->threshold = jiffies + THROTTLE_THRESHOLD; +} + +static void throttle_work_update(struct throttle *t) +{ + if (!t->throttle_applied && jiffies > t->threshold) { + down_write(&t->lock); + t->throttle_applied = true; + } +} + +static void throttle_work_complete(struct throttle *t) +{ + if (t->throttle_applied) { + t->throttle_applied = false; + up_write(&t->lock); + } +} + +static void throttle_lock(struct throttle *t) +{ + down_read(&t->lock); +} + +static void throttle_unlock(struct throttle *t) +{ + up_read(&t->lock); } /*----------------------------------------------------------------*/ @@ -155,8 +205,11 @@ struct pool_features { struct thin_c; typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio); +typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell); typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m); +#define CELL_SORT_ARRAY_SIZE 8192 + struct pool { struct list_head list; struct dm_target *ti; /* Only set if a pool target is bound */ @@ -171,11 +224,13 @@ struct pool { struct pool_features pf; bool low_water_triggered:1; /* A dm event has been sent */ + bool suspended:1; struct dm_bio_prison *prison; struct dm_kcopyd_client *copier; struct workqueue_struct *wq; + struct throttle throttle; struct work_struct worker; struct delayed_work waker; struct delayed_work no_space_timeout; @@ -198,8 +253,13 @@ struct pool { process_bio_fn process_bio; process_bio_fn process_discard; + process_cell_fn process_cell; + process_cell_fn process_discard_cell; + process_mapping_fn process_prepared_mapping; process_mapping_fn process_prepared_discard; + + struct dm_bio_prison_cell *cell_sort_array[CELL_SORT_ARRAY_SIZE]; }; static enum pool_mode get_pool_mode(struct pool *pool); @@ -232,8 +292,11 @@ struct thin_c { struct pool *pool; struct dm_thin_device *td; + struct mapped_device *thin_md; + bool requeue_mode:1; spinlock_t lock; + struct list_head deferred_cells; struct bio_list deferred_bio_list; struct bio_list retry_on_resume_list; struct rb_root sort_bio_list; /* sorted list of deferred bios */ @@ -290,6 +353,15 @@ static void cell_release(struct pool *pool, dm_bio_prison_free_cell(pool->prison, cell); } +static void cell_visit_release(struct pool *pool, + void (*fn)(void *, struct dm_bio_prison_cell *), + void *context, + struct dm_bio_prison_cell *cell) +{ + dm_cell_visit_release(pool->prison, fn, context, cell); + dm_bio_prison_free_cell(pool->prison, cell); +} + static void cell_release_no_holder(struct pool *pool, struct dm_bio_prison_cell *cell, struct bio_list *bios) @@ -298,19 +370,6 @@ static void cell_release_no_holder(struct pool *pool, dm_bio_prison_free_cell(pool->prison, cell); } -static void cell_defer_no_holder_no_free(struct thin_c *tc, - struct dm_bio_prison_cell *cell) -{ - struct pool *pool = tc->pool; - unsigned long flags; - - spin_lock_irqsave(&tc->lock, flags); - dm_cell_release_no_holder(pool->prison, cell, &tc->deferred_bio_list); - spin_unlock_irqrestore(&tc->lock, flags); - - wake_worker(pool); -} - static void cell_error_with_code(struct pool *pool, struct dm_bio_prison_cell *cell, int error_code) { @@ -323,6 +382,16 @@ static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell) cell_error_with_code(pool, cell, -EIO); } +static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell) +{ + cell_error_with_code(pool, cell, 0); +} + +static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell) +{ + cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE); +} + /*----------------------------------------------------------------*/ /* @@ -393,44 +462,65 @@ struct dm_thin_endio_hook { struct rb_node rb_node; }; -static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) +static void __merge_bio_list(struct bio_list *bios, struct bio_list *master) +{ + bio_list_merge(bios, master); + bio_list_init(master); +} + +static void error_bio_list(struct bio_list *bios, int error) { struct bio *bio; + + while ((bio = bio_list_pop(bios))) + bio_endio(bio, error); +} + +static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error) +{ struct bio_list bios; unsigned long flags; bio_list_init(&bios); spin_lock_irqsave(&tc->lock, flags); - bio_list_merge(&bios, master); - bio_list_init(master); + __merge_bio_list(&bios, master); spin_unlock_irqrestore(&tc->lock, flags); - while ((bio = bio_list_pop(&bios))) - bio_endio(bio, DM_ENDIO_REQUEUE); + error_bio_list(&bios, error); } -static void requeue_io(struct thin_c *tc) +static void requeue_deferred_cells(struct thin_c *tc) { - requeue_bio_list(tc, &tc->deferred_bio_list); - requeue_bio_list(tc, &tc->retry_on_resume_list); + struct pool *pool = tc->pool; + unsigned long flags; + struct list_head cells; + struct dm_bio_prison_cell *cell, *tmp; + + INIT_LIST_HEAD(&cells); + + spin_lock_irqsave(&tc->lock, flags); + list_splice_init(&tc->deferred_cells, &cells); + spin_unlock_irqrestore(&tc->lock, flags); + + list_for_each_entry_safe(cell, tmp, &cells, user_list) + cell_requeue(pool, cell); } -static void error_thin_retry_list(struct thin_c *tc) +static void requeue_io(struct thin_c *tc) { - struct bio *bio; - unsigned long flags; struct bio_list bios; + unsigned long flags; bio_list_init(&bios); spin_lock_irqsave(&tc->lock, flags); - bio_list_merge(&bios, &tc->retry_on_resume_list); - bio_list_init(&tc->retry_on_resume_list); + __merge_bio_list(&bios, &tc->deferred_bio_list); + __merge_bio_list(&bios, &tc->retry_on_resume_list); spin_unlock_irqrestore(&tc->lock, flags); - while ((bio = bio_list_pop(&bios))) - bio_io_error(bio); + error_bio_list(&bios, DM_ENDIO_REQUEUE); + requeue_deferred_cells(tc); } static void error_retry_list(struct pool *pool) @@ -439,7 +529,7 @@ static void error_retry_list(struct pool *pool) rcu_read_lock(); list_for_each_entry_rcu(tc, &pool->active_thins, list) - error_thin_retry_list(tc); + error_thin_bio_list(tc, &tc->retry_on_resume_list, -EIO); rcu_read_unlock(); } @@ -629,33 +719,75 @@ static void overwrite_endio(struct bio *bio, int err) */ /* - * This sends the bios in the cell back to the deferred_bios list. + * This sends the bios in the cell, except the original holder, back + * to the deferred_bios list. */ -static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell) +static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) { struct pool *pool = tc->pool; unsigned long flags; spin_lock_irqsave(&tc->lock, flags); - cell_release(pool, cell, &tc->deferred_bio_list); + cell_release_no_holder(pool, cell, &tc->deferred_bio_list); spin_unlock_irqrestore(&tc->lock, flags); wake_worker(pool); } -/* - * Same as cell_defer above, except it omits the original holder of the cell. - */ -static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) +static void thin_defer_bio(struct thin_c *tc, struct bio *bio); + +struct remap_info { + struct thin_c *tc; + struct bio_list defer_bios; + struct bio_list issue_bios; +}; + +static void __inc_remap_and_issue_cell(void *context, + struct dm_bio_prison_cell *cell) { - struct pool *pool = tc->pool; - unsigned long flags; + struct remap_info *info = context; + struct bio *bio; - spin_lock_irqsave(&tc->lock, flags); - cell_release_no_holder(pool, cell, &tc->deferred_bio_list); - spin_unlock_irqrestore(&tc->lock, flags); + while ((bio = bio_list_pop(&cell->bios))) { + if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) + bio_list_add(&info->defer_bios, bio); + else { + inc_all_io_entry(info->tc->pool, bio); - wake_worker(pool); + /* + * We can't issue the bios with the bio prison lock + * held, so we add them to a list to issue on + * return from this function. + */ + bio_list_add(&info->issue_bios, bio); + } + } +} + +static void inc_remap_and_issue_cell(struct thin_c *tc, + struct dm_bio_prison_cell *cell, + dm_block_t block) +{ + struct bio *bio; + struct remap_info info; + + info.tc = tc; + bio_list_init(&info.defer_bios); + bio_list_init(&info.issue_bios); + + /* + * We have to be careful to inc any bios we're about to issue + * before the cell is released, and avoid a race with new bios + * being added to the cell. + */ + cell_visit_release(tc->pool, __inc_remap_and_issue_cell, + &info, cell); + + while ((bio = bio_list_pop(&info.defer_bios))) + thin_defer_bio(tc, bio); + + while ((bio = bio_list_pop(&info.issue_bios))) + remap_and_issue(info.tc, bio, block); } static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) @@ -706,10 +838,13 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) * the bios in the cell. */ if (bio) { - cell_defer_no_holder(tc, m->cell); + inc_remap_and_issue_cell(tc, m->cell, m->data_block); bio_endio(bio, 0); - } else - cell_defer(tc, m->cell); + } else { + inc_all_io_entry(tc->pool, m->cell->holder); + remap_and_issue(tc, m->cell->holder, m->data_block); + inc_remap_and_issue_cell(tc, m->cell, m->data_block); + } out: list_del(&m->list); @@ -842,6 +977,20 @@ static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m, } } +static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio, + dm_block_t data_block, + struct dm_thin_new_mapping *m) +{ + struct pool *pool = tc->pool; + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + + h->overwrite_mapping = m; + m->bio = bio; + save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); + inc_all_io_entry(pool, bio); + remap_and_issue(tc, bio, data_block); +} + /* * A partial copy also needs to zero the uncopied region. */ @@ -876,15 +1025,9 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, * If the whole block of data is being overwritten, we can issue the * bio immediately. Otherwise we use kcopyd to clone the data first. */ - if (io_overwrites_block(pool, bio)) { - struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); - - h->overwrite_mapping = m; - m->bio = bio; - save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); - inc_all_io_entry(pool, bio); - remap_and_issue(tc, bio, data_dest); - } else { + if (io_overwrites_block(pool, bio)) + remap_and_issue_overwrite(tc, bio, data_dest, m); + else { struct dm_io_region from, to; from.bdev = origin->bdev; @@ -953,16 +1096,10 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, if (!pool->pf.zero_new_blocks) process_prepared_mapping(m); - else if (io_overwrites_block(pool, bio)) { - struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); - - h->overwrite_mapping = m; - m->bio = bio; - save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); - inc_all_io_entry(pool, bio); - remap_and_issue(tc, bio, data_block); + else if (io_overwrites_block(pool, bio)) + remap_and_issue_overwrite(tc, bio, data_block, m); - } else + else ll_zero(tc, m, data_block * pool->sectors_per_block, (data_block + 1) * pool->sectors_per_block); @@ -1134,29 +1271,25 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c bio_list_init(&bios); cell_release(pool, cell, &bios); - error = should_error_unserviceable_bio(pool); - if (error) - while ((bio = bio_list_pop(&bios))) - bio_endio(bio, error); - else - while ((bio = bio_list_pop(&bios))) - retry_on_resume(bio); + while ((bio = bio_list_pop(&bios))) + retry_on_resume(bio); } -static void process_discard(struct thin_c *tc, struct bio *bio) +static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) { int r; - unsigned long flags; + struct bio *bio = cell->holder; struct pool *pool = tc->pool; - struct dm_bio_prison_cell *cell, *cell2; - struct dm_cell_key key, key2; + struct dm_bio_prison_cell *cell2; + struct dm_cell_key key2; dm_block_t block = get_bio_block(tc, bio); struct dm_thin_lookup_result lookup_result; struct dm_thin_new_mapping *m; - build_virtual_key(tc->td, block, &key); - if (bio_detain(tc->pool, &key, bio, &cell)) + if (tc->requeue_mode) { + cell_requeue(pool, cell); return; + } r = dm_thin_find_block(tc->td, block, 1, &lookup_result); switch (r) { @@ -1187,12 +1320,9 @@ static void process_discard(struct thin_c *tc, struct bio *bio) m->cell2 = cell2; m->bio = bio; - if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) { - spin_lock_irqsave(&pool->lock, flags); - list_add_tail(&m->list, &pool->prepared_discards); - spin_unlock_irqrestore(&pool->lock, flags); - wake_worker(pool); - } + if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) + pool->process_prepared_discard(m); + } else { inc_all_io_entry(pool, bio); cell_defer_no_holder(tc, cell); @@ -1227,6 +1357,19 @@ static void process_discard(struct thin_c *tc, struct bio *bio) } } +static void process_discard_bio(struct thin_c *tc, struct bio *bio) +{ + struct dm_bio_prison_cell *cell; + struct dm_cell_key key; + dm_block_t block = get_bio_block(tc, bio); + + build_virtual_key(tc->td, block, &key); + if (bio_detain(tc->pool, &key, bio, &cell)) + return; + + process_discard_cell(tc, cell); +} + static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, struct dm_cell_key *key, struct dm_thin_lookup_result *lookup_result, @@ -1255,11 +1398,53 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, } } +static void __remap_and_issue_shared_cell(void *context, + struct dm_bio_prison_cell *cell) +{ + struct remap_info *info = context; + struct bio *bio; + + while ((bio = bio_list_pop(&cell->bios))) { + if ((bio_data_dir(bio) == WRITE) || + (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA))) + bio_list_add(&info->defer_bios, bio); + else { + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));; + + h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds); + inc_all_io_entry(info->tc->pool, bio); + bio_list_add(&info->issue_bios, bio); + } + } +} + +static void remap_and_issue_shared_cell(struct thin_c *tc, + struct dm_bio_prison_cell *cell, + dm_block_t block) +{ + struct bio *bio; + struct remap_info info; + + info.tc = tc; + bio_list_init(&info.defer_bios); + bio_list_init(&info.issue_bios); + + cell_visit_release(tc->pool, __remap_and_issue_shared_cell, + &info, cell); + + while ((bio = bio_list_pop(&info.defer_bios))) + thin_defer_bio(tc, bio); + + while ((bio = bio_list_pop(&info.issue_bios))) + remap_and_issue(tc, bio, block); +} + static void process_shared_bio(struct thin_c *tc, struct bio *bio, dm_block_t block, - struct dm_thin_lookup_result *lookup_result) + struct dm_thin_lookup_result *lookup_result, + struct dm_bio_prison_cell *virt_cell) { - struct dm_bio_prison_cell *cell; + struct dm_bio_prison_cell *data_cell; struct pool *pool = tc->pool; struct dm_cell_key key; @@ -1268,19 +1453,23 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, * of being broken so we have nothing further to do here. */ build_data_key(tc->td, lookup_result->block, &key); - if (bio_detain(pool, &key, bio, &cell)) + if (bio_detain(pool, &key, bio, &data_cell)) { + cell_defer_no_holder(tc, virt_cell); return; + } - if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) - break_sharing(tc, bio, block, &key, lookup_result, cell); - else { + if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) { + break_sharing(tc, bio, block, &key, lookup_result, data_cell); + cell_defer_no_holder(tc, virt_cell); + } else { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds); inc_all_io_entry(pool, bio); - cell_defer_no_holder(tc, cell); - remap_and_issue(tc, bio, lookup_result->block); + + remap_and_issue_shared_cell(tc, data_cell, lookup_result->block); + remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block); } } @@ -1333,34 +1522,28 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block } } -static void process_bio(struct thin_c *tc, struct bio *bio) +static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) { int r; struct pool *pool = tc->pool; + struct bio *bio = cell->holder; dm_block_t block = get_bio_block(tc, bio); - struct dm_bio_prison_cell *cell; - struct dm_cell_key key; struct dm_thin_lookup_result lookup_result; - /* - * If cell is already occupied, then the block is already - * being provisioned so we have nothing further to do here. - */ - build_virtual_key(tc->td, block, &key); - if (bio_detain(pool, &key, bio, &cell)) + if (tc->requeue_mode) { + cell_requeue(pool, cell); return; + } r = dm_thin_find_block(tc->td, block, 1, &lookup_result); switch (r) { case 0: - if (lookup_result.shared) { - process_shared_bio(tc, bio, block, &lookup_result); - cell_defer_no_holder(tc, cell); /* FIXME: pass this cell into process_shared? */ - } else { + if (lookup_result.shared) + process_shared_bio(tc, bio, block, &lookup_result, cell); + else { inc_all_io_entry(pool, bio); - cell_defer_no_holder(tc, cell); - remap_and_issue(tc, bio, lookup_result.block); + inc_remap_and_issue_cell(tc, cell, lookup_result.block); } break; @@ -1394,7 +1577,26 @@ static void process_bio(struct thin_c *tc, struct bio *bio) } } -static void process_bio_read_only(struct thin_c *tc, struct bio *bio) +static void process_bio(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + dm_block_t block = get_bio_block(tc, bio); + struct dm_bio_prison_cell *cell; + struct dm_cell_key key; + + /* + * If cell is already occupied, then the block is already + * being provisioned so we have nothing further to do here. + */ + build_virtual_key(tc->td, block, &key); + if (bio_detain(pool, &key, bio, &cell)) + return; + + process_cell(tc, cell); +} + +static void __process_bio_read_only(struct thin_c *tc, struct bio *bio, + struct dm_bio_prison_cell *cell) { int r; int rw = bio_data_dir(bio); @@ -1404,15 +1606,21 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio) r = dm_thin_find_block(tc->td, block, 1, &lookup_result); switch (r) { case 0: - if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) + if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) { handle_unserviceable_bio(tc->pool, bio); - else { + if (cell) + cell_defer_no_holder(tc, cell); + } else { inc_all_io_entry(tc->pool, bio); remap_and_issue(tc, bio, lookup_result.block); + if (cell) + inc_remap_and_issue_cell(tc, cell, lookup_result.block); } break; case -ENODATA: + if (cell) + cell_defer_no_holder(tc, cell); if (rw != READ) { handle_unserviceable_bio(tc->pool, bio); break; @@ -1431,11 +1639,23 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio) default: DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d", __func__, r); + if (cell) + cell_defer_no_holder(tc, cell); bio_io_error(bio); break; } } +static void process_bio_read_only(struct thin_c *tc, struct bio *bio) +{ + __process_bio_read_only(tc, bio, NULL); +} + +static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + __process_bio_read_only(tc, cell->holder, cell); +} + static void process_bio_success(struct thin_c *tc, struct bio *bio) { bio_endio(bio, 0); @@ -1446,6 +1666,16 @@ static void process_bio_fail(struct thin_c *tc, struct bio *bio) bio_io_error(bio); } +static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + cell_success(tc->pool, cell); +} + +static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + cell_error(tc->pool, cell); +} + /* * FIXME: should we also commit due to size of transaction, measured in * metadata blocks? @@ -1527,9 +1757,10 @@ static void process_thin_deferred_bios(struct thin_c *tc) struct bio *bio; struct bio_list bios; struct blk_plug plug; + unsigned count = 0; if (tc->requeue_mode) { - requeue_bio_list(tc, &tc->deferred_bio_list); + error_thin_bio_list(tc, &tc->deferred_bio_list, DM_ENDIO_REQUEUE); return; } @@ -1568,10 +1799,97 @@ static void process_thin_deferred_bios(struct thin_c *tc) pool->process_discard(tc, bio); else pool->process_bio(tc, bio); + + if ((count++ & 127) == 0) { + throttle_work_update(&pool->throttle); + dm_pool_issue_prefetches(pool->pmd); + } } blk_finish_plug(&plug); } +static int cmp_cells(const void *lhs, const void *rhs) +{ + struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs); + struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs); + + BUG_ON(!lhs_cell->holder); + BUG_ON(!rhs_cell->holder); + + if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector) + return -1; + + if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector) + return 1; + + return 0; +} + +static unsigned sort_cells(struct pool *pool, struct list_head *cells) +{ + unsigned count = 0; + struct dm_bio_prison_cell *cell, *tmp; + + list_for_each_entry_safe(cell, tmp, cells, user_list) { + if (count >= CELL_SORT_ARRAY_SIZE) + break; + + pool->cell_sort_array[count++] = cell; + list_del(&cell->user_list); + } + + sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL); + + return count; +} + +static void process_thin_deferred_cells(struct thin_c *tc) +{ + struct pool *pool = tc->pool; + unsigned long flags; + struct list_head cells; + struct dm_bio_prison_cell *cell; + unsigned i, j, count; + + INIT_LIST_HEAD(&cells); + + spin_lock_irqsave(&tc->lock, flags); + list_splice_init(&tc->deferred_cells, &cells); + spin_unlock_irqrestore(&tc->lock, flags); + + if (list_empty(&cells)) + return; + + do { + count = sort_cells(tc->pool, &cells); + + for (i = 0; i < count; i++) { + cell = pool->cell_sort_array[i]; + BUG_ON(!cell->holder); + + /* + * If we've got no free new_mapping structs, and processing + * this bio might require one, we pause until there are some + * prepared mappings to process. + */ + if (ensure_next_mapping(pool)) { + for (j = i; j < count; j++) + list_add(&pool->cell_sort_array[j]->user_list, &cells); + + spin_lock_irqsave(&tc->lock, flags); + list_splice(&cells, &tc->deferred_cells); + spin_unlock_irqrestore(&tc->lock, flags); + return; + } + + if (cell->holder->bi_rw & REQ_DISCARD) + pool->process_discard_cell(tc, cell); + else + pool->process_cell(tc, cell); + } + } while (!list_empty(&cells)); +} + static void thin_get(struct thin_c *tc); static void thin_put(struct thin_c *tc); @@ -1620,6 +1938,7 @@ static void process_deferred_bios(struct pool *pool) tc = get_first_thin(pool); while (tc) { + process_thin_deferred_cells(tc); process_thin_deferred_bios(tc); tc = get_next_thin(pool, tc); } @@ -1653,9 +1972,15 @@ static void do_worker(struct work_struct *ws) { struct pool *pool = container_of(ws, struct pool, worker); + throttle_work_start(&pool->throttle); + dm_pool_issue_prefetches(pool->pmd); + throttle_work_update(&pool->throttle); process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping); + throttle_work_update(&pool->throttle); process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard); + throttle_work_update(&pool->throttle); process_deferred_bios(pool); + throttle_work_complete(&pool->throttle); } /* @@ -1792,6 +2117,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) dm_pool_metadata_read_only(pool->pmd); pool->process_bio = process_bio_fail; pool->process_discard = process_bio_fail; + pool->process_cell = process_cell_fail; + pool->process_discard_cell = process_cell_fail; pool->process_prepared_mapping = process_prepared_mapping_fail; pool->process_prepared_discard = process_prepared_discard_fail; @@ -1804,6 +2131,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) dm_pool_metadata_read_only(pool->pmd); pool->process_bio = process_bio_read_only; pool->process_discard = process_bio_success; + pool->process_cell = process_cell_read_only; + pool->process_discard_cell = process_cell_success; pool->process_prepared_mapping = process_prepared_mapping_fail; pool->process_prepared_discard = process_prepared_discard_passdown; @@ -1822,7 +2151,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) if (old_mode != new_mode) notify_of_pool_mode_change(pool, "out-of-data-space"); pool->process_bio = process_bio_read_only; - pool->process_discard = process_discard; + pool->process_discard = process_discard_bio; + pool->process_cell = process_cell_read_only; + pool->process_discard_cell = process_discard_cell; pool->process_prepared_mapping = process_prepared_mapping; pool->process_prepared_discard = process_prepared_discard_passdown; @@ -1835,7 +2166,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) notify_of_pool_mode_change(pool, "write"); dm_pool_metadata_read_write(pool->pmd); pool->process_bio = process_bio; - pool->process_discard = process_discard; + pool->process_discard = process_discard_bio; + pool->process_cell = process_cell; + pool->process_discard_cell = process_discard_cell; pool->process_prepared_mapping = process_prepared_mapping; pool->process_prepared_discard = process_prepared_discard; break; @@ -1895,6 +2228,29 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio) wake_worker(pool); } +static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + + throttle_lock(&pool->throttle); + thin_defer_bio(tc, bio); + throttle_unlock(&pool->throttle); +} + +static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) +{ + unsigned long flags; + struct pool *pool = tc->pool; + + throttle_lock(&pool->throttle); + spin_lock_irqsave(&tc->lock, flags); + list_add_tail(&cell->user_list, &tc->deferred_cells); + spin_unlock_irqrestore(&tc->lock, flags); + throttle_unlock(&pool->throttle); + + wake_worker(pool); +} + static void thin_hook_bio(struct thin_c *tc, struct bio *bio) { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); @@ -1915,8 +2271,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) dm_block_t block = get_bio_block(tc, bio); struct dm_thin_device *td = tc->td; struct dm_thin_lookup_result result; - struct dm_bio_prison_cell cell1, cell2; - struct dm_bio_prison_cell *cell_result; + struct dm_bio_prison_cell *virt_cell, *data_cell; struct dm_cell_key key; thin_hook_bio(tc, bio); @@ -1932,10 +2287,18 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) } if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { - thin_defer_bio(tc, bio); + thin_defer_bio_with_throttle(tc, bio); return DM_MAPIO_SUBMITTED; } + /* + * We must hold the virtual cell before doing the lookup, otherwise + * there's a race with discard. + */ + build_virtual_key(tc->td, block, &key); + if (bio_detain(tc->pool, &key, bio, &virt_cell)) + return DM_MAPIO_SUBMITTED; + r = dm_thin_find_block(td, block, 0, &result); /* @@ -1958,23 +2321,19 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * More distant ancestors are irrelevant. The * shared flag will be set in their case. */ - thin_defer_bio(tc, bio); + thin_defer_cell(tc, virt_cell); return DM_MAPIO_SUBMITTED; } - build_virtual_key(tc->td, block, &key); - if (dm_bio_detain(tc->pool->prison, &key, bio, &cell1, &cell_result)) - return DM_MAPIO_SUBMITTED; - build_data_key(tc->td, result.block, &key); - if (dm_bio_detain(tc->pool->prison, &key, bio, &cell2, &cell_result)) { - cell_defer_no_holder_no_free(tc, &cell1); + if (bio_detain(tc->pool, &key, bio, &data_cell)) { + cell_defer_no_holder(tc, virt_cell); return DM_MAPIO_SUBMITTED; } inc_all_io_entry(tc->pool, bio); - cell_defer_no_holder_no_free(tc, &cell2); - cell_defer_no_holder_no_free(tc, &cell1); + cell_defer_no_holder(tc, data_cell); + cell_defer_no_holder(tc, virt_cell); remap(tc, bio, result.block); return DM_MAPIO_REMAPPED; @@ -1986,16 +2345,13 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * of doing so. */ handle_unserviceable_bio(tc->pool, bio); + cell_defer_no_holder(tc, virt_cell); return DM_MAPIO_SUBMITTED; } /* fall through */ case -EWOULDBLOCK: - /* - * In future, the failed dm_thin_find_block above could - * provide the hint to load the metadata into cache. - */ - thin_defer_bio(tc, bio); + thin_defer_cell(tc, virt_cell); return DM_MAPIO_SUBMITTED; default: @@ -2005,6 +2361,7 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) * pool is switched to fail-io mode. */ bio_io_error(bio); + cell_defer_no_holder(tc, virt_cell); return DM_MAPIO_SUBMITTED; } } @@ -2185,7 +2542,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, pool->sectors_per_block_shift = __ffs(block_size); pool->low_water_blocks = 0; pool_features_init(&pool->pf); - pool->prison = dm_bio_prison_create(PRISON_CELLS); + pool->prison = dm_bio_prison_create(); if (!pool->prison) { *error = "Error creating pool's bio prison"; err_p = ERR_PTR(-ENOMEM); @@ -2211,6 +2568,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, goto bad_wq; } + throttle_init(&pool->throttle); INIT_WORK(&pool->worker, do_worker); INIT_DELAYED_WORK(&pool->waker, do_waker); INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout); @@ -2220,6 +2578,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, INIT_LIST_HEAD(&pool->prepared_discards); INIT_LIST_HEAD(&pool->active_thins); pool->low_water_triggered = false; + pool->suspended = true; pool->shared_read_ds = dm_deferred_set_create(); if (!pool->shared_read_ds) { @@ -2756,20 +3115,77 @@ static int pool_preresume(struct dm_target *ti) return 0; } +static void pool_suspend_active_thins(struct pool *pool) +{ + struct thin_c *tc; + + /* Suspend all active thin devices */ + tc = get_first_thin(pool); + while (tc) { + dm_internal_suspend_noflush(tc->thin_md); + tc = get_next_thin(pool, tc); + } +} + +static void pool_resume_active_thins(struct pool *pool) +{ + struct thin_c *tc; + + /* Resume all active thin devices */ + tc = get_first_thin(pool); + while (tc) { + dm_internal_resume(tc->thin_md); + tc = get_next_thin(pool, tc); + } +} + static void pool_resume(struct dm_target *ti) { struct pool_c *pt = ti->private; struct pool *pool = pt->pool; unsigned long flags; + /* + * Must requeue active_thins' bios and then resume + * active_thins _before_ clearing 'suspend' flag. + */ + requeue_bios(pool); + pool_resume_active_thins(pool); + spin_lock_irqsave(&pool->lock, flags); pool->low_water_triggered = false; + pool->suspended = false; spin_unlock_irqrestore(&pool->lock, flags); - requeue_bios(pool); do_waker(&pool->waker.work); } +static void pool_presuspend(struct dm_target *ti) +{ + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + unsigned long flags; + + spin_lock_irqsave(&pool->lock, flags); + pool->suspended = true; + spin_unlock_irqrestore(&pool->lock, flags); + + pool_suspend_active_thins(pool); +} + +static void pool_presuspend_undo(struct dm_target *ti) +{ + struct pool_c *pt = ti->private; + struct pool *pool = pt->pool; + unsigned long flags; + + pool_resume_active_thins(pool); + + spin_lock_irqsave(&pool->lock, flags); + pool->suspended = false; + spin_unlock_irqrestore(&pool->lock, flags); +} + static void pool_postsuspend(struct dm_target *ti) { struct pool_c *pt = ti->private; @@ -2941,7 +3357,6 @@ static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct * create_thin <dev_id> * create_snap <dev_id> <origin_id> * delete <dev_id> - * trim <dev_id> <new_size_in_sectors> * set_transaction_id <current_trans_id> <new_trans_id> * reserve_metadata_snap * release_metadata_snap @@ -3169,15 +3584,35 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; + sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; + + /* + * If max_sectors is smaller than pool->sectors_per_block adjust it + * to the highest possible power-of-2 factor of pool->sectors_per_block. + * This is especially beneficial when the pool's data device is a RAID + * device that has a full stripe width that matches pool->sectors_per_block + * -- because even though partial RAID stripe-sized IOs will be issued to a + * single RAID stripe; when aggregated they will end on a full RAID stripe + * boundary.. which avoids additional partial RAID stripe writes cascading + */ + if (limits->max_sectors < pool->sectors_per_block) { + while (!is_factor(pool->sectors_per_block, limits->max_sectors)) { + if ((limits->max_sectors & (limits->max_sectors - 1)) == 0) + limits->max_sectors--; + limits->max_sectors = rounddown_pow_of_two(limits->max_sectors); + } + } /* * If the system-determined stacked limits are compatible with the * pool's blocksize (io_opt is a factor) do not override them. */ if (io_opt_sectors < pool->sectors_per_block || - do_div(io_opt_sectors, pool->sectors_per_block)) { - blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT); + !is_factor(io_opt_sectors, pool->sectors_per_block)) { + if (is_factor(pool->sectors_per_block, limits->max_sectors)) + blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT); + else + blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT); blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); } @@ -3206,11 +3641,13 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 13, 0}, + .version = {1, 14, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, .map = pool_map, + .presuspend = pool_presuspend, + .presuspend_undo = pool_presuspend_undo, .postsuspend = pool_postsuspend, .preresume = pool_preresume, .resume = pool_resume, @@ -3240,14 +3677,14 @@ static void thin_dtr(struct dm_target *ti) struct thin_c *tc = ti->private; unsigned long flags; - thin_put(tc); - wait_for_completion(&tc->can_destroy); - spin_lock_irqsave(&tc->pool->lock, flags); list_del_rcu(&tc->list); spin_unlock_irqrestore(&tc->pool->lock, flags); synchronize_rcu(); + thin_put(tc); + wait_for_completion(&tc->can_destroy); + mutex_lock(&dm_thin_pool_table.mutex); __pool_dec(tc->pool); @@ -3294,7 +3731,9 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) r = -ENOMEM; goto out_unlock; } + tc->thin_md = dm_table_get_md(ti->table); spin_lock_init(&tc->lock); + INIT_LIST_HEAD(&tc->deferred_cells); bio_list_init(&tc->deferred_bio_list); bio_list_init(&tc->retry_on_resume_list); tc->sort_bio_list = RB_ROOT; @@ -3339,18 +3778,18 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) if (get_pool_mode(tc->pool) == PM_FAIL) { ti->error = "Couldn't open thin device, Pool is in fail mode"; r = -EINVAL; - goto bad_thin_open; + goto bad_pool; } r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); if (r) { ti->error = "Couldn't open thin internal device"; - goto bad_thin_open; + goto bad_pool; } r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); if (r) - goto bad_target_max_io_len; + goto bad; ti->num_flush_bios = 1; ti->flush_supported = true; @@ -3365,14 +3804,16 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->split_discard_bios = true; } - dm_put(pool_md); - mutex_unlock(&dm_thin_pool_table.mutex); - atomic_set(&tc->refcount, 1); - init_completion(&tc->can_destroy); - spin_lock_irqsave(&tc->pool->lock, flags); + if (tc->pool->suspended) { + spin_unlock_irqrestore(&tc->pool->lock, flags); + mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */ + ti->error = "Unable to activate thin device while pool is suspended"; + r = -EINVAL; + goto bad; + } list_add_tail_rcu(&tc->list, &tc->pool->active_thins); spin_unlock_irqrestore(&tc->pool->lock, flags); /* @@ -3383,11 +3824,16 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) */ synchronize_rcu(); + dm_put(pool_md); + + atomic_set(&tc->refcount, 1); + init_completion(&tc->can_destroy); + return 0; -bad_target_max_io_len: +bad: dm_pool_close_thin_device(tc->td); -bad_thin_open: +bad_pool: __pool_dec(tc->pool); bad_pool_lookup: dm_put(pool_md); @@ -3533,6 +3979,21 @@ err: DMEMIT("Error"); } +static int thin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct thin_c *tc = ti->private; + struct request_queue *q = bdev_get_queue(tc->pool_dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = tc->pool_dev->bdev; + bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector); + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + static int thin_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { @@ -3557,7 +4018,7 @@ static int thin_iterate_devices(struct dm_target *ti, static struct target_type thin_target = { .name = "thin", - .version = {1, 13, 0}, + .version = {1, 14, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, @@ -3567,6 +4028,7 @@ static struct target_type thin_target = { .presuspend = thin_presuspend, .postsuspend = thin_postsuspend, .status = thin_status, + .merge = thin_merge, .iterate_devices = thin_iterate_devices, }; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 58f3927fd7cc..8f37ed215b19 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -19,6 +19,7 @@ #include <linux/idr.h> #include <linux/hdreg.h> #include <linux/delay.h> +#include <linux/wait.h> #include <trace/events/block.h> @@ -117,6 +118,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); #define DMF_NOFLUSH_SUSPENDING 5 #define DMF_MERGE_IS_OPTIONAL 6 #define DMF_DEFERRED_REMOVE 7 +#define DMF_SUSPENDED_INTERNALLY 8 /* * A dummy definition to make RCU happy. @@ -140,7 +142,7 @@ struct mapped_device { * Use dm_get_live_table{_fast} or take suspend_lock for * dereference. */ - struct dm_table *map; + struct dm_table __rcu *map; struct list_head table_devices; struct mutex table_devices_lock; @@ -525,14 +527,15 @@ retry: goto out; tgt = dm_table_get_target(map, 0); + if (!tgt->type->ioctl) + goto out; if (dm_suspended_md(md)) { r = -EAGAIN; goto out; } - if (tgt->type->ioctl) - r = tgt->type->ioctl(tgt, cmd, arg); + r = tgt->type->ioctl(tgt, cmd, arg); out: dm_put_live_table(md, srcu_idx); @@ -1607,9 +1610,9 @@ static int dm_merge_bvec(struct request_queue *q, * Find maximum amount of I/O that won't need splitting */ max_sectors = min(max_io_len(bvm->bi_sector, ti), - (sector_t) BIO_MAX_SECTORS); + (sector_t) queue_max_sectors(q)); max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; - if (max_size < 0) + if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */ max_size = 0; /* @@ -1621,10 +1624,10 @@ static int dm_merge_bvec(struct request_queue *q, max_size = ti->type->merge(ti, bvm, biovec, max_size); /* * If the target doesn't support merge method and some of the devices - * provided their merge_bvec method (we know this by looking at - * queue_max_hw_sectors), then we can't allow bios with multiple vector - * entries. So always set max_size to 0, and the code below allows - * just one page. + * provided their merge_bvec method (we know this by looking for the + * max_hw_sectors that dm_set_device_limits may set), then we can't + * allow bios with multiple vector entries. So always set max_size + * to 0, and the code below allows just one page. */ else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) max_size = 0; @@ -2332,7 +2335,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, merge_is_optional = dm_table_merge_is_optional(t); - old_map = md->map; + old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); rcu_assign_pointer(md->map, t); md->immutable_target_type = dm_table_get_immutable_target_type(t); @@ -2341,7 +2344,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); else clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); - dm_sync_table(md); + if (old_map) + dm_sync_table(md); return old_map; } @@ -2351,7 +2355,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, */ static struct dm_table *__unbind(struct mapped_device *md) { - struct dm_table *map = md->map; + struct dm_table *map = rcu_dereference_protected(md->map, 1); if (!map) return NULL; @@ -2716,36 +2720,18 @@ static void unlock_fs(struct mapped_device *md) } /* - * We need to be able to change a mapping table under a mounted - * filesystem. For example we might want to move some data in - * the background. Before the table can be swapped with - * dm_bind_table, dm_suspend must be called to flush any in - * flight bios and ensure that any further io gets deferred. - */ -/* - * Suspend mechanism in request-based dm. + * If __dm_suspend returns 0, the device is completely quiescent + * now. There is no request-processing activity. All new requests + * are being added to md->deferred list. * - * 1. Flush all I/Os by lock_fs() if needed. - * 2. Stop dispatching any I/O by stopping the request_queue. - * 3. Wait for all in-flight I/Os to be completed or requeued. - * - * To abort suspend, start the request_queue. + * Caller must hold md->suspend_lock */ -int dm_suspend(struct mapped_device *md, unsigned suspend_flags) +static int __dm_suspend(struct mapped_device *md, struct dm_table *map, + unsigned suspend_flags, int interruptible) { - struct dm_table *map = NULL; - int r = 0; - int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; - int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; - - mutex_lock(&md->suspend_lock); - - if (dm_suspended_md(md)) { - r = -EINVAL; - goto out_unlock; - } - - map = md->map; + bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; + bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; + int r; /* * DMF_NOFLUSH_SUSPENDING must be set before presuspend. @@ -2754,7 +2740,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) if (noflush) set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - /* This does not get reverted if there's an error later. */ + /* + * This gets reverted if there's an error later and the targets + * provide the .presuspend_undo hook. + */ dm_table_presuspend_targets(map); /* @@ -2765,8 +2754,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) */ if (!noflush && do_lockfs) { r = lock_fs(md); - if (r) - goto out_unlock; + if (r) { + dm_table_presuspend_undo_targets(map); + return r; + } } /* @@ -2782,7 +2773,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) * flush_workqueue(md->wq). */ set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); - synchronize_srcu(&md->io_barrier); + if (map) + synchronize_srcu(&md->io_barrier); /* * Stop md->queue before flushing md->wq in case request-based @@ -2798,11 +2790,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) * We call dm_wait_for_completion to wait for all existing requests * to finish. */ - r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); + r = dm_wait_for_completion(md, interruptible); if (noflush) clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - synchronize_srcu(&md->io_barrier); + if (map) + synchronize_srcu(&md->io_barrier); /* were we interrupted ? */ if (r < 0) { @@ -2812,14 +2805,56 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) start_queue(md->queue); unlock_fs(md); - goto out_unlock; /* pushback list is already flushed, so skip flush */ + dm_table_presuspend_undo_targets(map); + /* pushback list is already flushed, so skip flush */ } - /* - * If dm_wait_for_completion returned 0, the device is completely - * quiescent now. There is no request-processing activity. All new - * requests are being added to md->deferred list. - */ + return r; +} + +/* + * We need to be able to change a mapping table under a mounted + * filesystem. For example we might want to move some data in + * the background. Before the table can be swapped with + * dm_bind_table, dm_suspend must be called to flush any in + * flight bios and ensure that any further io gets deferred. + */ +/* + * Suspend mechanism in request-based dm. + * + * 1. Flush all I/Os by lock_fs() if needed. + * 2. Stop dispatching any I/O by stopping the request_queue. + * 3. Wait for all in-flight I/Os to be completed or requeued. + * + * To abort suspend, start the request_queue. + */ +int dm_suspend(struct mapped_device *md, unsigned suspend_flags) +{ + struct dm_table *map = NULL; + int r = 0; + +retry: + mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); + + if (dm_suspended_md(md)) { + r = -EINVAL; + goto out_unlock; + } + + if (dm_suspended_internally_md(md)) { + /* already internally suspended, wait for internal resume */ + mutex_unlock(&md->suspend_lock); + r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); + if (r) + return r; + goto retry; + } + + map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); + + r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE); + if (r) + goto out_unlock; set_bit(DMF_SUSPENDED, &md->flags); @@ -2830,22 +2865,13 @@ out_unlock: return r; } -int dm_resume(struct mapped_device *md) +static int __dm_resume(struct mapped_device *md, struct dm_table *map) { - int r = -EINVAL; - struct dm_table *map = NULL; - - mutex_lock(&md->suspend_lock); - if (!dm_suspended_md(md)) - goto out; - - map = md->map; - if (!map || !dm_table_get_size(map)) - goto out; - - r = dm_table_resume_targets(map); - if (r) - goto out; + if (map) { + int r = dm_table_resume_targets(map); + if (r) + return r; + } dm_queue_flush(md); @@ -2859,6 +2885,37 @@ int dm_resume(struct mapped_device *md) unlock_fs(md); + return 0; +} + +int dm_resume(struct mapped_device *md) +{ + int r = -EINVAL; + struct dm_table *map = NULL; + +retry: + mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); + + if (!dm_suspended_md(md)) + goto out; + + if (dm_suspended_internally_md(md)) { + /* already internally suspended, wait for internal resume */ + mutex_unlock(&md->suspend_lock); + r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); + if (r) + return r; + goto retry; + } + + map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); + if (!map || !dm_table_get_size(map)) + goto out; + + r = __dm_resume(md, map); + if (r) + goto out; + clear_bit(DMF_SUSPENDED, &md->flags); r = 0; @@ -2872,15 +2929,80 @@ out: * Internal suspend/resume works like userspace-driven suspend. It waits * until all bios finish and prevents issuing new bios to the target drivers. * It may be used only from the kernel. - * - * Internal suspend holds md->suspend_lock, which prevents interaction with - * userspace-driven suspend. */ -void dm_internal_suspend(struct mapped_device *md) +static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags) { - mutex_lock(&md->suspend_lock); + struct dm_table *map = NULL; + + if (dm_suspended_internally_md(md)) + return; /* nested internal suspend */ + + if (dm_suspended_md(md)) { + set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); + return; /* nest suspend */ + } + + map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); + + /* + * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is + * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend + * would require changing .presuspend to return an error -- avoid this + * until there is a need for more elaborate variants of internal suspend. + */ + (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE); + + set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); + + dm_table_postsuspend_targets(map); +} + +static void __dm_internal_resume(struct mapped_device *md) +{ + if (!dm_suspended_internally_md(md)) + return; /* resume from nested internal suspend */ + if (dm_suspended_md(md)) + goto done; /* resume from nested suspend */ + + /* + * NOTE: existing callers don't need to call dm_table_resume_targets + * (which may fail -- so best to avoid it for now by passing NULL map) + */ + (void) __dm_resume(md, NULL); + +done: + clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); + smp_mb__after_atomic(); + wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY); +} + +void dm_internal_suspend_noflush(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG); + mutex_unlock(&md->suspend_lock); +} +EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush); + +void dm_internal_resume(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + __dm_internal_resume(md); + mutex_unlock(&md->suspend_lock); +} +EXPORT_SYMBOL_GPL(dm_internal_resume); + +/* + * Fast variants of internal suspend/resume hold md->suspend_lock, + * which prevents interaction with userspace-driven suspend. + */ + +void dm_internal_suspend_fast(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + if (dm_suspended_md(md) || dm_suspended_internally_md(md)) return; set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); @@ -2889,9 +3011,9 @@ void dm_internal_suspend(struct mapped_device *md) dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); } -void dm_internal_resume(struct mapped_device *md) +void dm_internal_resume_fast(struct mapped_device *md) { - if (dm_suspended_md(md)) + if (dm_suspended_md(md) || dm_suspended_internally_md(md)) goto done; dm_queue_flush(md); @@ -2977,6 +3099,11 @@ int dm_suspended_md(struct mapped_device *md) return test_bit(DMF_SUSPENDED, &md->flags); } +int dm_suspended_internally_md(struct mapped_device *md) +{ + return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); +} + int dm_test_deferred_remove_flag(struct mapped_device *md) { return test_bit(DMF_DEFERRED_REMOVE, &md->flags); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 988c7fb7b145..84b0f9e4ba6c 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -65,6 +65,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits); struct list_head *dm_table_get_devices(struct dm_table *t); void dm_table_presuspend_targets(struct dm_table *t); +void dm_table_presuspend_undo_targets(struct dm_table *t); void dm_table_postsuspend_targets(struct dm_table *t); int dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); @@ -129,6 +130,15 @@ int dm_deleting_md(struct mapped_device *md); int dm_suspended_md(struct mapped_device *md); /* + * Internal suspend and resume methods. + */ +int dm_suspended_internally_md(struct mapped_device *md); +void dm_internal_suspend_fast(struct mapped_device *md); +void dm_internal_resume_fast(struct mapped_device *md); +void dm_internal_suspend_noflush(struct mapped_device *md); +void dm_internal_resume(struct mapped_device *md); + +/* * Test if the device is scheduled for deferred remove. */ int dm_test_deferred_remove_flag(struct mapped_device *md); diff --git a/drivers/md/md.c b/drivers/md/md.c index 4dfa15da9cb8..9233c71138f1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5121,6 +5121,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) printk("md: %s still in use.\n",mdname(mddev)); if (did_freeze) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); } err = -EBUSY; @@ -5135,6 +5136,8 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) mddev->ro = 1; set_disk_ro(mddev->gendisk, 1); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); sysfs_notify_dirent_safe(mddev->sysfs_state); err = 0; } @@ -5178,6 +5181,7 @@ static int do_md_stop(struct mddev *mddev, int mode, mutex_unlock(&mddev->open_mutex); if (did_freeze) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); } return -EBUSY; diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 1d75b1dc1e2e..e64b61ad0ef3 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -645,8 +645,10 @@ static int array_resize(struct dm_array_info *info, dm_block_t root, int r; struct resize resize; - if (old_size == new_size) + if (old_size == new_size) { + *new_root = root; return 0; + } resize.info = info; resize.root = root; diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index 37d367bb9aa8..bf2b80d5c470 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -42,6 +42,12 @@ struct btree_node { } __packed; +/* + * Locks a block using the btree node validator. + */ +int bn_read_lock(struct dm_btree_info *info, dm_block_t b, + struct dm_block **result); + void inc_children(struct dm_transaction_manager *tm, struct btree_node *n, struct dm_btree_value_type *vt); diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c index cf9fd676ae44..1b5e13ec7f96 100644 --- a/drivers/md/persistent-data/dm-btree-spine.c +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -92,7 +92,7 @@ struct dm_block_validator btree_node_validator = { /*----------------------------------------------------------------*/ -static int bn_read_lock(struct dm_btree_info *info, dm_block_t b, +int bn_read_lock(struct dm_btree_info *info, dm_block_t b, struct dm_block **result) { return dm_tm_read_lock(info->tm, b, &btree_node_validator, result); diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 416060c25709..200ac12a1d40 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -847,22 +847,26 @@ EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key); * FIXME: We shouldn't use a recursive algorithm when we have limited stack * space. Also this only works for single level trees. */ -static int walk_node(struct ro_spine *s, dm_block_t block, +static int walk_node(struct dm_btree_info *info, dm_block_t block, int (*fn)(void *context, uint64_t *keys, void *leaf), void *context) { int r; unsigned i, nr; + struct dm_block *node; struct btree_node *n; uint64_t keys; - r = ro_step(s, block); - n = ro_node(s); + r = bn_read_lock(info, block, &node); + if (r) + return r; + + n = dm_block_data(node); nr = le32_to_cpu(n->header.nr_entries); for (i = 0; i < nr; i++) { if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) { - r = walk_node(s, value64(n, i), fn, context); + r = walk_node(info, value64(n, i), fn, context); if (r) goto out; } else { @@ -874,7 +878,7 @@ static int walk_node(struct ro_spine *s, dm_block_t block, } out: - ro_pop(s); + dm_tm_unlock(info->tm, node); return r; } @@ -882,15 +886,7 @@ int dm_btree_walk(struct dm_btree_info *info, dm_block_t root, int (*fn)(void *context, uint64_t *keys, void *leaf), void *context) { - int r; - struct ro_spine spine; - BUG_ON(info->levels > 1); - - init_ro_spine(&spine, info); - r = walk_node(&spine, root, fn, context); - exit_ro_spine(&spine); - - return r; + return walk_node(info, root, fn, context); } EXPORT_SYMBOL_GPL(dm_btree_walk); diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 786b689bdfc7..e8a904298887 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -564,7 +564,9 @@ static int sm_bootstrap_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count { struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - return smm->ll.nr_blocks; + *count = smm->ll.nr_blocks; + + return 0; } static int sm_bootstrap_get_nr_free(struct dm_space_map *sm, dm_block_t *count) @@ -581,7 +583,9 @@ static int sm_bootstrap_get_count(struct dm_space_map *sm, dm_block_t b, { struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - return b < smm->begin ? 1 : 0; + *result = (b < smm->begin) ? 1 : 0; + + return 0; } static int sm_bootstrap_count_is_more_than_one(struct dm_space_map *sm, diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index 3bc30a0ae3d6..9cb797d800cf 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -10,6 +10,8 @@ #include "dm-persistent-data-internal.h" #include <linux/export.h> +#include <linux/mutex.h> +#include <linux/hash.h> #include <linux/slab.h> #include <linux/device-mapper.h> @@ -17,6 +19,61 @@ /*----------------------------------------------------------------*/ +#define PREFETCH_SIZE 128 +#define PREFETCH_BITS 7 +#define PREFETCH_SENTINEL ((dm_block_t) -1ULL) + +struct prefetch_set { + struct mutex lock; + dm_block_t blocks[PREFETCH_SIZE]; +}; + +static unsigned prefetch_hash(dm_block_t b) +{ + return hash_64(b, PREFETCH_BITS); +} + +static void prefetch_wipe(struct prefetch_set *p) +{ + unsigned i; + for (i = 0; i < PREFETCH_SIZE; i++) + p->blocks[i] = PREFETCH_SENTINEL; +} + +static void prefetch_init(struct prefetch_set *p) +{ + mutex_init(&p->lock); + prefetch_wipe(p); +} + +static void prefetch_add(struct prefetch_set *p, dm_block_t b) +{ + unsigned h = prefetch_hash(b); + + mutex_lock(&p->lock); + if (p->blocks[h] == PREFETCH_SENTINEL) + p->blocks[h] = b; + + mutex_unlock(&p->lock); +} + +static void prefetch_issue(struct prefetch_set *p, struct dm_block_manager *bm) +{ + unsigned i; + + mutex_lock(&p->lock); + + for (i = 0; i < PREFETCH_SIZE; i++) + if (p->blocks[i] != PREFETCH_SENTINEL) { + dm_bm_prefetch(bm, p->blocks[i]); + p->blocks[i] = PREFETCH_SENTINEL; + } + + mutex_unlock(&p->lock); +} + +/*----------------------------------------------------------------*/ + struct shadow_info { struct hlist_node hlist; dm_block_t where; @@ -37,6 +94,8 @@ struct dm_transaction_manager { spinlock_t lock; struct hlist_head buckets[DM_HASH_SIZE]; + + struct prefetch_set prefetches; }; /*----------------------------------------------------------------*/ @@ -117,6 +176,8 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm, for (i = 0; i < DM_HASH_SIZE; i++) INIT_HLIST_HEAD(tm->buckets + i); + prefetch_init(&tm->prefetches); + return tm; } @@ -268,8 +329,14 @@ int dm_tm_read_lock(struct dm_transaction_manager *tm, dm_block_t b, struct dm_block_validator *v, struct dm_block **blk) { - if (tm->is_clone) - return dm_bm_read_try_lock(tm->real->bm, b, v, blk); + if (tm->is_clone) { + int r = dm_bm_read_try_lock(tm->real->bm, b, v, blk); + + if (r == -EWOULDBLOCK) + prefetch_add(&tm->real->prefetches, b); + + return r; + } return dm_bm_read_lock(tm->bm, b, v, blk); } @@ -317,6 +384,12 @@ struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm) return tm->bm; } +void dm_tm_issue_prefetches(struct dm_transaction_manager *tm) +{ + prefetch_issue(&tm->prefetches, tm->bm); +} +EXPORT_SYMBOL_GPL(dm_tm_issue_prefetches); + /*----------------------------------------------------------------*/ static int dm_tm_create_internal(struct dm_block_manager *bm, diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h index 2772ed2a781a..2e0d4d66fb1b 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.h +++ b/drivers/md/persistent-data/dm-transaction-manager.h @@ -109,6 +109,13 @@ int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm); /* + * If you're using a non-blocking clone the tm will build up a list of + * requested blocks that weren't in core. This call will request those + * blocks to be prefetched. + */ +void dm_tm_issue_prefetches(struct dm_transaction_manager *tm); + +/* * A little utility that ties the knot by producing a transaction manager * that has a space map managed by the transaction manager... * |