From a195db2d29a47c2c3a61386009bd400df18c86cf Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 6 Oct 2014 16:30:06 -0400 Subject: dm bio prison: switch to using a red black tree Previously it was using a fixed sized hash table. There are times when very many concurrent cells are held (such as when processing a very large discard). When this happens the hash table performance becomes very poor. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-bio-prison.c | 172 +++++++++++++++++++-------------------------- 1 file changed, 73 insertions(+), 99 deletions(-) (limited to 'drivers/md/dm-bio-prison.c') diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c index f752d12081ff..90a56625245a 100644 --- a/drivers/md/dm-bio-prison.c +++ b/drivers/md/dm-bio-prison.c @@ -14,68 +14,38 @@ /*----------------------------------------------------------------*/ -struct bucket { - spinlock_t lock; - struct hlist_head cells; -}; +#define MIN_CELLS 1024 struct dm_bio_prison { + spinlock_t lock; mempool_t *cell_pool; - - unsigned nr_buckets; - unsigned hash_mask; - struct bucket *buckets; + struct rb_root cells; }; -/*----------------------------------------------------------------*/ - -static uint32_t calc_nr_buckets(unsigned nr_cells) -{ - uint32_t n = 128; - - nr_cells /= 4; - nr_cells = min(nr_cells, 8192u); - - while (n < nr_cells) - n <<= 1; - - return n; -} - static struct kmem_cache *_cell_cache; -static void init_bucket(struct bucket *b) -{ - spin_lock_init(&b->lock); - INIT_HLIST_HEAD(&b->cells); -} +/*----------------------------------------------------------------*/ /* * @nr_cells should be the number of cells you want in use _concurrently_. * Don't confuse it with the number of distinct keys. */ -struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells) +struct dm_bio_prison *dm_bio_prison_create(void) { - unsigned i; - uint32_t nr_buckets = calc_nr_buckets(nr_cells); - size_t len = sizeof(struct dm_bio_prison) + - (sizeof(struct bucket) * nr_buckets); - struct dm_bio_prison *prison = kmalloc(len, GFP_KERNEL); + struct dm_bio_prison *prison = kmalloc(sizeof(*prison), GFP_KERNEL); if (!prison) return NULL; - prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache); + spin_lock_init(&prison->lock); + + prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache); if (!prison->cell_pool) { kfree(prison); return NULL; } - prison->nr_buckets = nr_buckets; - prison->hash_mask = nr_buckets - 1; - prison->buckets = (struct bucket *) (prison + 1); - for (i = 0; i < nr_buckets; i++) - init_bucket(prison->buckets + i); + prison->cells = RB_ROOT; return prison; } @@ -101,68 +71,73 @@ void dm_bio_prison_free_cell(struct dm_bio_prison *prison, } EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell); -static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key) +static void __setup_new_cell(struct dm_cell_key *key, + struct bio *holder, + struct dm_bio_prison_cell *cell) { - const unsigned long BIG_PRIME = 4294967291UL; - uint64_t hash = key->block * BIG_PRIME; - - return (uint32_t) (hash & prison->hash_mask); + memcpy(&cell->key, key, sizeof(cell->key)); + cell->holder = holder; + bio_list_init(&cell->bios); } -static int keys_equal(struct dm_cell_key *lhs, struct dm_cell_key *rhs) +static int cmp_keys(struct dm_cell_key *lhs, + struct dm_cell_key *rhs) { - return (lhs->virtual == rhs->virtual) && - (lhs->dev == rhs->dev) && - (lhs->block == rhs->block); -} + if (lhs->virtual < rhs->virtual) + return -1; -static struct bucket *get_bucket(struct dm_bio_prison *prison, - struct dm_cell_key *key) -{ - return prison->buckets + hash_key(prison, key); -} + if (lhs->virtual > rhs->virtual) + return 1; -static struct dm_bio_prison_cell *__search_bucket(struct bucket *b, - struct dm_cell_key *key) -{ - struct dm_bio_prison_cell *cell; + if (lhs->dev < rhs->dev) + return -1; - hlist_for_each_entry(cell, &b->cells, list) - if (keys_equal(&cell->key, key)) - return cell; + if (lhs->dev > rhs->dev) + return 1; - return NULL; -} + if (lhs->block < rhs->block) + return -1; -static void __setup_new_cell(struct bucket *b, - struct dm_cell_key *key, - struct bio *holder, - struct dm_bio_prison_cell *cell) -{ - memcpy(&cell->key, key, sizeof(cell->key)); - cell->holder = holder; - bio_list_init(&cell->bios); - hlist_add_head(&cell->list, &b->cells); + if (lhs->block > rhs->block) + return 1; + + return 0; } -static int __bio_detain(struct bucket *b, +static int __bio_detain(struct dm_bio_prison *prison, struct dm_cell_key *key, struct bio *inmate, struct dm_bio_prison_cell *cell_prealloc, struct dm_bio_prison_cell **cell_result) { - struct dm_bio_prison_cell *cell; - - cell = __search_bucket(b, key); - if (cell) { - if (inmate) - bio_list_add(&cell->bios, inmate); - *cell_result = cell; - return 1; + int r; + struct rb_node **new = &prison->cells.rb_node, *parent = NULL; + + while (*new) { + struct dm_bio_prison_cell *cell = + container_of(*new, struct dm_bio_prison_cell, node); + + r = cmp_keys(key, &cell->key); + + parent = *new; + if (r < 0) + new = &((*new)->rb_left); + else if (r > 0) + new = &((*new)->rb_right); + else { + if (inmate) + bio_list_add(&cell->bios, inmate); + *cell_result = cell; + return 1; + } } - __setup_new_cell(b, key, inmate, cell_prealloc); + __setup_new_cell(key, inmate, cell_prealloc); *cell_result = cell_prealloc; + + rb_link_node(&cell_prealloc->node, parent, new); + rb_insert_color(&cell_prealloc->node, &prison->cells); + return 0; } @@ -174,11 +149,10 @@ static int bio_detain(struct dm_bio_prison *prison, { int r; unsigned long flags; - struct bucket *b = get_bucket(prison, key); - spin_lock_irqsave(&b->lock, flags); - r = __bio_detain(b, key, inmate, cell_prealloc, cell_result); - spin_unlock_irqrestore(&b->lock, flags); + spin_lock_irqsave(&prison->lock, flags); + r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result); + spin_unlock_irqrestore(&prison->lock, flags); return r; } @@ -205,10 +179,11 @@ EXPORT_SYMBOL_GPL(dm_get_cell); /* * @inmates must have been initialised prior to this call */ -static void __cell_release(struct dm_bio_prison_cell *cell, +static void __cell_release(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, struct bio_list *inmates) { - hlist_del(&cell->list); + rb_erase(&cell->node, &prison->cells); if (inmates) { if (cell->holder) @@ -222,21 +197,21 @@ void dm_cell_release(struct dm_bio_prison *prison, struct bio_list *bios) { unsigned long flags; - struct bucket *b = get_bucket(prison, &cell->key); - spin_lock_irqsave(&b->lock, flags); - __cell_release(cell, bios); - spin_unlock_irqrestore(&b->lock, flags); + spin_lock_irqsave(&prison->lock, flags); + __cell_release(prison, cell, bios); + spin_unlock_irqrestore(&prison->lock, flags); } EXPORT_SYMBOL_GPL(dm_cell_release); /* * Sometimes we don't want the holder, just the additional bios. */ -static void __cell_release_no_holder(struct dm_bio_prison_cell *cell, +static void __cell_release_no_holder(struct dm_bio_prison *prison, + struct dm_bio_prison_cell *cell, struct bio_list *inmates) { - hlist_del(&cell->list); + rb_erase(&cell->node, &prison->cells); bio_list_merge(inmates, &cell->bios); } @@ -245,11 +220,10 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison, struct bio_list *inmates) { unsigned long flags; - struct bucket *b = get_bucket(prison, &cell->key); - spin_lock_irqsave(&b->lock, flags); - __cell_release_no_holder(cell, inmates); - spin_unlock_irqrestore(&b->lock, flags); + spin_lock_irqsave(&prison->lock, flags); + __cell_release_no_holder(prison, cell, inmates); + spin_unlock_irqrestore(&prison->lock, flags); } EXPORT_SYMBOL_GPL(dm_cell_release_no_holder); -- cgit v1.2.3 From 2d759a46b4d65e1392843cf9df7101897af87008 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 10 Oct 2014 15:27:16 +0100 Subject: dm thin: remap the bios in a cell immediately This use of direct submission in process_prepared_mapping() reduces latency for submitting bios in a cell by avoiding adding those bios to the deferred list and waiting for the next iteration of the worker. But this direct submission exposes the potential for a race between releasing a cell and incrementing deferred set. Fix this by introducing dm_cell_visit_release() and refactoring inc_remap_and_issue_cell() accordingly. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-bio-prison.c | 14 ++++++++ drivers/md/dm-bio-prison.h | 8 +++++ drivers/md/dm-thin.c | 90 +++++++++++++++++++++++++++++++--------------- 3 files changed, 83 insertions(+), 29 deletions(-) (limited to 'drivers/md/dm-bio-prison.c') diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c index 90a56625245a..bbe22a5dc06b 100644 --- a/drivers/md/dm-bio-prison.c +++ b/drivers/md/dm-bio-prison.c @@ -241,6 +241,20 @@ void dm_cell_error(struct dm_bio_prison *prison, } EXPORT_SYMBOL_GPL(dm_cell_error); +void dm_cell_visit_release(struct dm_bio_prison *prison, + void (*visit_fn)(void *, struct dm_bio_prison_cell *), + void *context, + struct dm_bio_prison_cell *cell) +{ + unsigned long flags; + + spin_lock_irqsave(&prison->lock, flags); + visit_fn(context, cell); + rb_erase(&cell->node, &prison->cells); + spin_unlock_irqrestore(&prison->lock, flags); +} +EXPORT_SYMBOL_GPL(dm_cell_visit_release); + /*----------------------------------------------------------------*/ #define DEFERRED_SET_SIZE 64 diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h index c0cddb118582..b03988667740 100644 --- a/drivers/md/dm-bio-prison.h +++ b/drivers/md/dm-bio-prison.h @@ -89,6 +89,14 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison, void dm_cell_error(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell, int error); +/* + * Visits the cell and then releases. Guarantees no new inmates are + * inserted between the visit and release. + */ +void dm_cell_visit_release(struct dm_bio_prison *prison, + void (*visit_fn)(void *, struct dm_bio_prison_cell *), + void *context, struct dm_bio_prison_cell *cell); + /*----------------------------------------------------------------*/ /* diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 912d7f4d89d1..5036d4b3f368 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -343,6 +343,15 @@ static void cell_release(struct pool *pool, dm_bio_prison_free_cell(pool->prison, cell); } +static void cell_visit_release(struct pool *pool, + void (*fn)(void *, struct dm_bio_prison_cell *), + void *context, + struct dm_bio_prison_cell *cell) +{ + dm_cell_visit_release(pool->prison, fn, context, cell); + dm_bio_prison_free_cell(pool->prison, cell); +} + static void cell_release_no_holder(struct pool *pool, struct dm_bio_prison_cell *cell, struct bio_list *bios) @@ -697,55 +706,75 @@ static void overwrite_endio(struct bio *bio, int err) */ /* - * This sends the bios in the cell back to the deferred_bios list. + * This sends the bios in the cell, except the original holder, back + * to the deferred_bios list. */ -static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell) +static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) { struct pool *pool = tc->pool; unsigned long flags; spin_lock_irqsave(&tc->lock, flags); - cell_release(pool, cell, &tc->deferred_bio_list); + cell_release_no_holder(pool, cell, &tc->deferred_bio_list); spin_unlock_irqrestore(&tc->lock, flags); wake_worker(pool); } -/* - * Same as cell_defer above, except it omits the original holder of the cell. - */ -static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell) +static void thin_defer_bio(struct thin_c *tc, struct bio *bio); + +struct remap_info { + struct thin_c *tc; + struct bio_list defer_bios; + struct bio_list issue_bios; +}; + +static void __inc_remap_and_issue_cell(void *context, + struct dm_bio_prison_cell *cell) { - struct pool *pool = tc->pool; - unsigned long flags; + struct remap_info *info = context; + struct bio *bio; - spin_lock_irqsave(&tc->lock, flags); - cell_release_no_holder(pool, cell, &tc->deferred_bio_list); - spin_unlock_irqrestore(&tc->lock, flags); + while ((bio = bio_list_pop(&cell->bios))) { + if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) + bio_list_add(&info->defer_bios, bio); + else { + inc_all_io_entry(info->tc->pool, bio); - wake_worker(pool); + /* + * We can't issue the bios with the bio prison lock + * held, so we add them to a list to issue on + * return from this function. + */ + bio_list_add(&info->issue_bios, bio); + } + } } -static void thin_defer_bio(struct thin_c *tc, struct bio *bio); - static void inc_remap_and_issue_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell, dm_block_t block) { struct bio *bio; - struct bio_list bios; + struct remap_info info; - bio_list_init(&bios); - cell_release_no_holder(tc->pool, cell, &bios); + info.tc = tc; + bio_list_init(&info.defer_bios); + bio_list_init(&info.issue_bios); - while ((bio = bio_list_pop(&bios))) { - if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) - thin_defer_bio(tc, bio); - else { - inc_all_io_entry(tc->pool, bio); - remap_and_issue(tc, bio, block); - } - } + /* + * We have to be careful to inc any bios we're about to issue + * before the cell is released, and avoid a race with new bios + * being added to the cell. + */ + cell_visit_release(tc->pool, __inc_remap_and_issue_cell, + &info, cell); + + while ((bio = bio_list_pop(&info.defer_bios))) + thin_defer_bio(tc, bio); + + while ((bio = bio_list_pop(&info.issue_bios))) + remap_and_issue(info.tc, bio, block); } static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) @@ -796,10 +825,13 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) * the bios in the cell. */ if (bio) { - cell_defer_no_holder(tc, m->cell); + inc_remap_and_issue_cell(tc, m->cell, m->data_block); bio_endio(bio, 0); - } else - cell_defer(tc, m->cell); + } else { + inc_all_io_entry(tc->pool, m->cell->holder); + remap_and_issue(tc, m->cell->holder, m->data_block); + inc_remap_and_issue_cell(tc, m->cell, m->data_block); + } out: list_del(&m->list); -- cgit v1.2.3 From 5f274d886598c9fd26d2499bf3f68306f170e9db Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 17 Sep 2014 10:17:39 +0100 Subject: dm bio prison: introduce support for locking ranges of blocks Ranges will be placed in the same cell if they overlap. Range locking is a prerequisite for more efficient multi-block discard support in both the cache and thin-provisioning targets. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-bio-prison.c | 4 ++-- drivers/md/dm-bio-prison.h | 12 ++++++++---- drivers/md/dm-cache-target.c | 3 ++- drivers/md/dm-thin.c | 6 ++++-- 4 files changed, 16 insertions(+), 9 deletions(-) (limited to 'drivers/md/dm-bio-prison.c') diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c index bbe22a5dc06b..be065300e93c 100644 --- a/drivers/md/dm-bio-prison.c +++ b/drivers/md/dm-bio-prison.c @@ -95,10 +95,10 @@ static int cmp_keys(struct dm_cell_key *lhs, if (lhs->dev > rhs->dev) return 1; - if (lhs->block < rhs->block) + if (lhs->block_end <= rhs->block_begin) return -1; - if (lhs->block > rhs->block) + if (lhs->block_begin >= rhs->block_end) return 1; return 0; diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h index b03988667740..74cf01144b1f 100644 --- a/drivers/md/dm-bio-prison.h +++ b/drivers/md/dm-bio-prison.h @@ -23,11 +23,14 @@ */ struct dm_bio_prison; -/* FIXME: this needs to be more abstract */ +/* + * Keys define a range of blocks within either a virtual or physical + * device. + */ struct dm_cell_key { int virtual; dm_thin_id dev; - dm_block_t block; + dm_block_t block_begin, block_end; }; /* @@ -59,7 +62,7 @@ void dm_bio_prison_free_cell(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell); /* - * Creates, or retrieves a cell for the given key. + * Creates, or retrieves a cell that overlaps the given key. * * Returns 1 if pre-existing cell returned, zero if new cell created using * @cell_prealloc. @@ -70,7 +73,8 @@ int dm_get_cell(struct dm_bio_prison *prison, struct dm_bio_prison_cell **cell_result); /* - * An atomic op that combines retrieving a cell, and adding a bio to it. + * An atomic op that combines retrieving or creating a cell, and adding a + * bio to it. * * Returns 1 if the cell was already held, 0 if @inmate is the new holder. */ diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 69de8b43ca12..890e2fff4074 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -436,7 +436,8 @@ static void build_key(dm_oblock_t oblock, struct dm_cell_key *key) { key->virtual = 0; key->dev = 0; - key->block = from_oblock(oblock); + key->block_begin = from_oblock(oblock); + key->block_end = key->block_begin + 1ULL; } /* diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index fb05f6a4bbfd..8c5504c0e894 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -115,7 +115,8 @@ static void build_data_key(struct dm_thin_device *td, { key->virtual = 0; key->dev = dm_thin_dev_id(td); - key->block = b; + key->block_begin = b; + key->block_end = b + 1ULL; } static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, @@ -123,7 +124,8 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, { key->virtual = 1; key->dev = dm_thin_dev_id(td); - key->block = b; + key->block_begin = b; + key->block_end = b + 1ULL; } /*----------------------------------------------------------------*/ -- cgit v1.2.3