From c2a4f3183a1248f615a695fbd8905da55ad11bba Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2013 23:17:31 -0700 Subject: bcache: Fix a writeback performance regression Background writeback works by scanning the btree for dirty data and adding those keys into a fixed size buffer, then for each dirty key in the keybuf writing it to the backing device. When read_dirty() finishes and it's time to scan for more dirty data, we need to wait for the outstanding writeback IO to finish - they still take up slots in the keybuf (so that foreground writes can check for them to avoid races) - without that wait, we'll continually rescan when we'll be able to add at most a key or two to the keybuf, and that takes locks that starves foreground IO. Doh. Signed-off-by: Kent Overstreet Cc: linux-stable # >= v3.10 Signed-off-by: Linus Torvalds --- drivers/md/bcache/bcache.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index b39f6f0b45f2..0f12382aa35d 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -498,7 +498,7 @@ struct cached_dev { */ atomic_t has_dirty; - struct ratelimit writeback_rate; + struct bch_ratelimit writeback_rate; struct delayed_work writeback_rate_update; /* @@ -507,10 +507,9 @@ struct cached_dev { */ sector_t last_read; - /* Number of writeback bios in flight */ - atomic_t in_flight; + /* Limit number of writeback bios in flight */ + struct semaphore in_flight; struct closure_with_timer writeback; - struct closure_waitlist writeback_wait; struct keybuf writeback_keys; -- cgit v1.2.3 From 49b1212dfacfe51f951442563d1617bb06aac575 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:16:09 -0700 Subject: bcache: Use blkdev_issue_discard() The old asynchronous discard code was really a relic from when all the allocation code was asynchronous - now that allocation runs out of a dedicated thread there's no point in keeping around all that complicated machinery. 
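With allocation running out of a process-context kthread, a discard can simply be issued synchronously, dropping the bucket lock around the sleeping call. A minimal sketch of that shape, using the kernel's stock blkdev_issue_discard(); the helper name discard_bucket_sync() and the nr_sectors parameter are illustrative, the remaining names follow the diff below:

/*
 * Illustrative sketch, not the literal patch: issue a synchronous
 * discard for one bucket from the allocator thread.  The call sleeps
 * until the discard completes, so bucket_lock is dropped around it.
 */
static void discard_bucket_sync(struct cache *ca, long bucket,
				sector_t nr_sectors)
{
	mutex_unlock(&ca->set->bucket_lock);
	blkdev_issue_discard(ca->bdev,
			     bucket_to_sector(ca->set, bucket),
			     nr_sectors, GFP_KERNEL, 0);
	mutex_lock(&ca->set->bucket_lock);
}

As in the patch, the return value can be ignored: a failed discard only costs the optimization, not correctness.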
Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 114 +++++---------------------------------------- drivers/md/bcache/bcache.h | 10 ---- drivers/md/bcache/super.c | 4 -- 3 files changed, 11 insertions(+), 117 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index e45f5575fd4d..e033b0203b68 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -63,13 +63,12 @@ #include "bcache.h" #include "btree.h" +#include #include #include #include #include -#define MAX_IN_FLIGHT_DISCARDS 8U - /* Bucket heap / gen */ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) @@ -121,75 +120,6 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) mutex_unlock(&c->bucket_lock); } -/* Discard/TRIM */ - -struct discard { - struct list_head list; - struct work_struct work; - struct cache *ca; - long bucket; - - struct bio bio; - struct bio_vec bv; -}; - -static void discard_finish(struct work_struct *w) -{ - struct discard *d = container_of(w, struct discard, work); - struct cache *ca = d->ca; - char buf[BDEVNAME_SIZE]; - - if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) { - pr_notice("discard error on %s, disabling", - bdevname(ca->bdev, buf)); - d->ca->discard = 0; - } - - mutex_lock(&ca->set->bucket_lock); - - fifo_push(&ca->free, d->bucket); - list_add(&d->list, &ca->discards); - atomic_dec(&ca->discards_in_flight); - - mutex_unlock(&ca->set->bucket_lock); - - closure_wake_up(&ca->set->bucket_wait); - wake_up_process(ca->alloc_thread); - - closure_put(&ca->set->cl); -} - -static void discard_endio(struct bio *bio, int error) -{ - struct discard *d = container_of(bio, struct discard, bio); - schedule_work(&d->work); -} - -static void do_discard(struct cache *ca, long bucket) -{ - struct discard *d = list_first_entry(&ca->discards, - struct discard, list); - - list_del(&d->list); - d->bucket = bucket; - - atomic_inc(&ca->discards_in_flight); - closure_get(&ca->set->cl); - - bio_init(&d->bio); - - d->bio.bi_sector = bucket_to_sector(ca->set, d->bucket); - d->bio.bi_bdev = ca->bdev; - d->bio.bi_rw = REQ_WRITE|REQ_DISCARD; - d->bio.bi_max_vecs = 1; - d->bio.bi_io_vec = d->bio.bi_inline_vecs; - d->bio.bi_size = bucket_bytes(ca); - d->bio.bi_end_io = discard_endio; - bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); - - submit_bio(0, &d->bio); -} - /* Allocation */ static inline bool can_inc_bucket_gen(struct bucket *b) @@ -398,16 +328,18 @@ static int bch_allocator_thread(void *arg) else break; - allocator_wait(ca, (int) fifo_free(&ca->free) > - atomic_read(&ca->discards_in_flight)); - if (ca->discard) { - allocator_wait(ca, !list_empty(&ca->discards)); - do_discard(ca, bucket); - } else { - fifo_push(&ca->free, bucket); - closure_wake_up(&ca->set->bucket_wait); + mutex_unlock(&ca->set->bucket_lock); + blkdev_issue_discard(ca->bdev, + bucket_to_sector(ca->set, bucket), + ca->sb.block_size, GFP_KERNEL, 0); + mutex_lock(&ca->set->bucket_lock); } + + allocator_wait(ca, !fifo_full(&ca->free)); + + fifo_push(&ca->free, bucket); + closure_wake_up(&ca->set->bucket_wait); } /* @@ -556,22 +488,8 @@ int bch_cache_allocator_start(struct cache *ca) return 0; } -void bch_cache_allocator_exit(struct cache *ca) -{ - struct discard *d; - - while (!list_empty(&ca->discards)) { - d = list_first_entry(&ca->discards, struct discard, list); - cancel_work_sync(&d->work); - list_del(&d->list); - kfree(d); - } -} - int bch_cache_allocator_init(struct cache *ca) { - unsigned i; - /* * Reserve: * Prio/gen writes 
first @@ -589,15 +507,5 @@ int bch_cache_allocator_init(struct cache *ca) ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + ca->watermark[WATERMARK_MOVINGGC]; - for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) { - struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL); - if (!d) - return -ENOMEM; - - d->ca = ca; - INIT_WORK(&d->work, discard_finish); - list_add(&d->list, &ca->discards); - } - return 0; } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 0f12382aa35d..bf37474a6888 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -620,15 +620,6 @@ struct cache { bool discard; /* Get rid of? */ - /* - * We preallocate structs for issuing discards to buckets, and keep them - * on this list when they're not in use; do_discard() issues discards - * whenever there's work to do and is called by free_some_buckets() and - * when a discard finishes. - */ - atomic_t discards_in_flight; - struct list_head discards; - struct journal_device journal; /* The rest of this all shows up in sysfs */ @@ -1222,7 +1213,6 @@ int bch_btree_cache_alloc(struct cache_set *); void bch_moving_init_cache_set(struct cache_set *); int bch_cache_allocator_start(struct cache *ca); -void bch_cache_allocator_exit(struct cache *ca); int bch_cache_allocator_init(struct cache *ca); void bch_debug_exit(void); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 547c4c57b052..fd37342a7eb5 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1725,8 +1725,6 @@ void bch_cache_release(struct kobject *kobj) if (ca->set) ca->set->cache[ca->sb.nr_this_dev] = NULL; - bch_cache_allocator_exit(ca); - bio_split_pool_free(&ca->bio_split_hook); free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); @@ -1758,8 +1756,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) __module_get(THIS_MODULE); kobject_init(&ca->kobj, &bch_cache_ktype); - INIT_LIST_HEAD(&ca->discards); - bio_init(&ca->journal.bio); ca->journal.bio.bi_max_vecs = 8; ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; -- cgit v1.2.3 From 77c320eb46e216c17aee5c943949229ccfed6904 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Jul 2013 19:42:51 -0700 Subject: bcache: Add on error panic/unregister setting Works kind of like the ext4 setting, to panic or remount read only on errors. 
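The policy is exposed per cache set as an "errors" sysfs attribute taking the strings "unregister" or "panic", and the error path then branches on the stored enum. A condensed sketch of that decision (cache_set_fatal_error() is an illustrative name; the fields and helpers are the ones added in the diff below):

/*
 * Condensed sketch of the new error policy in bch_cache_set_error():
 * either panic immediately or unregister (shut down) the cache set.
 */
static bool cache_set_fatal_error(struct cache_set *c)
{
	/* If we're already stopping, only a forced panic still acts. */
	if (c->on_error != ON_ERROR_PANIC &&
	    test_bit(CACHE_SET_STOPPING, &c->flags))
		return false;

	if (c->on_error == ON_ERROR_PANIC)
		panic("bcache: panic forced after error\n");

	bch_cache_set_unregister(c);
	return true;
}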
Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 6 ++++++ drivers/md/bcache/journal.c | 7 +++---- drivers/md/bcache/super.c | 6 +++++- drivers/md/bcache/sysfs.c | 21 +++++++++++++++++++++ 4 files changed, 35 insertions(+), 5 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index bf37474a6888..578615604be5 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -843,8 +843,14 @@ struct cache_set { atomic_long_t cache_read_races; atomic_long_t writeback_keys_done; atomic_long_t writeback_keys_failed; + + enum { + ON_ERROR_UNREGISTER, + ON_ERROR_PANIC, + } on_error; unsigned error_limit; unsigned error_decay; + unsigned short journal_delay_ms; unsigned verify:1; unsigned key_merging_disabled:1; diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 7c9e6bf6aaba..9e8775872dba 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -305,10 +305,9 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, list_for_each_entry(i, list, list) { BUG_ON(i->pin && atomic_read(i->pin) != 1); - if (n != i->j.seq) - pr_err( - "journal entries %llu-%llu missing! (replaying %llu-%llu)\n", - n, i->j.seq - 1, start, end); + cache_set_err_on(n != i->j.seq, s, +"bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)", + n, i->j.seq - 1, start, end); for (k = i->j.start; k < end(&i->j); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index fd37342a7eb5..74f2e902d876 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1260,7 +1260,8 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) { va_list args; - if (test_bit(CACHE_SET_STOPPING, &c->flags)) + if (c->on_error != ON_ERROR_PANIC && + test_bit(CACHE_SET_STOPPING, &c->flags)) return false; /* XXX: we can be called from atomic context @@ -1275,6 +1276,9 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) 
printk(", disabling caching\n"); + if (c->on_error == ON_ERROR_PANIC) + panic("panic forced after error\n"); + bch_cache_set_unregister(c); return true; } diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 924dcfdae111..4211d82179fc 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -21,6 +21,12 @@ static const char * const cache_replacement_policies[] = { NULL }; +static const char * const error_actions[] = { + "unregister", + "panic", + NULL +}; + write_attribute(attach); write_attribute(detach); write_attribute(unregister); @@ -90,6 +96,7 @@ rw_attribute(discard); rw_attribute(running); rw_attribute(label); rw_attribute(readahead); +rw_attribute(errors); rw_attribute(io_error_limit); rw_attribute(io_error_halflife); rw_attribute(verify); @@ -492,6 +499,10 @@ lock_root: sysfs_print(writeback_keys_failed, atomic_long_read(&c->writeback_keys_failed)); + if (attr == &sysfs_errors) + return bch_snprint_string_list(buf, PAGE_SIZE, error_actions, + c->on_error); + /* See count_io_errors for why 88 */ sysfs_print(io_error_halflife, c->error_decay * 88); sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); @@ -569,6 +580,15 @@ STORE(__bch_cache_set) sysfs_strtoul(congested_write_threshold_us, c->congested_write_threshold_us); + if (attr == &sysfs_errors) { + ssize_t v = bch_read_string_list(buf, error_actions); + + if (v < 0) + return v; + + c->on_error = v; + } + if (attr == &sysfs_io_error_limit) c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; @@ -620,6 +640,7 @@ static struct attribute *bch_cache_set_files[] = { &sysfs_average_key_size, &sysfs_dirty_data, + &sysfs_errors, &sysfs_io_error_limit, &sysfs_io_error_halflife, &sysfs_congested, -- cgit v1.2.3 From 2d679fc75678551485df62274edaed452becd16d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Aug 2013 02:13:15 -0700 Subject: bcache: Stripe size isn't necessarily a power of two Originally I got this right... except that the divides didn't use do_div(), which broke 32 bit kernels. When I went to fix that, I forgot that the raid stripe size usually isn't a power of two... 
doh Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/super.c | 7 +++---- drivers/md/bcache/sysfs.c | 2 +- drivers/md/bcache/writeback.c | 33 +++++++++++++++++---------------- drivers/md/bcache/writeback.h | 8 +++++--- 5 files changed, 27 insertions(+), 25 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 578615604be5..6e836f22f276 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -437,7 +437,7 @@ struct bcache_device { int flush_done; uint64_t nr_stripes; - unsigned stripe_size_bits; + unsigned stripe_size; atomic_t *stripe_sectors_dirty; unsigned long sectors_dirty_last; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 74f2e902d876..d3169c0652f8 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -761,11 +761,10 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, struct request_queue *q; size_t n; - if (!d->stripe_size_bits) - d->stripe_size_bits = 31; + if (!d->stripe_size) + d->stripe_size = 1 << 31; - d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >> - d->stripe_size_bits; + d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size); if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) return -ENOMEM; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 4211d82179fc..b3a66f17231d 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -157,7 +157,7 @@ SHOW(__bch_cached_dev) sysfs_hprint(dirty_data, bcache_dev_sectors_dirty(&dc->disk) << 9); - sysfs_hprint(stripe_size, (1 << dc->disk.stripe_size_bits) << 9); + sysfs_hprint(stripe_size, dc->disk.stripe_size << 9); var_printf(partial_stripes_expensive, "%u"); var_printf(sequential_merge, "%i"); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index ba3ee48320f2..b842fbfbf1db 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -114,25 +114,25 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k) static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) { - uint64_t stripe; + uint64_t stripe = KEY_START(k); unsigned nr_sectors = KEY_SIZE(k); struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys); - unsigned stripe_size = 1 << dc->disk.stripe_size_bits; if (!KEY_DIRTY(k)) return false; - stripe = KEY_START(k) >> dc->disk.stripe_size_bits; - while (1) { - if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) != - stripe_size) - return false; + do_div(stripe, dc->disk.stripe_size); - if (nr_sectors <= stripe_size) + while (1) { + if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) == + dc->disk.stripe_size) return true; - nr_sectors -= stripe_size; + if (nr_sectors <= dc->disk.stripe_size) + return false; + + nr_sectors -= dc->disk.stripe_size; stripe++; } } @@ -186,11 +186,12 @@ static void refill_dirty(struct closure *cl) for (i = 0; i < dc->disk.nr_stripes; i++) if (atomic_read(dc->disk.stripe_sectors_dirty + i) == - 1 << dc->disk.stripe_size_bits) + dc->disk.stripe_size) goto full_stripes; goto normal_refill; full_stripes: + searched_from_start = false; /* not searching entire btree */ bch_refill_keybuf(dc->disk.c, buf, &end, dirty_full_stripe_pred); } else { @@ -252,19 +253,19 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, uint64_t offset, int nr_sectors) { struct bcache_device *d = c->devices[inode]; - unsigned stripe_size, 
stripe_offset; - uint64_t stripe; + unsigned stripe_offset; + uint64_t stripe = offset; if (!d) return; - stripe_size = 1 << d->stripe_size_bits; - stripe = offset >> d->stripe_size_bits; - stripe_offset = offset & (stripe_size - 1); + do_div(stripe, d->stripe_size); + + stripe_offset = offset & (d->stripe_size - 1); while (nr_sectors) { int s = min_t(unsigned, abs(nr_sectors), - stripe_size - stripe_offset); + d->stripe_size - stripe_offset); if (nr_sectors < 0) s = -s; diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index c91f61bb95b6..34961888b5a9 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -18,16 +18,18 @@ static inline bool bcache_dev_stripe_dirty(struct bcache_device *d, uint64_t offset, unsigned nr_sectors) { - uint64_t stripe = offset >> d->stripe_size_bits; + uint64_t stripe = offset; + + do_div(stripe, d->stripe_size); while (1) { if (atomic_read(d->stripe_sectors_dirty + stripe)) return true; - if (nr_sectors <= 1 << d->stripe_size_bits) + if (nr_sectors <= d->stripe_size) return false; - nr_sectors -= 1 << d->stripe_size_bits; + nr_sectors -= d->stripe_size; stripe++; } } -- cgit v1.2.3 From e7c590eb63509c5d5f48a390d23aa25f4417ac96 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 18:39:16 -0700 Subject: bcache: Convert btree_insert_check_key() to btree_insert_node() This was the main point of all this refactoring - now, btree_insert_check_key() won't fail just because the leaf node happened to be full. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 8 ----- drivers/md/bcache/btree.c | 82 ++++++++++++++++++++++++++------------------- drivers/md/bcache/btree.h | 6 ++-- drivers/md/bcache/request.c | 55 +++++++++++++++--------------- 4 files changed, 79 insertions(+), 72 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 6e836f22f276..10ce0c825fce 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -1091,14 +1091,6 @@ do { \ for (b = (ca)->buckets + (ca)->sb.first_bucket; \ b < (ca)->buckets + (ca)->sb.nbuckets; b++) -static inline void __bkey_put(struct cache_set *c, struct bkey *k) -{ - unsigned i; - - for (i = 0; i < KEY_PTRS(k); i++) - atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); -} - static inline void cached_dev_put(struct cached_dev *dc) { if (atomic_dec_and_test(&dc->count)) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 60d06465d772..08a85327431c 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -118,6 +118,15 @@ void bch_btree_op_init_stack(struct btree_op *op) /* Btree key manipulation */ +void __bkey_put(struct cache_set *c, struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) + atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); +} + static void bkey_put(struct cache_set *c, struct bkey *k, int level) { if ((level && KEY_OFFSET(k)) || !level) @@ -1855,8 +1864,6 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, bool ret = false; unsigned oldsize = bch_count_data(b); - BUG_ON(!insert_lock(op, b)); - while (!bch_keylist_empty(insert_keys)) { struct bset *i = write_block(b); struct bkey *k = insert_keys->bottom; @@ -1898,39 +1905,6 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, return ret; } -bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, - struct bio *bio) -{ - bool ret = false; - uint64_t btree_ptr = 
b->key.ptr[0]; - unsigned long seq = b->seq; - BKEY_PADDED(k) tmp; - - rw_unlock(false, b); - rw_lock(true, b, b->level); - - if (b->key.ptr[0] != btree_ptr || - b->seq != seq + 1 || - should_split(b)) - goto out; - - op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio)); - - SET_KEY_PTRS(&op->replace, 1); - get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); - - SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV); - - bkey_copy(&tmp.k, &op->replace); - - BUG_ON(op->type != BTREE_INSERT); - BUG_ON(!btree_insert_key(b, op, &tmp.k)); - ret = true; -out: - downgrade_write(&b->lock); - return ret; -} - static int btree_split(struct btree *b, struct btree_op *op, struct keylist *insert_keys, struct keylist *parent_keys) @@ -2097,6 +2071,44 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, return ret; } +int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, + struct bkey *check_key) +{ + int ret = -EINTR; + uint64_t btree_ptr = b->key.ptr[0]; + unsigned long seq = b->seq; + struct keylist insert; + bool upgrade = op->lock == -1; + + bch_keylist_init(&insert); + + if (upgrade) { + rw_unlock(false, b); + rw_lock(true, b, b->level); + + if (b->key.ptr[0] != btree_ptr || + b->seq != seq + 1) + goto out; + } + + SET_KEY_PTRS(check_key, 1); + get_random_bytes(&check_key->ptr[0], sizeof(uint64_t)); + + SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV); + + bch_keylist_add(&insert, check_key); + + BUG_ON(op->type != BTREE_INSERT); + + ret = bch_btree_insert_node(b, op, &insert); + + BUG_ON(!ret && !bch_keylist_empty(&insert)); +out: + if (upgrade) + downgrade_write(&b->lock); + return ret; +} + static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op) { if (bch_keylist_empty(&op->keys)) diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 6d2fb7550706..73bd62105e39 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -216,6 +216,8 @@ static inline struct bkey *bch_btree_iter_init(struct btree *b, return __bch_btree_iter_init(b, iter, search, b->sets); } +void __bkey_put(struct cache_set *c, struct bkey *k); + /* Looping macros */ #define for_each_cached_btree(b, c, iter) \ @@ -380,8 +382,8 @@ struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, struct btree_op *); -bool bch_btree_insert_check_key(struct btree *, struct btree_op *, - struct bio *); +int bch_btree_insert_check_key(struct btree *, struct btree_op *, + struct bkey *); int bch_btree_insert(struct btree_op *, struct cache_set *); int bch_btree_search_recurse(struct btree *, struct btree_op *); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 2a7f0dd6abab..9ed334ca484d 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -869,35 +869,39 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, struct bio *bio, unsigned sectors) { int ret = 0; - unsigned reada; + unsigned reada = 0; struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); struct bio *miss; - miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); - if (miss == bio) - s->op.lookup_done = true; + if (s->cache_miss || s->op.skip) { + miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); + if (miss == bio) + s->op.lookup_done = true; + goto out_submit; + } - miss->bi_end_io = request_endio; - miss->bi_private = &s->cl; + if (!(bio->bi_rw & REQ_RAHEAD) && + !(bio->bi_rw & REQ_META) && + 
s->op.c->gc_stats.in_use < CUTOFF_CACHE_READA) + reada = min_t(sector_t, dc->readahead >> 9, + bdev_sectors(bio->bi_bdev) - bio_end_sector(bio)); - if (s->cache_miss || s->op.skip) - goto out_submit; + s->cache_bio_sectors = min(sectors, bio_sectors(bio) + reada); - if (miss != bio || - (bio->bi_rw & REQ_RAHEAD) || - (bio->bi_rw & REQ_META) || - s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA) - reada = 0; - else { - reada = min(dc->readahead >> 9, - sectors - bio_sectors(miss)); - - if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev)) - reada = bdev_sectors(miss->bi_bdev) - - bio_end_sector(miss); - } + s->op.replace = KEY(s->op.inode, bio->bi_sector + + s->cache_bio_sectors, s->cache_bio_sectors); + + ret = bch_btree_insert_check_key(b, &s->op, &s->op.replace); + if (ret) + return ret; + + miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); + if (miss == bio) + s->op.lookup_done = true; + else + /* btree_search_recurse()'s btree iterator is no good anymore */ + ret = -EINTR; - s->cache_bio_sectors = bio_sectors(miss) + reada; s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT, DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), dc->disk.bio_split); @@ -912,11 +916,6 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, s->op.cache_bio->bi_end_io = request_endio; s->op.cache_bio->bi_private = &s->cl; - /* btree_search_recurse()'s btree iterator is no good anymore */ - ret = -EINTR; - if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio)) - goto out_put; - bch_bio_map(s->op.cache_bio, NULL); if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) goto out_put; @@ -931,6 +930,8 @@ out_put: bio_put(s->op.cache_bio); s->op.cache_bio = NULL; out_submit: + miss->bi_end_io = request_endio; + miss->bi_private = &s->cl; closure_bio_submit(miss, &s->cl, s->d); return ret; } -- cgit v1.2.3 From e8e1d4682c8cb06dbcb5ef7bb851bf9bcb889c84 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:27:07 -0700 Subject: bcache: Convert try_wait to wait_queue_head_t We never waited on c->try_wait asynchronously, so just use the standard primitives. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 4 +- drivers/md/bcache/btree.c | 150 ++++++++++++++++++--------------------------- drivers/md/bcache/btree.h | 10 ++- drivers/md/bcache/super.c | 10 +-- 4 files changed, 75 insertions(+), 99 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 10ce0c825fce..c1c44191afb1 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -735,8 +735,8 @@ struct cache_set { * basically a lock for this that we can wait on asynchronously. The * btree_root() macro releases the lock when it returns. 
*/ - struct closure *try_harder; - struct closure_waitlist try_wait; + struct task_struct *try_harder; + wait_queue_head_t try_wait; uint64_t try_harder_start; /* diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 731cd8e3fe90..4d50f1e7006e 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -437,7 +437,7 @@ static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) set_btree_node_dirty(b); - if (op && op->journal) { + if (op->journal) { if (w->journal && journal_pin_cmp(b->c, w, op)) { atomic_dec_bug(w->journal); @@ -574,34 +574,35 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, return b; } -static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) +static int mca_reap(struct btree *b, unsigned min_order, bool flush) { + struct closure cl; + + closure_init_stack(&cl); lockdep_assert_held(&b->c->bucket_lock); if (!down_write_trylock(&b->lock)) return -ENOMEM; - if (b->page_order < min_order) { + BUG_ON(btree_node_dirty(b) && !b->sets[0].data); + + if (b->page_order < min_order || + (!flush && + (btree_node_dirty(b) || + atomic_read(&b->io.cl.remaining) != -1))) { rw_unlock(true, b); return -ENOMEM; } - BUG_ON(btree_node_dirty(b) && !b->sets[0].data); - - if (cl && btree_node_dirty(b)) - bch_btree_node_write(b, NULL); - - if (cl) - closure_wait_event_async(&b->io.wait, cl, - atomic_read(&b->io.cl.remaining) == -1); - - if (btree_node_dirty(b) || - !closure_is_unlocked(&b->io.cl) || - work_pending(&b->work.work)) { - rw_unlock(true, b); - return -EAGAIN; + if (btree_node_dirty(b)) { + bch_btree_node_write(b, &cl); + closure_sync(&cl); } + /* wait for any in flight btree write */ + closure_wait_event_sync(&b->io.wait, &cl, + atomic_read(&b->io.cl.remaining) == -1); + return 0; } @@ -641,7 +642,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, break; if (++i > 3 && - !mca_reap(b, NULL, 0)) { + !mca_reap(b, 0, false)) { mca_data_free(b); rw_unlock(true, b); freed++; @@ -660,7 +661,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, list_rotate_left(&c->btree_cache); if (!b->accessed && - !mca_reap(b, NULL, 0)) { + !mca_reap(b, 0, false)) { mca_bucket_free(b); mca_data_free(b); rw_unlock(true, b); @@ -783,52 +784,27 @@ out: return b; } -static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, - int level, struct closure *cl) +static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) { - int ret = -ENOMEM; - struct btree *i; + struct btree *b; trace_bcache_btree_cache_cannibalize(c); - if (!cl) - return ERR_PTR(-ENOMEM); - - /* - * Trying to free up some memory - i.e. reuse some btree nodes - may - * require initiating IO to flush the dirty part of the node. If we're - * running under generic_make_request(), that IO will never finish and - * we would deadlock. Returning -EAGAIN causes the cache lookup code to - * punt to workqueue and retry. 
- */ - if (current->bio_list) - return ERR_PTR(-EAGAIN); - - if (c->try_harder && c->try_harder != cl) { - closure_wait_event_async(&c->try_wait, cl, !c->try_harder); - return ERR_PTR(-EAGAIN); - } + if (!c->try_harder) { + c->try_harder = current; + c->try_harder_start = local_clock(); + } else if (c->try_harder != current) + return ERR_PTR(-ENOSPC); - c->try_harder = cl; - c->try_harder_start = local_clock(); -retry: - list_for_each_entry_reverse(i, &c->btree_cache, list) { - int r = mca_reap(i, cl, btree_order(k)); - if (!r) - return i; - if (r != -ENOMEM) - ret = r; - } + list_for_each_entry_reverse(b, &c->btree_cache, list) + if (!mca_reap(b, btree_order(k), false)) + return b; - if (ret == -EAGAIN && - closure_blocking(cl)) { - mutex_unlock(&c->bucket_lock); - closure_sync(cl); - mutex_lock(&c->bucket_lock); - goto retry; - } + list_for_each_entry_reverse(b, &c->btree_cache, list) + if (!mca_reap(b, btree_order(k), true)) + return b; - return ERR_PTR(ret); + return ERR_PTR(-ENOMEM); } /* @@ -839,18 +815,19 @@ retry: */ void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) { - if (c->try_harder == cl) { + if (c->try_harder == current) { bch_time_stats_update(&c->try_harder_time, c->try_harder_start); c->try_harder = NULL; - __closure_wake_up(&c->try_wait); + wake_up(&c->try_wait); } } -static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, - int level, struct closure *cl) +static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level) { struct btree *b; + BUG_ON(current->bio_list); + lockdep_assert_held(&c->bucket_lock); if (mca_find(c, k)) @@ -860,14 +837,14 @@ static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, * the list. Check if there's any freed nodes there: */ list_for_each_entry(b, &c->btree_cache_freeable, list) - if (!mca_reap(b, NULL, btree_order(k))) + if (!mca_reap(b, btree_order(k), false)) goto out; /* We never free struct btree itself, just the memory that holds the on * disk node. Check the freed list before allocating a new one: */ list_for_each_entry(b, &c->btree_cache_freed, list) - if (!mca_reap(b, NULL, 0)) { + if (!mca_reap(b, 0, false)) { mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); if (!b->sets[0].data) goto err; @@ -901,7 +878,7 @@ err: if (b) rw_unlock(true, b); - b = mca_cannibalize(c, k, level, cl); + b = mca_cannibalize(c, k); if (!IS_ERR(b)) goto out; @@ -919,10 +896,9 @@ err: * level and op->lock. 
*/ struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, - int level, struct btree_op *op) + int level, bool write) { int i = 0; - bool write = level <= op->lock; struct btree *b; BUG_ON(level < 0); @@ -934,7 +910,7 @@ retry: return ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); - b = mca_alloc(c, k, level, &op->cl); + b = mca_alloc(c, k, level); mutex_unlock(&c->bucket_lock); if (!b) @@ -980,7 +956,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) struct btree *b; mutex_lock(&c->bucket_lock); - b = mca_alloc(c, k, level, NULL); + b = mca_alloc(c, k, level); mutex_unlock(&c->bucket_lock); if (!IS_ERR_OR_NULL(b)) { @@ -991,17 +967,12 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) /* Btree alloc */ -static void btree_node_free(struct btree *b, struct btree_op *op) +static void btree_node_free(struct btree *b) { unsigned i; trace_bcache_btree_node_free(b); - /* - * The BUG_ON() in btree_node_get() implies that we must have a write - * lock on parent to free or even invalidate a node - */ - BUG_ON(op->lock <= b->level); BUG_ON(b == b->c->root); if (btree_node_dirty(b)) @@ -1037,7 +1008,7 @@ retry: SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); - b = mca_alloc(c, &k.key, level, cl); + b = mca_alloc(c, &k.key, level); if (IS_ERR(b)) goto err_free; @@ -1173,8 +1144,7 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys, return stale; } -static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, - struct btree_op *op) +static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k) { /* * We block priorities from being written for the duration of garbage @@ -1191,7 +1161,7 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, memcpy(k->ptr, b->key.ptr, sizeof(uint64_t) * KEY_PTRS(&b->key)); - btree_node_free(n, op); + btree_node_free(n); up_write(&n->lock); } @@ -1211,8 +1181,8 @@ struct gc_merge_info { unsigned keys; }; -static void btree_gc_coalesce(struct btree *b, struct btree_op *op, - struct gc_stat *gc, struct gc_merge_info *r) +static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc, + struct gc_merge_info *r) { unsigned nodes = 0, keys = 0, blocks; int i; @@ -1228,7 +1198,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op, for (i = nodes - 1; i >= 0; --i) { if (r[i].b->written) - r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); + r[i].b = btree_gc_alloc(r[i].b, r[i].k); if (r[i].b->written) return; @@ -1292,7 +1262,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op, r[i - 1].keys = n2->keys; } - btree_node_free(r->b, op); + btree_node_free(r->b); up_write(&r->b->lock); trace_bcache_btree_gc_coalesce(nodes); @@ -1324,7 +1294,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, memset(r, 0, sizeof(r)); while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { - r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); + r->b = bch_btree_node_get(b->c, r->k, b->level - 1, true); if (IS_ERR(r->b)) { ret = PTR_ERR(r->b); @@ -1337,7 +1307,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, if (!b->written && (r->b->level || stale > 10 || b->c->gc_always_rewrite)) - r->b = btree_gc_alloc(r->b, r->k, op); + r->b = btree_gc_alloc(r->b, r->k); if (r->b->level) ret = btree_gc_recurse(r->b, op, writes, gc); @@ -1350,7 +1320,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, bkey_copy_key(&b->c->gc_done, r->k); if (!b->written) - btree_gc_coalesce(b, op, gc, r); 
+ btree_gc_coalesce(b, gc, r); if (r[GC_MERGE_NODES - 1].b) write(r[GC_MERGE_NODES - 1].b); @@ -1404,7 +1374,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, if (!IS_ERR_OR_NULL(n)) { closure_sync(&op->cl); bch_btree_set_root(b); - btree_node_free(n, op); + btree_node_free(n); rw_unlock(true, b); } @@ -2004,18 +1974,18 @@ static int btree_split(struct btree *b, struct btree_op *op, } rw_unlock(true, n1); - btree_node_free(b, op); + btree_node_free(b); bch_time_stats_update(&b->c->btree_split_time, start_time); return 0; err_free2: __bkey_put(n2->c, &n2->key); - btree_node_free(n2, op); + btree_node_free(n2); rw_unlock(true, n2); err_free1: __bkey_put(n1->c, &n1->key); - btree_node_free(n1, op); + btree_node_free(n1); rw_unlock(true, n1); err: if (n3 == ERR_PTR(-EAGAIN) || diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 17b7a4e39c7e..72794ab8e8e5 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -326,7 +326,7 @@ static inline void rw_unlock(bool w, struct btree *b) ({ \ int _r, l = (b)->level - 1; \ bool _w = l <= (op)->lock; \ - struct btree *_child = bch_btree_node_get((b)->c, key, l, op); \ + struct btree *_child = bch_btree_node_get((b)->c, key, l, _w); \ if (!IS_ERR(_child)) { \ _child->parent = (b); \ _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ @@ -356,6 +356,11 @@ static inline void rw_unlock(bool w, struct btree *b) } \ rw_unlock(_w, _b); \ bch_cannibalize_unlock(c, &(op)->cl); \ + if (_r == -ENOSPC) { \ + wait_event((c)->try_wait, \ + !(c)->try_harder); \ + _r = -EINTR; \ + } \ } while (_r == -EINTR); \ \ _r; \ @@ -375,8 +380,7 @@ void bch_btree_node_write(struct btree *, struct closure *); void bch_cannibalize_unlock(struct cache_set *, struct closure *); void bch_btree_set_root(struct btree *); struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); -struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, - int, struct btree_op *); +struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); int bch_btree_insert_check_key(struct btree *, struct btree_op *, struct bkey *); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index d3169c0652f8..9a164cd4058c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1436,12 +1436,14 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->sort_crit_factor = int_sqrt(c->btree_pages); - mutex_init(&c->bucket_lock); - mutex_init(&c->sort_lock); - spin_lock_init(&c->sort_time_lock); closure_init_unlocked(&c->sb_write); + mutex_init(&c->bucket_lock); + init_waitqueue_head(&c->try_wait); closure_init_unlocked(&c->uuid_write); + spin_lock_init(&c->sort_time_lock); + mutex_init(&c->sort_lock); spin_lock_init(&c->btree_read_time_lock); + bch_moving_init_cache_set(c); INIT_LIST_HEAD(&c->list); @@ -1529,7 +1531,7 @@ static void run_cache_set(struct cache_set *c) goto err; err = "error reading btree root"; - c->root = bch_btree_node_get(c, k, j->btree_level, &op); + c->root = bch_btree_node_get(c, k, j->btree_level, true); if (IS_ERR_OR_NULL(c->root)) goto err; -- cgit v1.2.3 From 35fcd848d72683141052aa9880542461577f2dbe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:29:09 -0700 Subject: bcache: Convert bucket_wait to wait_queue_head_t At one point we did do fancy asynchronous waiting stuff with bucket_wait, but that's all gone (and bucket_wait is used a lot less than it used to be). So use the standard primitives. 
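Concretely, the closure-based waiting is replaced with the kernel's standard prepare_to_wait()/finish_wait() loop: check for a free bucket under bucket_lock, otherwise queue on the wait queue, drop the lock, and sleep until the allocator thread does a wake_up(). A sketch of that loop, pulled out of bch_bucket_alloc() in the diff below (wait_for_free_bucket() is an illustrative name):

/*
 * Sketch of the synchronous wait in bch_bucket_alloc(): sleep on
 * bucket_wait until the allocator thread pushes a bucket onto
 * ca->free, dropping bucket_lock while asleep.
 */
static long wait_for_free_bucket(struct cache *ca, unsigned watermark)
{
	DEFINE_WAIT(w);
	long r;

	while (1) {
		if (fifo_used(&ca->free) > ca->watermark[watermark]) {
			fifo_pop(&ca->free, r);
			break;
		}

		prepare_to_wait(&ca->set->bucket_wait, &w,
				TASK_UNINTERRUPTIBLE);

		mutex_unlock(&ca->set->bucket_lock);
		schedule();
		mutex_lock(&ca->set->bucket_lock);
	}

	finish_wait(&ca->set->bucket_wait, &w);
	return r;
}

Because buckets are only pushed onto ca->free while bucket_lock is held, doing the check and prepare_to_wait() before dropping the lock avoids a missed wakeup.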
Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 82 ++++++++++++++++++++++++++------------------- drivers/md/bcache/bcache.h | 8 ++--- drivers/md/bcache/btree.c | 25 ++++++-------- drivers/md/bcache/btree.h | 6 ++-- drivers/md/bcache/request.c | 9 ++--- drivers/md/bcache/super.c | 7 ++-- 6 files changed, 70 insertions(+), 67 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index e033b0203b68..1b64e662e81b 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -339,7 +339,7 @@ static int bch_allocator_thread(void *arg) allocator_wait(ca, !fifo_full(&ca->free)); fifo_push(&ca->free, bucket); - closure_wake_up(&ca->set->bucket_wait); + wake_up(&ca->set->bucket_wait); } /* @@ -365,16 +365,41 @@ static int bch_allocator_thread(void *arg) } } -long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl) +long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait) { - long r = -1; -again: + DEFINE_WAIT(w); + struct bucket *b; + long r; + + /* fastpath */ + if (fifo_used(&ca->free) > ca->watermark[watermark]) { + fifo_pop(&ca->free, r); + goto out; + } + + if (!wait) + return -1; + + while (1) { + if (fifo_used(&ca->free) > ca->watermark[watermark]) { + fifo_pop(&ca->free, r); + break; + } + + prepare_to_wait(&ca->set->bucket_wait, &w, + TASK_UNINTERRUPTIBLE); + + mutex_unlock(&ca->set->bucket_lock); + schedule(); + mutex_lock(&ca->set->bucket_lock); + } + + finish_wait(&ca->set->bucket_wait, &w); +out: wake_up_process(ca->alloc_thread); - if (fifo_used(&ca->free) > ca->watermark[watermark] && - fifo_pop(&ca->free, r)) { - struct bucket *b = ca->buckets + r; #ifdef CONFIG_BCACHE_EDEBUG + { size_t iter; long i; @@ -387,36 +412,23 @@ again: BUG_ON(i == r); fifo_for_each(i, &ca->unused, iter) BUG_ON(i == r); -#endif - BUG_ON(atomic_read(&b->pin) != 1); - - SET_GC_SECTORS_USED(b, ca->sb.bucket_size); - - if (watermark <= WATERMARK_METADATA) { - SET_GC_MARK(b, GC_MARK_METADATA); - b->prio = BTREE_PRIO; - } else { - SET_GC_MARK(b, GC_MARK_RECLAIMABLE); - b->prio = INITIAL_PRIO; - } - - return r; } +#endif + b = ca->buckets + r; - trace_bcache_alloc_fail(ca); + BUG_ON(atomic_read(&b->pin) != 1); - if (cl) { - closure_wait(&ca->set->bucket_wait, cl); + SET_GC_SECTORS_USED(b, ca->sb.bucket_size); - if (closure_blocking(cl)) { - mutex_unlock(&ca->set->bucket_lock); - closure_sync(cl); - mutex_lock(&ca->set->bucket_lock); - goto again; - } + if (watermark <= WATERMARK_METADATA) { + SET_GC_MARK(b, GC_MARK_METADATA); + b->prio = BTREE_PRIO; + } else { + SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + b->prio = INITIAL_PRIO; } - return -1; + return r; } void bch_bucket_free(struct cache_set *c, struct bkey *k) @@ -433,7 +445,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k) } int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, - struct bkey *k, int n, struct closure *cl) + struct bkey *k, int n, bool wait) { int i; @@ -446,7 +458,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, for (i = 0; i < n; i++) { struct cache *ca = c->cache_by_alloc[i]; - long b = bch_bucket_alloc(ca, watermark, cl); + long b = bch_bucket_alloc(ca, watermark, wait); if (b == -1) goto err; @@ -466,11 +478,11 @@ err: } int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, - struct bkey *k, int n, struct closure *cl) + struct bkey *k, int n, bool wait) { int ret; mutex_lock(&c->bucket_lock); - ret = __bch_bucket_alloc_set(c, watermark, k, n, cl); + ret 
= __bch_bucket_alloc_set(c, watermark, k, n, wait); mutex_unlock(&c->bucket_lock); return ret; } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index c1c44191afb1..d3520748bc27 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -750,7 +750,7 @@ struct cache_set { * written. */ atomic_t prio_blocked; - struct closure_waitlist bucket_wait; + wait_queue_head_t bucket_wait; /* * For any bio we don't skip we subtract the number of sectors from @@ -1162,13 +1162,13 @@ uint8_t bch_inc_gen(struct cache *, struct bucket *); void bch_rescale_priorities(struct cache_set *, int); bool bch_bucket_add_unused(struct cache *, struct bucket *); -long bch_bucket_alloc(struct cache *, unsigned, struct closure *); +long bch_bucket_alloc(struct cache *, unsigned, bool); void bch_bucket_free(struct cache_set *, struct bkey *); int __bch_bucket_alloc_set(struct cache_set *, unsigned, - struct bkey *, int, struct closure *); + struct bkey *, int, bool); int bch_bucket_alloc_set(struct cache_set *, unsigned, - struct bkey *, int, struct closure *); + struct bkey *, int, bool); __printf(2, 3) bool bch_cache_set_error(struct cache_set *, const char *, ...); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 4d50f1e7006e..935d90df397b 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -813,7 +813,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k) * cannibalize_bucket() will take. This means every time we unlock the root of * the btree, we need to release this lock if we have it held. */ -void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) +void bch_cannibalize_unlock(struct cache_set *c) { if (c->try_harder == current) { bch_time_stats_update(&c->try_harder_time, c->try_harder_start); @@ -995,15 +995,14 @@ static void btree_node_free(struct btree *b) mutex_unlock(&b->c->bucket_lock); } -struct btree *bch_btree_node_alloc(struct cache_set *c, int level, - struct closure *cl) +struct btree *bch_btree_node_alloc(struct cache_set *c, int level) { BKEY_PADDED(key) k; struct btree *b = ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); retry: - if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) + if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) goto err; SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); @@ -1036,10 +1035,9 @@ err: return b; } -static struct btree *btree_node_alloc_replacement(struct btree *b, - struct closure *cl) +static struct btree *btree_node_alloc_replacement(struct btree *b) { - struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); + struct btree *n = bch_btree_node_alloc(b->c, b->level); if (!IS_ERR_OR_NULL(n)) bch_btree_sort_into(b, n); @@ -1152,7 +1150,7 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k) * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it * our closure. 
*/ - struct btree *n = btree_node_alloc_replacement(b, NULL); + struct btree *n = btree_node_alloc_replacement(b); if (!IS_ERR_OR_NULL(n)) { swap(b, n); @@ -1359,7 +1357,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); if (b->level || stale > 10) - n = btree_node_alloc_replacement(b, NULL); + n = btree_node_alloc_replacement(b); if (!IS_ERR_OR_NULL(n)) swap(b, n); @@ -1882,10 +1880,7 @@ static int btree_split(struct btree *b, struct btree_op *op, struct btree *n1, *n2 = NULL, *n3 = NULL; uint64_t start_time = local_clock(); - if (b->level) - set_closure_blocking(&op->cl); - - n1 = btree_node_alloc_replacement(b, &op->cl); + n1 = btree_node_alloc_replacement(b); if (IS_ERR(n1)) goto err; @@ -1896,12 +1891,12 @@ static int btree_split(struct btree *b, struct btree_op *op, trace_bcache_btree_node_split(b, n1->sets[0].data->keys); - n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); + n2 = bch_btree_node_alloc(b->c, b->level); if (IS_ERR(n2)) goto err_free1; if (!b->parent) { - n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); + n3 = bch_btree_node_alloc(b->c, b->level + 1); if (IS_ERR(n3)) goto err_free2; } diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 72794ab8e8e5..d691d954730e 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -355,7 +355,7 @@ static inline void rw_unlock(bool w, struct btree *b) _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ } \ rw_unlock(_w, _b); \ - bch_cannibalize_unlock(c, &(op)->cl); \ + bch_cannibalize_unlock(c); \ if (_r == -ENOSPC) { \ wait_event((c)->try_wait, \ !(c)->try_harder); \ @@ -377,9 +377,9 @@ static inline bool should_split(struct btree *b) void bch_btree_node_read(struct btree *); void bch_btree_node_write(struct btree *, struct closure *); -void bch_cannibalize_unlock(struct cache_set *, struct closure *); +void bch_cannibalize_unlock(struct cache_set *); void bch_btree_set_root(struct btree *); -struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); +struct btree *bch_btree_node_alloc(struct cache_set *, int); struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, bool); int bch_btree_insert_check_key(struct btree *, struct btree_op *, diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index d85c7001df61..26d18f4bf4a0 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -350,14 +350,8 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, struct cache_set *c = s->op.c; struct open_bucket *b; BKEY_PADDED(key) alloc; - struct closure cl, *w = NULL; unsigned i; - if (s->writeback) { - closure_init_stack(&cl); - w = &cl; - } - /* * We might have to allocate a new bucket, which we can't do with a * spinlock held. 
So if we have to allocate, we drop the lock, allocate @@ -375,7 +369,8 @@ static bool bch_alloc_sectors(struct bkey *k, unsigned sectors, spin_unlock(&c->data_bucket_lock); - if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w)) + if (bch_bucket_alloc_set(c, watermark, &alloc.key, + 1, s->writeback)) return false; spin_lock(&c->data_bucket_lock); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 9a164cd4058c..84398a82fbe3 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -427,7 +427,7 @@ static int __uuid_write(struct cache_set *c) lockdep_assert_held(&bch_register_lock); - if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, &cl)) + if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, true)) return 1; SET_KEY_SIZE(&k.key, c->sb.bucket_size); @@ -565,7 +565,7 @@ void bch_prio_write(struct cache *ca) p->magic = pset_magic(ca); p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); - bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, &cl); + bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true); BUG_ON(bucket == -1); mutex_unlock(&ca->set->bucket_lock); @@ -1439,6 +1439,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) closure_init_unlocked(&c->sb_write); mutex_init(&c->bucket_lock); init_waitqueue_head(&c->try_wait); + init_waitqueue_head(&c->bucket_wait); closure_init_unlocked(&c->uuid_write); spin_lock_init(&c->sort_time_lock); mutex_init(&c->sort_lock); @@ -1608,7 +1609,7 @@ static void run_cache_set(struct cache_set *c) goto err_unlock_gc; err = "cannot allocate new btree root"; - c->root = bch_btree_node_alloc(c, 0, &op.cl); + c->root = bch_btree_node_alloc(c, 0); if (IS_ERR_OR_NULL(c->root)) goto err_unlock_gc; -- cgit v1.2.3 From 72a44517f3ca3725dc86081d105457df46448679 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Oct 2013 17:19:26 -0700 Subject: bcache: Convert gc to a kthread We needed a dedicated rescuer workqueue for gc anyways... and gc was conceptually a dedicated thread, just one that wasn't running all the time. Switch it to a dedicated thread to make the code a bit more straightforward. 
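The resulting thread is the standard kthread idiom: do one gc pass, then park in TASK_INTERRUPTIBLE until wake_up_process() (wrapped as wake_up_gc() in the diff below) or kthread_stop() is called. A condensed sketch of that loop and how it is started (gc_thread_fn() and gc_thread_start() are illustrative names):

/* Condensed sketch of bch_gc_thread() from the diff below. */
static int gc_thread_fn(void *arg)
{
	struct cache_set *c = arg;

	while (1) {
		bch_btree_gc(c);		/* one full gc pass */

		set_current_state(TASK_INTERRUPTIBLE);
		if (kthread_should_stop())
			break;

		try_to_freeze();
		schedule();			/* until wake_up_gc() */
	}

	return 0;
}

/* Created stopped; callers kick it with wake_up_gc()/wake_up_process(). */
static int gc_thread_start(struct cache_set *c)
{
	c->gc_thread = kthread_create(gc_thread_fn, c, "bcache_gc");
	return IS_ERR(c->gc_thread) ? PTR_ERR(c->gc_thread) : 0;
}

Using kthread_create() rather than kthread_run() keeps the thread dormant until the first wake-up, so gc still only runs when something asks for it.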
Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 6 +++--- drivers/md/bcache/bcache.h | 9 ++++---- drivers/md/bcache/btree.c | 50 ++++++++++++++++++++++++++++++-------------- drivers/md/bcache/btree.h | 10 +++++++-- drivers/md/bcache/movinggc.c | 35 +++++++++++++------------------ drivers/md/bcache/request.c | 2 +- drivers/md/bcache/super.c | 20 ++++++++---------- drivers/md/bcache/sysfs.c | 2 +- 8 files changed, 74 insertions(+), 60 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 1b64e662e81b..b9bd5866055d 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -210,7 +210,7 @@ static void invalidate_buckets_lru(struct cache *ca) * multiple times when it can't do anything */ ca->invalidate_needs_gc = 1; - bch_queue_gc(ca->set); + wake_up_gc(ca->set); return; } @@ -235,7 +235,7 @@ static void invalidate_buckets_fifo(struct cache *ca) if (++checked >= ca->sb.nbuckets) { ca->invalidate_needs_gc = 1; - bch_queue_gc(ca->set); + wake_up_gc(ca->set); return; } } @@ -260,7 +260,7 @@ static void invalidate_buckets_random(struct cache *ca) if (++checked >= ca->sb.nbuckets / 2) { ca->invalidate_needs_gc = 1; - bch_queue_gc(ca->set); + wake_up_gc(ca->set); return; } } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d3520748bc27..09410eb07d82 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -773,7 +773,7 @@ struct cache_set { struct gc_stat gc_stats; size_t nbuckets; - struct closure_with_waitlist gc; + struct task_struct *gc_thread; /* Where in the btree gc currently is */ struct bkey gc_done; @@ -786,11 +786,10 @@ struct cache_set { /* Counts how many sectors bio_insert has added to the cache */ atomic_t sectors_to_gc; - struct closure moving_gc; - struct closure_waitlist moving_gc_wait; + wait_queue_head_t moving_gc_wait; struct keybuf moving_gc_keys; /* Number of moving GC bios in flight */ - atomic_t in_flight; + struct semaphore moving_in_flight; struct btree *root; @@ -1176,7 +1175,7 @@ bool bch_cache_set_error(struct cache_set *, const char *, ...); void bch_prio_write(struct cache *); void bch_write_bdev_super(struct cached_dev *, struct closure *); -extern struct workqueue_struct *bcache_wq, *bch_gc_wq; +extern struct workqueue_struct *bcache_wq; extern const char * const bch_cache_modes[]; extern struct mutex bch_register_lock; extern struct list_head bch_cache_sets; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 935d90df397b..17bfd87fc8f4 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -28,7 +28,9 @@ #include #include +#include #include +#include #include #include #include @@ -105,7 +107,6 @@ static const char *op_type(struct btree_op *op) #define PTR_HASH(c, k) \ (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) -struct workqueue_struct *bch_gc_wq; static struct workqueue_struct *btree_io_wq; void bch_btree_op_init_stack(struct btree_op *op) @@ -732,12 +733,9 @@ int bch_btree_cache_alloc(struct cache_set *c) { unsigned i; - /* XXX: doesn't check for errors */ - - closure_init_unlocked(&c->gc); - for (i = 0; i < mca_reserve(c); i++) - mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); + if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL)) + return -ENOMEM; list_splice_init(&c->btree_cache, &c->btree_cache_freeable); @@ -1456,9 +1454,8 @@ size_t bch_btree_gc_finish(struct cache_set *c) return available; } -static void bch_btree_gc(struct closure *cl) +static void bch_btree_gc(struct 
cache_set *c) { - struct cache_set *c = container_of(cl, struct cache_set, gc.cl); int ret; unsigned long available; struct gc_stat stats; @@ -1483,7 +1480,7 @@ static void bch_btree_gc(struct closure *cl) if (ret) { pr_warn("gc failed!"); - continue_at(cl, bch_btree_gc, bch_gc_wq); + return; } /* Possibly wait for new UUIDs or whatever to hit disk */ @@ -1505,12 +1502,35 @@ static void bch_btree_gc(struct closure *cl) trace_bcache_gc_end(c); - continue_at(cl, bch_moving_gc, bch_gc_wq); + bch_moving_gc(c); +} + +static int bch_gc_thread(void *arg) +{ + struct cache_set *c = arg; + + while (1) { + bch_btree_gc(c); + + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) + break; + + try_to_freeze(); + schedule(); + } + + return 0; } -void bch_queue_gc(struct cache_set *c) +int bch_gc_thread_start(struct cache_set *c) { - closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); + c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc"); + if (IS_ERR(c->gc_thread)) + return PTR_ERR(c->gc_thread); + + set_task_state(c->gc_thread, TASK_INTERRUPTIBLE); + return 0; } /* Initial partial gc */ @@ -2480,14 +2500,12 @@ void bch_btree_exit(void) { if (btree_io_wq) destroy_workqueue(btree_io_wq); - if (bch_gc_wq) - destroy_workqueue(bch_gc_wq); } int __init bch_btree_init(void) { - if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || - !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) + btree_io_wq = create_singlethread_workqueue("bch_btree_io"); + if (!btree_io_wq) return -ENOMEM; return 0; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index d691d954730e..fa9641aaed39 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -388,12 +388,18 @@ int bch_btree_insert(struct btree_op *, struct cache_set *, struct keylist *); int bch_btree_search_recurse(struct btree *, struct btree_op *); -void bch_queue_gc(struct cache_set *); +int bch_gc_thread_start(struct cache_set *); size_t bch_btree_gc_finish(struct cache_set *); -void bch_moving_gc(struct closure *); +void bch_moving_gc(struct cache_set *); int bch_btree_check(struct cache_set *, struct btree_op *); uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); +static inline void wake_up_gc(struct cache_set *c) +{ + if (c->gc_thread) + wake_up_process(c->gc_thread); +} + void bch_keybuf_init(struct keybuf *); void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *, keybuf_pred_fn *); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 2c42377a65aa..6ba050456ec8 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -57,8 +57,7 @@ static void write_moving_finish(struct closure *cl) bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); - atomic_dec_bug(&io->s.op.c->in_flight); - closure_wake_up(&io->s.op.c->moving_gc_wait); + up(&io->s.op.c->moving_in_flight); closure_return_with_destructor(cl, moving_io_destructor); } @@ -113,7 +112,7 @@ static void write_moving(struct closure *cl) bch_data_insert(&s->op.cl); } - continue_at(cl, write_moving_finish, bch_gc_wq); + continue_at(cl, write_moving_finish, system_wq); } static void read_moving_submit(struct closure *cl) @@ -124,15 +123,17 @@ static void read_moving_submit(struct closure *cl) bch_submit_bbio(bio, s->op.c, &io->w->key, 0); - continue_at(cl, write_moving, bch_gc_wq); + continue_at(cl, write_moving, system_wq); } -static void read_moving(struct closure *cl) +static void read_moving(struct cache_set *c) { - struct cache_set *c 
= container_of(cl, struct cache_set, moving_gc); struct keybuf_key *w; struct moving_io *io; struct bio *bio; + struct closure cl; + + closure_init_stack(&cl); /* XXX: if we error, background writeback could stall indefinitely */ @@ -164,13 +165,8 @@ static void read_moving(struct closure *cl) trace_bcache_gc_copy(&w->key); - closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); - - if (atomic_inc_return(&c->in_flight) >= 64) { - closure_wait_event(&c->moving_gc_wait, cl, - atomic_read(&c->in_flight) < 64); - continue_at(cl, read_moving, bch_gc_wq); - } + down(&c->moving_in_flight); + closure_call(&io->s.cl, read_moving_submit, NULL, &cl); } if (0) { @@ -180,7 +176,7 @@ err: if (!IS_ERR_OR_NULL(w->private)) bch_keybuf_del(&c->moving_gc_keys, w); } - closure_return(cl); + closure_sync(&cl); } static bool bucket_cmp(struct bucket *l, struct bucket *r) @@ -193,15 +189,14 @@ static unsigned bucket_heap_top(struct cache *ca) return GC_SECTORS_USED(heap_peek(&ca->heap)); } -void bch_moving_gc(struct closure *cl) +void bch_moving_gc(struct cache_set *c) { - struct cache_set *c = container_of(cl, struct cache_set, gc.cl); struct cache *ca; struct bucket *b; unsigned i; if (!c->copy_gc_enabled) - closure_return(cl); + return; mutex_lock(&c->bucket_lock); @@ -242,13 +237,11 @@ void bch_moving_gc(struct closure *cl) c->moving_gc_keys.last_scanned = ZERO_KEY; - closure_init(&c->moving_gc, cl); - read_moving(&c->moving_gc); - - closure_return(cl); + read_moving(c); } void bch_moving_init_cache_set(struct cache_set *c) { bch_keybuf_init(&c->moving_gc_keys); + sema_init(&c->moving_in_flight, 64); } diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 26d18f4bf4a0..f779eb420d69 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -520,7 +520,7 @@ static void bch_data_insert_start(struct closure *cl) if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { set_gc_sectors(op->c); - bch_queue_gc(op->c); + wake_up_gc(op->c); } /* diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 84398a82fbe3..f89e2296bde1 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1342,6 +1342,9 @@ static void cache_set_flush(struct closure *cl) kobject_put(&c->internal); kobject_del(&c->kobj); + if (c->gc_thread) + kthread_stop(c->gc_thread); + if (!IS_ERR_OR_NULL(c->root)) list_add(&c->root->list, &c->btree_cache); @@ -1579,8 +1582,6 @@ static void run_cache_set(struct cache_set *c) bch_journal_replay(c, &journal, &op); } else { pr_notice("invalidating existing data"); - /* Don't want invalidate_buckets() to queue a gc yet */ - closure_lock(&c->gc, NULL); for_each_cache(ca, c, i) { unsigned j; @@ -1606,12 +1607,12 @@ static void run_cache_set(struct cache_set *c) err = "cannot allocate new UUID bucket"; if (__uuid_write(c)) - goto err_unlock_gc; + goto err; err = "cannot allocate new btree root"; c->root = bch_btree_node_alloc(c, 0); if (IS_ERR_OR_NULL(c->root)) - goto err_unlock_gc; + goto err; bkey_copy_key(&c->root->key, &MAX_KEY); bch_btree_node_write(c->root, &op.cl); @@ -1628,12 +1629,12 @@ static void run_cache_set(struct cache_set *c) bch_journal_next(&c->journal); bch_journal_meta(c, &op.cl); - - /* Unlock */ - closure_set_stopped(&c->gc.cl); - closure_put(&c->gc.cl); } + err = "error starting gc thread"; + if (bch_gc_thread_start(c)) + goto err; + closure_sync(&op.cl); c->sb.last_mount = get_seconds(); bcache_write_super(c); @@ -1644,9 +1645,6 @@ static void run_cache_set(struct cache_set *c) 
flash_devs_run(c); return; -err_unlock_gc: - closure_set_stopped(&c->gc.cl); - closure_put(&c->gc.cl); err: closure_sync(&op.cl); /* XXX: test this, it's broken */ diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index b3a66f17231d..ab286b9b5e40 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -566,7 +566,7 @@ STORE(__bch_cache_set) } if (attr == &sysfs_trigger_gc) - bch_queue_gc(c); + wake_up_gc(c); if (attr == &sysfs_prune_cache) { struct shrink_control sc; -- cgit v1.2.3 From 5e6926daac267dd99552ae613f041a9e88bbf258 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 17:50:06 -0700 Subject: bcache: Convert writeback to a kthread This simplifies the writeback flow control quite a bit - previously, it was conceptually two coroutines, refill_dirty() and read_dirty(). This makes the code quite a bit more straightforward. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 10 +- drivers/md/bcache/super.c | 3 +- drivers/md/bcache/writeback.c | 371 ++++++++++++++++++++---------------------- drivers/md/bcache/writeback.h | 25 ++- 4 files changed, 203 insertions(+), 206 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 09410eb07d82..674e2f42e778 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -509,7 +509,7 @@ struct cached_dev { /* Limit number of writeback bios in flight */ struct semaphore in_flight; - struct closure_with_timer writeback; + struct task_struct *writeback_thread; struct keybuf writeback_keys; @@ -1038,7 +1038,11 @@ static inline void bkey_init(struct bkey *k) #define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) #define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) -#define MAX_KEY KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0) + +#define MAX_KEY_INODE (~(~0 << 20)) +#define MAX_KEY_OFFSET (((uint64_t) ~0) >> 1) +#define MAX_KEY KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0) + #define ZERO_KEY KEY(0, 0, 0) /* @@ -1214,8 +1218,6 @@ int bch_cache_allocator_init(struct cache *ca); void bch_debug_exit(void); int bch_debug_init(struct kobject *); -void bch_writeback_exit(void); -int bch_writeback_init(void); void bch_request_exit(void); int bch_request_init(void); void bch_btree_exit(void); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f89e2296bde1..b79dd5a6679e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1029,6 +1029,7 @@ static void cached_dev_free(struct closure *cl) struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); cancel_delayed_work_sync(&dc->writeback_rate_update); + kthread_stop(dc->writeback_thread); mutex_lock(&bch_register_lock); @@ -2006,7 +2007,6 @@ static struct notifier_block reboot = { static void bcache_exit(void) { bch_debug_exit(); - bch_writeback_exit(); bch_request_exit(); bch_btree_exit(); if (bcache_kobj) @@ -2039,7 +2039,6 @@ static int __init bcache_init(void) sysfs_create_files(bcache_kobj, files) || bch_btree_init() || bch_request_init() || - bch_writeback_init() || bch_debug_init(bcache_kobj)) goto err; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 51dc709c9bf7..4392f3f38d62 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -11,18 +11,11 @@ #include "debug.h" #include "writeback.h" +#include +#include +#include #include -static struct workqueue_struct *dirty_wq; - -static void read_dirty(struct closure *); - -struct dirty_io { - struct closure cl; - 
struct cached_dev *dc; - struct bio bio; -}; - /* Rate limiting */ static void __update_writeback_rate(struct cached_dev *dc) @@ -72,9 +65,6 @@ out: dc->writeback_rate_derivative = derivative; dc->writeback_rate_change = change; dc->writeback_rate_target = target; - - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); } static void update_writeback_rate(struct work_struct *work) @@ -90,6 +80,9 @@ static void update_writeback_rate(struct work_struct *work) __update_writeback_rate(dc); up_read(&dc->writeback_lock); + + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); } static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) @@ -105,37 +98,11 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) return min_t(uint64_t, ret, HZ); } -/* Background writeback */ - -static bool dirty_pred(struct keybuf *buf, struct bkey *k) -{ - return KEY_DIRTY(k); -} - -static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) -{ - uint64_t stripe = KEY_START(k); - unsigned nr_sectors = KEY_SIZE(k); - struct cached_dev *dc = container_of(buf, struct cached_dev, - writeback_keys); - - if (!KEY_DIRTY(k)) - return false; - - do_div(stripe, dc->disk.stripe_size); - - while (1) { - if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) == - dc->disk.stripe_size) - return true; - - if (nr_sectors <= dc->disk.stripe_size) - return false; - - nr_sectors -= dc->disk.stripe_size; - stripe++; - } -} +struct dirty_io { + struct closure cl; + struct cached_dev *dc; + struct bio bio; +}; static void dirty_init(struct keybuf_key *w) { @@ -153,132 +120,6 @@ static void dirty_init(struct keybuf_key *w) bch_bio_map(bio, NULL); } -static void refill_dirty(struct closure *cl) -{ - struct cached_dev *dc = container_of(cl, struct cached_dev, - writeback.cl); - struct keybuf *buf = &dc->writeback_keys; - bool searched_from_start = false; - struct bkey end = MAX_KEY; - SET_KEY_INODE(&end, dc->disk.id); - - if (!atomic_read(&dc->disk.detaching) && - !dc->writeback_running) - closure_return(cl); - - down_write(&dc->writeback_lock); - - if (!atomic_read(&dc->has_dirty)) { - SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); - bch_write_bdev_super(dc, NULL); - - up_write(&dc->writeback_lock); - closure_return(cl); - } - - if (bkey_cmp(&buf->last_scanned, &end) >= 0) { - buf->last_scanned = KEY(dc->disk.id, 0, 0); - searched_from_start = true; - } - - if (dc->partial_stripes_expensive) { - uint64_t i; - - for (i = 0; i < dc->disk.nr_stripes; i++) - if (atomic_read(dc->disk.stripe_sectors_dirty + i) == - dc->disk.stripe_size) - goto full_stripes; - - goto normal_refill; -full_stripes: - searched_from_start = false; /* not searching entire btree */ - bch_refill_keybuf(dc->disk.c, buf, &end, - dirty_full_stripe_pred); - } else { -normal_refill: - bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); - } - - if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { - /* Searched the entire btree - delay awhile */ - - if (RB_EMPTY_ROOT(&buf->keys)) { - atomic_set(&dc->has_dirty, 0); - cached_dev_put(dc); - } - - if (!atomic_read(&dc->disk.detaching)) - closure_delay(&dc->writeback, dc->writeback_delay * HZ); - } - - up_write(&dc->writeback_lock); - - bch_ratelimit_reset(&dc->writeback_rate); - - /* Punt to workqueue only so we don't recurse and blow the stack */ - continue_at(cl, read_dirty, dirty_wq); -} - -void bch_writeback_queue(struct cached_dev *dc) -{ - if (closure_trylock(&dc->writeback.cl, 
&dc->disk.cl)) { - if (!atomic_read(&dc->disk.detaching)) - closure_delay(&dc->writeback, dc->writeback_delay * HZ); - - continue_at(&dc->writeback.cl, refill_dirty, dirty_wq); - } -} - -void bch_writeback_add(struct cached_dev *dc) -{ - if (!atomic_read(&dc->has_dirty) && - !atomic_xchg(&dc->has_dirty, 1)) { - atomic_inc(&dc->count); - - if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { - SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); - /* XXX: should do this synchronously */ - bch_write_bdev_super(dc, NULL); - } - - bch_writeback_queue(dc); - - if (dc->writeback_percent) - schedule_delayed_work(&dc->writeback_rate_update, - dc->writeback_rate_update_seconds * HZ); - } -} - -void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, - uint64_t offset, int nr_sectors) -{ - struct bcache_device *d = c->devices[inode]; - unsigned stripe_offset; - uint64_t stripe = offset; - - if (!d) - return; - - do_div(stripe, d->stripe_size); - - stripe_offset = offset & (d->stripe_size - 1); - - while (nr_sectors) { - int s = min_t(unsigned, abs(nr_sectors), - d->stripe_size - stripe_offset); - - if (nr_sectors < 0) - s = -s; - - atomic_add(s, d->stripe_sectors_dirty + stripe); - nr_sectors -= s; - stripe_offset = 0; - stripe++; - } -} - -/* Background writeback - IO loop */ - static void dirty_io_destructor(struct closure *cl) { struct dirty_io *io = container_of(cl, struct dirty_io, cl); @@ -378,30 +219,33 @@ static void read_dirty_submit(struct closure *cl) continue_at(cl, write_dirty, system_wq); } -static void read_dirty(struct closure *cl) +static void read_dirty(struct cached_dev *dc) { - struct cached_dev *dc = container_of(cl, struct cached_dev, - writeback.cl); - unsigned delay = writeback_delay(dc, 0); + unsigned delay = 0; struct keybuf_key *w; struct dirty_io *io; + struct closure cl; + + closure_init_stack(&cl); /* * XXX: if we error, background writeback just spins. Should use some * mempools. 
*/ - while (1) { + while (!kthread_should_stop()) { + try_to_freeze(); + w = bch_keybuf_next(&dc->writeback_keys); if (!w) break; BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); - if (delay > 0 && - (KEY_START(&w->key) != dc->last_read || - jiffies_to_msecs(delay) > 50)) - delay = schedule_timeout_uninterruptible(delay); + if (KEY_START(&w->key) != dc->last_read || + jiffies_to_msecs(delay) > 50) + while (!kthread_should_stop() && delay) + delay = schedule_timeout_interruptible(delay); dc->last_read = KEY_OFFSET(&w->key); @@ -427,7 +271,7 @@ static void read_dirty(struct closure *cl) trace_bcache_writeback(&w->key); down(&dc->in_flight); - closure_call(&io->cl, read_dirty_submit, NULL, cl); + closure_call(&io->cl, read_dirty_submit, NULL, &cl); delay = writeback_delay(dc, KEY_SIZE(&w->key)); } @@ -443,7 +287,148 @@ err: * Wait for outstanding writeback IOs to finish (and keybuf slots to be * freed) before refilling again */ - continue_at(cl, refill_dirty, dirty_wq); + closure_sync(&cl); +} + +/* Scan for dirty data */ + +void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, + uint64_t offset, int nr_sectors) +{ + struct bcache_device *d = c->devices[inode]; + unsigned stripe_offset; + uint64_t stripe = offset; + + if (!d) + return; + + do_div(stripe, d->stripe_size); + + stripe_offset = offset & (d->stripe_size - 1); + + while (nr_sectors) { + int s = min_t(unsigned, abs(nr_sectors), + d->stripe_size - stripe_offset); + + if (nr_sectors < 0) + s = -s; + + atomic_add(s, d->stripe_sectors_dirty + stripe); + nr_sectors -= s; + stripe_offset = 0; + stripe++; + } +} + +static bool dirty_pred(struct keybuf *buf, struct bkey *k) +{ + return KEY_DIRTY(k); +} + +static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) +{ + uint64_t stripe = KEY_START(k); + unsigned nr_sectors = KEY_SIZE(k); + struct cached_dev *dc = container_of(buf, struct cached_dev, + writeback_keys); + + if (!KEY_DIRTY(k)) + return false; + + do_div(stripe, dc->disk.stripe_size); + + while (1) { + if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) == + dc->disk.stripe_size) + return true; + + if (nr_sectors <= dc->disk.stripe_size) + return false; + + nr_sectors -= dc->disk.stripe_size; + stripe++; + } +} + +static bool refill_dirty(struct cached_dev *dc) +{ + struct keybuf *buf = &dc->writeback_keys; + bool searched_from_start = false; + struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); + + if (bkey_cmp(&buf->last_scanned, &end) >= 0) { + buf->last_scanned = KEY(dc->disk.id, 0, 0); + searched_from_start = true; + } + + if (dc->partial_stripes_expensive) { + uint64_t i; + + for (i = 0; i < dc->disk.nr_stripes; i++) + if (atomic_read(dc->disk.stripe_sectors_dirty + i) == + dc->disk.stripe_size) + goto full_stripes; + + goto normal_refill; +full_stripes: + searched_from_start = false; /* not searching entire btree */ + bch_refill_keybuf(dc->disk.c, buf, &end, + dirty_full_stripe_pred); + } else { +normal_refill: + bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); + } + + return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start; +} + +static int bch_writeback_thread(void *arg) +{ + struct cached_dev *dc = arg; + bool searched_full_index; + + while (!kthread_should_stop()) { + down_write(&dc->writeback_lock); + if (!atomic_read(&dc->has_dirty) || + (!atomic_read(&dc->disk.detaching) && + !dc->writeback_running)) { + up_write(&dc->writeback_lock); + set_current_state(TASK_INTERRUPTIBLE); + + if (kthread_should_stop()) + return 0; + + try_to_freeze(); + schedule(); + continue; + } 
+ + searched_full_index = refill_dirty(dc); + + if (searched_full_index && + RB_EMPTY_ROOT(&dc->writeback_keys.keys)) { + atomic_set(&dc->has_dirty, 0); + cached_dev_put(dc); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); + bch_write_bdev_super(dc, NULL); + } + + up_write(&dc->writeback_lock); + + bch_ratelimit_reset(&dc->writeback_rate); + read_dirty(dc); + + if (searched_full_index) { + unsigned delay = dc->writeback_delay * HZ; + + while (delay && + !kthread_should_stop() && + !atomic_read(&dc->disk.detaching)) + delay = schedule_timeout_interruptible(delay); + } + } + + return 0; } /* Init */ @@ -483,12 +468,10 @@ void bch_sectors_dirty_init(struct cached_dev *dc) btree_root(sectors_dirty_init, dc->disk.c, &op, dc); } -void bch_cached_dev_writeback_init(struct cached_dev *dc) +int bch_cached_dev_writeback_init(struct cached_dev *dc) { sema_init(&dc->in_flight, 64); - closure_init_unlocked(&dc->writeback); init_rwsem(&dc->writeback_lock); - bch_keybuf_init(&dc->writeback_keys); dc->writeback_metadata = true; @@ -502,22 +485,16 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_rate_p_term_inverse = 64; dc->writeback_rate_d_smooth = 8; + dc->writeback_thread = kthread_create(bch_writeback_thread, dc, + "bcache_writeback"); + if (IS_ERR(dc->writeback_thread)) + return PTR_ERR(dc->writeback_thread); + + set_task_state(dc->writeback_thread, TASK_INTERRUPTIBLE); + INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); schedule_delayed_work(&dc->writeback_rate_update, dc->writeback_rate_update_seconds * HZ); -} - -void bch_writeback_exit(void) -{ - if (dirty_wq) - destroy_workqueue(dirty_wq); -} - -int __init bch_writeback_init(void) -{ - dirty_wq = create_workqueue("bcache_writeback"); - if (!dirty_wq) - return -ENOMEM; return 0; } diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 34961888b5a9..60516bfa6052 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -56,11 +56,30 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, in_use <= CUTOFF_WRITEBACK; } +static inline void bch_writeback_queue(struct cached_dev *dc) +{ + wake_up_process(dc->writeback_thread); +} + +static inline void bch_writeback_add(struct cached_dev *dc) +{ + if (!atomic_read(&dc->has_dirty) && + !atomic_xchg(&dc->has_dirty, 1)) { + atomic_inc(&dc->count); + + if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { + SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); + /* XXX: should do this synchronously */ + bch_write_bdev_super(dc, NULL); + } + + bch_writeback_queue(dc); + } +} + void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); -void bch_writeback_queue(struct cached_dev *); -void bch_writeback_add(struct cached_dev *); void bch_sectors_dirty_init(struct cached_dev *dc); -void bch_cached_dev_writeback_init(struct cached_dev *); +int bch_cached_dev_writeback_init(struct cached_dev *); #endif -- cgit v1.2.3 From 48dad8baf92fe8967d9e1358af1cfdda1d2d3298 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 18:48:51 -0700 Subject: bcache: Add btree_map() functions Lots of stuff has been open coding its own btree traversal - which is generally pretty simple code, but there are a few subtleties. This adds new new functions, bch_btree_map_nodes() and bch_btree_map_keys(), which do the traversal for you. 
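As a rough sketch of the intended usage - this is not code from the patch, just an illustration modelled on the refill_keybuf_fn() and sectors_dirty_init_fn() conversions below, with the dirty_count/count_dirty_fn names invented for the example - a walk that totals up one device's dirty sectors could look like:

/*
 * Illustrative only: the wrapper embeds a btree_op so the callback can
 * recover its private state with container_of(), the same trick the
 * bset_stats and keybuf refill conversions in this patch use.
 */
struct dirty_count {
	struct btree_op	op;
	unsigned	inode;		/* device we care about */
	uint64_t	sectors;	/* running total */
};

static int count_dirty_fn(struct btree_op *op, struct btree *b,
			  struct bkey *k)
{
	struct dirty_count *d = container_of(op, struct dirty_count, op);

	if (KEY_INODE(k) > d->inode)
		return MAP_DONE;	/* past our device - stop the walk */

	if (KEY_DIRTY(k))
		d->sectors += KEY_SIZE(k);

	return MAP_CONTINUE;		/* keep iterating leaf keys */
}

static uint64_t count_dirty_sectors(struct cache_set *c, unsigned inode)
{
	struct dirty_count d = { .inode = inode, .sectors = 0 };

	bch_btree_op_init_stack(&d.op);

	/* Start from the first key belonging to this inode */
	bch_btree_map_keys(&d.op, c, &KEY(inode, 0, 0), count_dirty_fn, 0);

	return d.sectors;
}

The callback only ever sees leaf keys and drives the iteration by returning MAP_CONTINUE or MAP_DONE, so the recursion and bad-pointer filtering live in one place instead of being reimplemented by every caller.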
Everything that's open coding btree traversal now (with the exception of garbage collection) is slowly going to be converted to these two functions; being able to write other code at a higher level of abstraction is a big improvement w.r.t. overall code quality. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 2 - drivers/md/bcache/bset.c | 33 ++++---- drivers/md/bcache/btree.c | 174 +++++++++++++++++++++++++++++------------- drivers/md/bcache/btree.h | 37 ++++++++- drivers/md/bcache/writeback.c | 37 ++++----- 5 files changed, 186 insertions(+), 97 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 674e2f42e778..20fe96c121d9 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -384,8 +384,6 @@ struct keybuf_key { void *private; }; -typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); - struct keybuf { struct bkey last_scanned; spinlock_t lock; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index d0512e451dda..14c2a23d3884 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -842,6 +842,13 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, /* Btree iterator */ +/* + * Returns true if l > r - unless l == r, in which case returns true if l is + * older than r. + * + * Necessary for btree_sort_fixup() - if there are multiple keys that compare + * equal in different sets, we have to process them newest to oldest. + */ static inline bool btree_iter_cmp(struct btree_iter_set l, struct btree_iter_set r) { @@ -1146,16 +1153,16 @@ out: /* Sysfs stuff */ struct bset_stats { + struct btree_op op; size_t nodes; size_t sets_written, sets_unwritten; size_t bytes_written, bytes_unwritten; size_t floats, failed; }; -static int bch_btree_bset_stats(struct btree *b, struct btree_op *op, - struct bset_stats *stats) +static int btree_bset_stats(struct btree_op *op, struct btree *b) { - struct bkey *k; + struct bset_stats *stats = container_of(op, struct bset_stats, op); unsigned i; stats->nodes++; @@ -1180,30 +1187,20 @@ static int bch_btree_bset_stats(struct btree *b, struct btree_op *op, } } - if (b->level) { - struct btree_iter iter; - - for_each_key_filter(b, k, &iter, bch_ptr_bad) { - int ret = btree(bset_stats, k, b, op, stats); - if (ret) - return ret; - } - } - - return 0; + return MAP_CONTINUE; } int bch_bset_print_stats(struct cache_set *c, char *buf) { - struct btree_op op; struct bset_stats t; int ret; - bch_btree_op_init_stack(&op); memset(&t, 0, sizeof(struct bset_stats)); + bch_btree_op_init_stack(&t.op); + t.op.c = c; - ret = btree_root(bset_stats, c, &op, &t); - if (ret) + ret = bch_btree_map_nodes(&t.op, c, &ZERO_KEY, btree_bset_stats); + if (ret < 0) return ret; return snprintf(buf, PAGE_SIZE, diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 17bfd87fc8f4..cfbdcf349b7f 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2296,6 +2296,82 @@ int bch_btree_search_recurse(struct btree *b, struct btree_op *op) return ret; } +/* Map across nodes or keys */ + +static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, + btree_map_nodes_fn *fn, int flags) +{ + int ret = MAP_CONTINUE; + + if (b->level) { + struct bkey *k; + struct btree_iter iter; + + bch_btree_iter_init(b, &iter, from); + + while ((k = bch_btree_iter_next_filter(&iter, b, + bch_ptr_bad))) { + ret = btree(map_nodes_recurse, k, b, + op, from, fn, flags); + from = NULL; + 
+ if (ret != MAP_CONTINUE) + return ret; + } + } + + if (!b->level || flags == MAP_ALL_NODES) + ret = fn(op, b); + + return ret; +} + +int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_nodes_fn *fn, int flags) +{ + int ret = btree_root(map_nodes_recurse, c, op, from, fn, flags); + if (closure_blocking(&op->cl)) + closure_sync(&op->cl); + return ret; +} + +static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, + struct bkey *from, btree_map_keys_fn *fn, + int flags) +{ + int ret = MAP_CONTINUE; + struct bkey *k; + struct btree_iter iter; + + bch_btree_iter_init(b, &iter, from); + + while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) { + ret = !b->level + ? fn(op, b, k) + : btree(map_keys_recurse, k, b, op, from, fn, flags); + from = NULL; + + if (ret != MAP_CONTINUE) + return ret; + } + + if (!b->level && (flags & MAP_END_KEY)) + ret = fn(op, b, &KEY(KEY_INODE(&b->key), + KEY_OFFSET(&b->key), 0)); + + return ret; +} + +int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_keys_fn *fn, int flags) +{ + int ret = btree_root(map_keys_recurse, c, op, from, fn, flags); + if (closure_blocking(&op->cl)) + closure_sync(&op->cl); + return ret; +} + /* Keybuf code */ static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) @@ -2314,74 +2390,70 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); } -static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, - struct keybuf *buf, struct bkey *end, - keybuf_pred_fn *pred) -{ - struct btree_iter iter; - bch_btree_iter_init(b, &iter, &buf->last_scanned); - - while (!array_freelist_empty(&buf->freelist)) { - struct bkey *k = bch_btree_iter_next_filter(&iter, b, - bch_ptr_bad); - - if (!b->level) { - if (!k) { - buf->last_scanned = b->key; - break; - } +struct refill { + struct btree_op op; + struct keybuf *buf; + struct bkey *end; + keybuf_pred_fn *pred; +}; - buf->last_scanned = *k; - if (bkey_cmp(&buf->last_scanned, end) >= 0) - break; +static int refill_keybuf_fn(struct btree_op *op, struct btree *b, + struct bkey *k) +{ + struct refill *refill = container_of(op, struct refill, op); + struct keybuf *buf = refill->buf; + int ret = MAP_CONTINUE; - if (pred(buf, k)) { - struct keybuf_key *w; + if (bkey_cmp(k, refill->end) >= 0) { + ret = MAP_DONE; + goto out; + } - spin_lock(&buf->lock); + if (!KEY_SIZE(k)) /* end key */ + goto out; - w = array_alloc(&buf->freelist); + if (refill->pred(buf, k)) { + struct keybuf_key *w; - w->private = NULL; - bkey_copy(&w->key, k); + spin_lock(&buf->lock); - if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) - array_free(&buf->freelist, w); + w = array_alloc(&buf->freelist); + if (!w) { + spin_unlock(&buf->lock); + return MAP_DONE; + } - spin_unlock(&buf->lock); - } - } else { - if (!k) - break; + w->private = NULL; + bkey_copy(&w->key, k); - btree(refill_keybuf, k, b, op, buf, end, pred); - /* - * Might get an error here, but can't really do anything - * and it'll get logged elsewhere. Just read what we - * can. 
- */ + if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) + array_free(&buf->freelist, w); - if (bkey_cmp(&buf->last_scanned, end) >= 0) - break; + if (array_freelist_empty(&buf->freelist)) + ret = MAP_DONE; - cond_resched(); - } + spin_unlock(&buf->lock); } - - return 0; +out: + buf->last_scanned = *k; + return ret; } void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, struct bkey *end, keybuf_pred_fn *pred) { struct bkey start = buf->last_scanned; - struct btree_op op; - bch_btree_op_init_stack(&op); + struct refill refill; cond_resched(); - btree_root(refill_keybuf, c, &op, buf, end, pred); - closure_sync(&op.cl); + bch_btree_op_init_stack(&refill.op); + refill.buf = buf; + refill.end = end; + refill.pred = pred; + + bch_btree_map_keys(&refill.op, c, &buf->last_scanned, + refill_keybuf_fn, MAP_END_KEY); pr_debug("found %s keys from %llu:%llu to %llu:%llu", RB_EMPTY_ROOT(&buf->keys) ? "no" : @@ -2465,9 +2537,9 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf) } struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, - struct keybuf *buf, - struct bkey *end, - keybuf_pred_fn *pred) + struct keybuf *buf, + struct bkey *end, + keybuf_pred_fn *pred) { struct keybuf_key *ret; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index fa9641aaed39..cafdeb01e219 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -400,9 +400,42 @@ static inline void wake_up_gc(struct cache_set *c) wake_up_process(c->gc_thread); } +#define MAP_DONE 0 +#define MAP_CONTINUE 1 + +#define MAP_ALL_NODES 0 +#define MAP_LEAF_NODES 1 + +#define MAP_END_KEY 1 + +typedef int (btree_map_nodes_fn)(struct btree_op *, struct btree *); +int __bch_btree_map_nodes(struct btree_op *, struct cache_set *, + struct bkey *, btree_map_nodes_fn *, int); + +static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_nodes_fn *fn) +{ + return __bch_btree_map_nodes(op, c, from, fn, MAP_ALL_NODES); +} + +static inline int bch_btree_map_leaf_nodes(struct btree_op *op, + struct cache_set *c, + struct bkey *from, + btree_map_nodes_fn *fn) +{ + return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES); +} + +typedef int (btree_map_keys_fn)(struct btree_op *, struct btree *, + struct bkey *); +int bch_btree_map_keys(struct btree_op *, struct cache_set *, + struct bkey *, btree_map_keys_fn *, int); + +typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); + void bch_keybuf_init(struct keybuf *); -void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *, - keybuf_pred_fn *); +void bch_refill_keybuf(struct cache_set *, struct keybuf *, + struct bkey *, keybuf_pred_fn *); bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, struct bkey *); void bch_keybuf_del(struct keybuf *, struct keybuf_key *); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 4392f3f38d62..c68de9f12618 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -433,31 +433,17 @@ static int bch_writeback_thread(void *arg) /* Init */ -static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, - struct cached_dev *dc) +static int sectors_dirty_init_fn(struct btree_op *op, struct btree *b, + struct bkey *k) { - struct bkey *k; - struct btree_iter iter; - - bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0)); - while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) - if (!b->level) { - if (KEY_INODE(k) > dc->disk.id) - break; - - if (KEY_DIRTY(k)) - 
bcache_dev_sectors_dirty_add(b->c, dc->disk.id, - KEY_START(k), - KEY_SIZE(k)); - } else { - btree(sectors_dirty_init, k, b, op, dc); - if (KEY_INODE(k) > dc->disk.id) - break; - - cond_resched(); - } + if (KEY_INODE(k) > op->inode) + return MAP_DONE; - return 0; + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), + KEY_START(k), KEY_SIZE(k)); + + return MAP_CONTINUE; } void bch_sectors_dirty_init(struct cached_dev *dc) @@ -465,7 +451,10 @@ void bch_sectors_dirty_init(struct cached_dev *dc) struct btree_op op; bch_btree_op_init_stack(&op); - btree_root(sectors_dirty_init, dc->disk.c, &op, dc); + op.inode = dc->disk.id; + + bch_btree_map_keys(&op, dc->disk.c, &KEY(op.inode, 0, 0), + sectors_dirty_init_fn, 0); } int bch_cached_dev_writeback_init(struct cached_dev *dc) -- cgit v1.2.3 From 2599b53b7b0ea6103d1661dca74d35480cb8fa1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Jul 2013 18:11:11 -0700 Subject: bcache: Move sector allocator to alloc.c Just reorganizing things a bit. Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 180 ++++++++++++++++++++++++++++++++++++++++++ drivers/md/bcache/bcache.h | 4 + drivers/md/bcache/request.c | 186 +------------------------------------------- drivers/md/bcache/request.h | 5 +- 4 files changed, 189 insertions(+), 186 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index b9bd5866055d..4970ddc6a7f6 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -487,8 +487,188 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, return ret; } +/* Sector allocator */ + +struct open_bucket { + struct list_head list; + unsigned last_write_point; + unsigned sectors_free; + BKEY_PADDED(key); +}; + +/* + * We keep multiple buckets open for writes, and try to segregate different + * write streams for better cache utilization: first we look for a bucket where + * the last write to it was sequential with the current write, and failing that + * we look for a bucket that was last used by the same task. + * + * The ideas is if you've got multiple tasks pulling data into the cache at the + * same time, you'll get better cache utilization if you try to segregate their + * data and preserve locality. + * + * For example, say you've starting Firefox at the same time you're copying a + * bunch of files. Firefox will likely end up being fairly hot and stay in the + * cache awhile, but the data you copied might not be; if you wrote all that + * data to the same buckets it'd get invalidated at the same time. + * + * Both of those tasks will be doing fairly random IO so we can't rely on + * detecting sequential IO to segregate their data, but going off of the task + * should be a sane heuristic. 
+ */ +static struct open_bucket *pick_data_bucket(struct cache_set *c, + const struct bkey *search, + unsigned write_point, + struct bkey *alloc) +{ + struct open_bucket *ret, *ret_task = NULL; + + list_for_each_entry_reverse(ret, &c->data_buckets, list) + if (!bkey_cmp(&ret->key, search)) + goto found; + else if (ret->last_write_point == write_point) + ret_task = ret; + + ret = ret_task ?: list_first_entry(&c->data_buckets, + struct open_bucket, list); +found: + if (!ret->sectors_free && KEY_PTRS(alloc)) { + ret->sectors_free = c->sb.bucket_size; + bkey_copy(&ret->key, alloc); + bkey_init(alloc); + } + + if (!ret->sectors_free) + ret = NULL; + + return ret; +} + +/* + * Allocates some space in the cache to write to, and k to point to the newly + * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the + * end of the newly allocated space). + * + * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many + * sectors were actually allocated. + * + * If s->writeback is true, will not fail. + */ +bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, + unsigned write_point, unsigned write_prio, bool wait) +{ + struct open_bucket *b; + BKEY_PADDED(key) alloc; + unsigned i; + + /* + * We might have to allocate a new bucket, which we can't do with a + * spinlock held. So if we have to allocate, we drop the lock, allocate + * and then retry. KEY_PTRS() indicates whether alloc points to + * allocated bucket(s). + */ + + bkey_init(&alloc.key); + spin_lock(&c->data_bucket_lock); + + while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { + unsigned watermark = write_prio + ? WATERMARK_MOVINGGC + : WATERMARK_NONE; + + spin_unlock(&c->data_bucket_lock); + + if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait)) + return false; + + spin_lock(&c->data_bucket_lock); + } + + /* + * If we had to allocate, we might race and not need to allocate the + * second time we call find_data_bucket(). If we allocated a bucket but + * didn't use it, drop the refcount bch_bucket_alloc_set() took: + */ + if (KEY_PTRS(&alloc.key)) + __bkey_put(c, &alloc.key); + + for (i = 0; i < KEY_PTRS(&b->key); i++) + EBUG_ON(ptr_stale(c, &b->key, i)); + + /* Set up the pointer to the space we're allocating: */ + + for (i = 0; i < KEY_PTRS(&b->key); i++) + k->ptr[i] = b->key.ptr[i]; + + sectors = min(sectors, b->sectors_free); + + SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); + SET_KEY_SIZE(k, sectors); + SET_KEY_PTRS(k, KEY_PTRS(&b->key)); + + /* + * Move b to the end of the lru, and keep track of what this bucket was + * last used for: + */ + list_move_tail(&b->list, &c->data_buckets); + bkey_copy_key(&b->key, k); + b->last_write_point = write_point; + + b->sectors_free -= sectors; + + for (i = 0; i < KEY_PTRS(&b->key); i++) { + SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); + + atomic_long_add(sectors, + &PTR_CACHE(c, &b->key, i)->sectors_written); + } + + if (b->sectors_free < c->sb.block_size) + b->sectors_free = 0; + + /* + * k takes refcounts on the buckets it points to until it's inserted + * into the btree, but if we're done with this bucket we just transfer + * get_data_bucket()'s refcount. 
+ */ + if (b->sectors_free) + for (i = 0; i < KEY_PTRS(&b->key); i++) + atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); + + spin_unlock(&c->data_bucket_lock); + return true; +} + /* Init */ +void bch_open_buckets_free(struct cache_set *c) +{ + struct open_bucket *b; + + while (!list_empty(&c->data_buckets)) { + b = list_first_entry(&c->data_buckets, + struct open_bucket, list); + list_del(&b->list); + kfree(b); + } +} + +int bch_open_buckets_alloc(struct cache_set *c) +{ + int i; + + spin_lock_init(&c->data_bucket_lock); + + for (i = 0; i < 6; i++) { + struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); + if (!b) + return -ENOMEM; + + list_add(&b->list, &c->data_buckets); + } + + return 0; +} + int bch_cache_allocator_start(struct cache *ca) { struct task_struct *k = kthread_run(bch_allocator_thread, diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 20fe96c121d9..e32f6fd91755 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -1170,6 +1170,8 @@ int __bch_bucket_alloc_set(struct cache_set *, unsigned, struct bkey *, int, bool); int bch_bucket_alloc_set(struct cache_set *, unsigned, struct bkey *, int, bool); +bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned, + unsigned, unsigned, bool); __printf(2, 3) bool bch_cache_set_error(struct cache_set *, const char *, ...); @@ -1210,6 +1212,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *); void bch_btree_cache_free(struct cache_set *); int bch_btree_cache_alloc(struct cache_set *); void bch_moving_init_cache_set(struct cache_set *); +int bch_open_buckets_alloc(struct cache_set *); +void bch_open_buckets_free(struct cache_set *); int bch_cache_allocator_start(struct cache *ca); int bch_cache_allocator_init(struct cache *ca); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 05c7c216f65e..cf7850a7592c 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -255,186 +255,6 @@ static void bch_data_insert_keys(struct closure *cl) closure_return(cl); } -struct open_bucket { - struct list_head list; - struct task_struct *last; - unsigned sectors_free; - BKEY_PADDED(key); -}; - -void bch_open_buckets_free(struct cache_set *c) -{ - struct open_bucket *b; - - while (!list_empty(&c->data_buckets)) { - b = list_first_entry(&c->data_buckets, - struct open_bucket, list); - list_del(&b->list); - kfree(b); - } -} - -int bch_open_buckets_alloc(struct cache_set *c) -{ - int i; - - spin_lock_init(&c->data_bucket_lock); - - for (i = 0; i < 6; i++) { - struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); - if (!b) - return -ENOMEM; - - list_add(&b->list, &c->data_buckets); - } - - return 0; -} - -/* - * We keep multiple buckets open for writes, and try to segregate different - * write streams for better cache utilization: first we look for a bucket where - * the last write to it was sequential with the current write, and failing that - * we look for a bucket that was last used by the same task. - * - * The ideas is if you've got multiple tasks pulling data into the cache at the - * same time, you'll get better cache utilization if you try to segregate their - * data and preserve locality. - * - * For example, say you've starting Firefox at the same time you're copying a - * bunch of files. Firefox will likely end up being fairly hot and stay in the - * cache awhile, but the data you copied might not be; if you wrote all that - * data to the same buckets it'd get invalidated at the same time. 
- * - * Both of those tasks will be doing fairly random IO so we can't rely on - * detecting sequential IO to segregate their data, but going off of the task - * should be a sane heuristic. - */ -static struct open_bucket *pick_data_bucket(struct cache_set *c, - const struct bkey *search, - struct task_struct *task, - struct bkey *alloc) -{ - struct open_bucket *ret, *ret_task = NULL; - - list_for_each_entry_reverse(ret, &c->data_buckets, list) - if (!bkey_cmp(&ret->key, search)) - goto found; - else if (ret->last == task) - ret_task = ret; - - ret = ret_task ?: list_first_entry(&c->data_buckets, - struct open_bucket, list); -found: - if (!ret->sectors_free && KEY_PTRS(alloc)) { - ret->sectors_free = c->sb.bucket_size; - bkey_copy(&ret->key, alloc); - bkey_init(alloc); - } - - if (!ret->sectors_free) - ret = NULL; - - return ret; -} - -/* - * Allocates some space in the cache to write to, and k to point to the newly - * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the - * end of the newly allocated space). - * - * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many - * sectors were actually allocated. - * - * If s->writeback is true, will not fail. - */ -static bool bch_alloc_sectors(struct data_insert_op *op, - struct bkey *k, unsigned sectors) -{ - struct cache_set *c = op->c; - struct open_bucket *b; - BKEY_PADDED(key) alloc; - unsigned i; - - /* - * We might have to allocate a new bucket, which we can't do with a - * spinlock held. So if we have to allocate, we drop the lock, allocate - * and then retry. KEY_PTRS() indicates whether alloc points to - * allocated bucket(s). - */ - - bkey_init(&alloc.key); - spin_lock(&c->data_bucket_lock); - - while (!(b = pick_data_bucket(c, k, op->task, &alloc.key))) { - unsigned watermark = op->write_prio - ? WATERMARK_MOVINGGC - : WATERMARK_NONE; - - spin_unlock(&c->data_bucket_lock); - - if (bch_bucket_alloc_set(c, watermark, &alloc.key, - 1, op->writeback)) - return false; - - spin_lock(&c->data_bucket_lock); - } - - /* - * If we had to allocate, we might race and not need to allocate the - * second time we call find_data_bucket(). If we allocated a bucket but - * didn't use it, drop the refcount bch_bucket_alloc_set() took: - */ - if (KEY_PTRS(&alloc.key)) - __bkey_put(c, &alloc.key); - - for (i = 0; i < KEY_PTRS(&b->key); i++) - EBUG_ON(ptr_stale(c, &b->key, i)); - - /* Set up the pointer to the space we're allocating: */ - - for (i = 0; i < KEY_PTRS(&b->key); i++) - k->ptr[i] = b->key.ptr[i]; - - sectors = min(sectors, b->sectors_free); - - SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); - SET_KEY_SIZE(k, sectors); - SET_KEY_PTRS(k, KEY_PTRS(&b->key)); - - /* - * Move b to the end of the lru, and keep track of what this bucket was - * last used for: - */ - list_move_tail(&b->list, &c->data_buckets); - bkey_copy_key(&b->key, k); - b->last = op->task; - - b->sectors_free -= sectors; - - for (i = 0; i < KEY_PTRS(&b->key); i++) { - SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); - - atomic_long_add(sectors, - &PTR_CACHE(c, &b->key, i)->sectors_written); - } - - if (b->sectors_free < c->sb.block_size) - b->sectors_free = 0; - - /* - * k takes refcounts on the buckets it points to until it's inserted - * into the btree, but if we're done with this bucket we just transfer - * get_data_bucket()'s refcount. 
- */ - if (b->sectors_free) - for (i = 0; i < KEY_PTRS(&b->key); i++) - atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); - - spin_unlock(&c->data_bucket_lock); - return true; -} - static void bch_data_invalidate(struct closure *cl) { struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); @@ -545,7 +365,9 @@ static void bch_data_insert_start(struct closure *cl) SET_KEY_INODE(k, op->inode); SET_KEY_OFFSET(k, bio->bi_sector); - if (!bch_alloc_sectors(op, k, bio_sectors(bio))) + if (!bch_alloc_sectors(op->c, k, bio_sectors(bio), + op->write_point, op->write_prio, + op->writeback)) goto err; n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); @@ -968,7 +790,7 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d) s->iop.c = d->c; s->d = d; s->op.lock = -1; - s->iop.task = current; + s->iop.write_point = hash_long((unsigned long) current, 16); s->orig_bio = bio; s->write = (bio->bi_rw & REQ_WRITE) != 0; s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 54d7de27356f..2cd65bf073c2 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -6,10 +6,10 @@ struct data_insert_op { struct closure cl; struct cache_set *c; - struct task_struct *task; struct bio *bio; unsigned inode; + uint16_t write_point; uint16_t write_prio; short error; @@ -31,9 +31,6 @@ struct data_insert_op { unsigned bch_get_congested(struct cache_set *); void bch_data_insert(struct closure *cl); -void bch_open_buckets_free(struct cache_set *); -int bch_open_buckets_alloc(struct cache_set *); - void bch_cached_dev_request_init(struct cached_dev *dc); void bch_flash_dev_request_init(struct bcache_device *d); -- cgit v1.2.3 From 81ab4190ac17df41686a37c97f701623276b652a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 31 Oct 2013 15:46:42 -0700 Subject: bcache: Pull on disk data structures out into a separate header Now, the on disk data structures are in a header that can be exported to userspace - and having them all centralized is nice too. 
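One immediate payoff - sketched here as a hypothetical userspace snippet, not something contained in this patch (dump_sb() is an invented name, and endianness and checksum verification are skipped for brevity) - is that a tool built against the exported header can read a device's superblock using nothing but the definitions in <linux/bcache.h>:

#include <linux/bcache.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int dump_sb(const char *dev)
{
	struct cache_sb sb;
	int fd = open(dev, O_RDONLY);

	if (fd < 0)
		return -1;

	/*
	 * The superblock lives at sector SB_SECTOR on both cache and
	 * backing devices.
	 */
	if (pread(fd, &sb, sizeof(sb), SB_SECTOR << 9) != (ssize_t) sizeof(sb)) {
		close(fd);
		return -1;
	}

	printf("version %llu, %s device\n",
	       (unsigned long long) sb.version,
	       SB_IS_BDEV(&sb) ? "backing" : "cache");

	close(fd);
	return 0;
}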
Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 244 +---------------------------- drivers/md/bcache/bset.c | 4 +- drivers/md/bcache/bset.h | 31 ---- drivers/md/bcache/btree.c | 2 +- drivers/md/bcache/journal.c | 4 +- drivers/md/bcache/journal.h | 37 ----- drivers/md/bcache/request.c | 9 +- drivers/md/bcache/super.c | 13 +- drivers/md/bcache/util.h | 10 -- include/uapi/linux/bcache.h | 373 ++++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 387 insertions(+), 340 deletions(-) create mode 100644 include/uapi/linux/bcache.h (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index e32f6fd91755..045cb99f1ca6 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -177,6 +177,7 @@ #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ +#include #include #include #include @@ -210,168 +211,6 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); #define GC_MARK_METADATA 2 BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14); -struct bkey { - uint64_t high; - uint64_t low; - uint64_t ptr[]; -}; - -/* Enough for a key with 6 pointers */ -#define BKEY_PAD 8 - -#define BKEY_PADDED(key) \ - union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; } - -/* Version 0: Cache device - * Version 1: Backing device - * Version 2: Seed pointer into btree node checksum - * Version 3: Cache device with new UUID format - * Version 4: Backing device with data offset - */ -#define BCACHE_SB_VERSION_CDEV 0 -#define BCACHE_SB_VERSION_BDEV 1 -#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 -#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 -#define BCACHE_SB_MAX_VERSION 4 - -#define SB_SECTOR 8 -#define SB_SIZE 4096 -#define SB_LABEL_SIZE 32 -#define SB_JOURNAL_BUCKETS 256U -/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ -#define MAX_CACHES_PER_SET 8 - -#define BDEV_DATA_START_DEFAULT 16 /* sectors */ - -struct cache_sb { - uint64_t csum; - uint64_t offset; /* sector where this sb was written */ - uint64_t version; - - uint8_t magic[16]; - - uint8_t uuid[16]; - union { - uint8_t set_uuid[16]; - uint64_t set_magic; - }; - uint8_t label[SB_LABEL_SIZE]; - - uint64_t flags; - uint64_t seq; - uint64_t pad[8]; - - union { - struct { - /* Cache devices */ - uint64_t nbuckets; /* device size */ - - uint16_t block_size; /* sectors */ - uint16_t bucket_size; /* sectors */ - - uint16_t nr_in_set; - uint16_t nr_this_dev; - }; - struct { - /* Backing devices */ - uint64_t data_offset; - - /* - * block_size from the cache device section is still used by - * backing devices, so don't add anything here until we fix - * things to not need it for backing devices anymore - */ - }; - }; - - uint32_t last_mount; /* time_t */ - - uint16_t first_bucket; - union { - uint16_t njournal_buckets; - uint16_t keys; - }; - uint64_t d[SB_JOURNAL_BUCKETS]; /* journal buckets */ -}; - -BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); -BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); -BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); -#define CACHE_REPLACEMENT_LRU 0U -#define CACHE_REPLACEMENT_FIFO 1U -#define CACHE_REPLACEMENT_RANDOM 2U - -BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); -#define CACHE_MODE_WRITETHROUGH 0U -#define CACHE_MODE_WRITEBACK 1U -#define CACHE_MODE_WRITEAROUND 2U -#define CACHE_MODE_NONE 3U -BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); -#define BDEV_STATE_NONE 0U -#define BDEV_STATE_CLEAN 1U -#define BDEV_STATE_DIRTY 2U -#define BDEV_STATE_STALE 3U - -/* Version 1: Seed pointer into btree 
node checksum - */ -#define BCACHE_BSET_VERSION 1 - -/* - * This is the on disk format for btree nodes - a btree node on disk is a list - * of these; within each set the keys are sorted - */ -struct bset { - uint64_t csum; - uint64_t magic; - uint64_t seq; - uint32_t version; - uint32_t keys; - - union { - struct bkey start[0]; - uint64_t d[0]; - }; -}; - -/* - * On disk format for priorities and gens - see super.c near prio_write() for - * more. - */ -struct prio_set { - uint64_t csum; - uint64_t magic; - uint64_t seq; - uint32_t version; - uint32_t pad; - - uint64_t next_bucket; - - struct bucket_disk { - uint16_t prio; - uint8_t gen; - } __attribute((packed)) data[]; -}; - -struct uuid_entry { - union { - struct { - uint8_t uuid[16]; - uint8_t label[32]; - uint32_t first_reg; - uint32_t last_reg; - uint32_t invalidated; - - uint32_t flags; - /* Size of flash only volumes */ - uint64_t sectors; - }; - - uint8_t pad[128]; - }; -}; - -BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); - #include "journal.h" #include "stats.h" struct search; @@ -868,12 +707,6 @@ static inline bool key_merging_disabled(struct cache_set *c) #endif } -static inline bool SB_IS_BDEV(const struct cache_sb *sb) -{ - return sb->version == BCACHE_SB_VERSION_BDEV - || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; -} - struct bbio { unsigned submit_time_us; union { @@ -927,59 +760,6 @@ static inline unsigned local_clock_us(void) #define prio_buckets(c) \ DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c)) -#define JSET_MAGIC 0x245235c1a3625032ULL -#define PSET_MAGIC 0x6750e15f87337f91ULL -#define BSET_MAGIC 0x90135c78b99e07f5ULL - -#define jset_magic(c) ((c)->sb.set_magic ^ JSET_MAGIC) -#define pset_magic(c) ((c)->sb.set_magic ^ PSET_MAGIC) -#define bset_magic(c) ((c)->sb.set_magic ^ BSET_MAGIC) - -/* Bkey fields: all units are in sectors */ - -#define KEY_FIELD(name, field, offset, size) \ - BITMASK(name, struct bkey, field, offset, size) - -#define PTR_FIELD(name, offset, size) \ - static inline uint64_t name(const struct bkey *k, unsigned i) \ - { return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); } \ - \ - static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\ - { \ - k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset); \ - k->ptr[i] |= v << offset; \ - } - -KEY_FIELD(KEY_PTRS, high, 60, 3) -KEY_FIELD(HEADER_SIZE, high, 58, 2) -KEY_FIELD(KEY_CSUM, high, 56, 2) -KEY_FIELD(KEY_PINNED, high, 55, 1) -KEY_FIELD(KEY_DIRTY, high, 36, 1) - -KEY_FIELD(KEY_SIZE, high, 20, 16) -KEY_FIELD(KEY_INODE, high, 0, 20) - -/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */ - -static inline uint64_t KEY_OFFSET(const struct bkey *k) -{ - return k->low; -} - -static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v) -{ - k->low = v; -} - -PTR_FIELD(PTR_DEV, 51, 12) -PTR_FIELD(PTR_OFFSET, 8, 43) -PTR_FIELD(PTR_GEN, 0, 8) - -#define PTR_CHECK_DEV ((1 << 12) - 1) - -#define PTR(gen, offset, dev) \ - ((((uint64_t) dev) << 51) | ((uint64_t) offset) << 8 | gen) - static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) { return s >> c->bucket_bits; @@ -1018,31 +798,11 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c, /* Btree key macros */ -/* - * The high bit being set is a relic from when we used it to do binary - * searches - it told you where a key started. It's not used anymore, - * and can probably be safely dropped. 
- */ -#define KEY(dev, sector, len) \ -((struct bkey) { \ - .high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev), \ - .low = (sector) \ -}) - static inline void bkey_init(struct bkey *k) { - *k = KEY(0, 0, 0); + *k = ZERO_KEY; } -#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) -#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) - -#define MAX_KEY_INODE (~(~0 << 20)) -#define MAX_KEY_OFFSET (((uint64_t) ~0) >> 1) -#define MAX_KEY KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0) - -#define ZERO_KEY KEY(0, 0, 0) - /* * This is used for various on disk data structures - cache_sb, prio_set, bset, * jset: The checksum is _always_ the first 8 bytes of these structs diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index f7b5525ddafa..7b8713c66050 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -684,7 +684,7 @@ void bch_bset_init_next(struct btree *b) } else get_random_bytes(&i->seq, sizeof(uint64_t)); - i->magic = bset_magic(b->c); + i->magic = bset_magic(&b->c->sb); i->version = 0; i->keys = 0; @@ -1034,7 +1034,7 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, * memcpy() */ - out->magic = bset_magic(b->c); + out->magic = bset_magic(&b->c->sb); out->seq = b->sets[0].data->seq; out->version = b->sets[0].data->version; swap(out, b->sets[0].data); diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 8a9305685b7e..5cd90565dfe2 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -193,37 +193,6 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l, : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); } -static inline size_t bkey_u64s(const struct bkey *k) -{ - BUG_ON(KEY_CSUM(k) > 1); - return 2 + KEY_PTRS(k) + (KEY_CSUM(k) ? 1 : 0); -} - -static inline size_t bkey_bytes(const struct bkey *k) -{ - return bkey_u64s(k) * sizeof(uint64_t); -} - -static inline void bkey_copy(struct bkey *dest, const struct bkey *src) -{ - memcpy(dest, src, bkey_bytes(src)); -} - -static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src) -{ - if (!src) - src = &KEY(0, 0, 0); - - SET_KEY_INODE(dest, KEY_INODE(src)); - SET_KEY_OFFSET(dest, KEY_OFFSET(src)); -} - -static inline struct bkey *bkey_next(const struct bkey *k) -{ - uint64_t *d = (void *) k; - return (struct bkey *) (d + bkey_u64s(k)); -} - /* Keylists */ struct keylist { diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index f5aa4adadf1d..aba787d954e5 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -231,7 +231,7 @@ static void bch_btree_node_read_done(struct btree *b) goto err; err = "bad magic"; - if (i->magic != bset_magic(b->c)) + if (i->magic != bset_magic(&b->c->sb)) goto err; err = "bad checksum"; diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 86de64a6bf26..ecdaa671bd50 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -74,7 +74,7 @@ reread: left = ca->sb.bucket_size - offset; struct list_head *where; size_t blocks, bytes = set_bytes(j); - if (j->magic != jset_magic(ca->set)) + if (j->magic != jset_magic(&ca->sb)) return ret; if (bytes > left << 9) @@ -596,7 +596,7 @@ static void journal_write_unlocked(struct closure *cl) for_each_cache(ca, c, i) w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; - w->data->magic = jset_magic(c); + w->data->magic = jset_magic(&c->sb); w->data->version = BCACHE_JSET_VERSION; w->data->last_seq = last_seq(&c->journal); w->data->csum = csum_set(w->data); diff --git 
a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index 5e9edb9ef376..a6472fda94b2 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -75,43 +75,6 @@ * nodes that are pinning the oldest journal entries first. */ -#define BCACHE_JSET_VERSION_UUIDv1 1 -/* Always latest UUID format */ -#define BCACHE_JSET_VERSION_UUID 1 -#define BCACHE_JSET_VERSION 1 - -/* - * On disk format for a journal entry: - * seq is monotonically increasing; every journal entry has its own unique - * sequence number. - * - * last_seq is the oldest journal entry that still has keys the btree hasn't - * flushed to disk yet. - * - * version is for on disk format changes. - */ -struct jset { - uint64_t csum; - uint64_t magic; - uint64_t seq; - uint32_t version; - uint32_t keys; - - uint64_t last_seq; - - BKEY_PADDED(uuid_bucket); - BKEY_PADDED(btree_root); - uint16_t btree_level; - uint16_t pad[3]; - - uint64_t prio_bucket[MAX_CACHES_PER_SET]; - - union { - struct bkey start[0]; - uint64_t d[0]; - }; -}; - /* * Only used for holding the journal entries we read in btree_journal_read() * during cache_registration diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index cf7850a7592c..932300f18973 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -264,16 +264,17 @@ static void bch_data_invalidate(struct closure *cl) bio_sectors(bio), (uint64_t) bio->bi_sector); while (bio_sectors(bio)) { - unsigned len = min(bio_sectors(bio), 1U << 14); + unsigned sectors = min(bio_sectors(bio), + 1U << (KEY_SIZE_BITS - 1)); if (bch_keylist_realloc(&op->insert_keys, 0, op->c)) goto out; - bio->bi_sector += len; - bio->bi_size -= len << 9; + bio->bi_sector += sectors; + bio->bi_size -= sectors << 9; bch_keylist_add(&op->insert_keys, - &KEY(op->inode, bio->bi_sector, len)); + &KEY(op->inode, bio->bi_sector, sectors)); } op->insert_data_done = true; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a314c771263f..c67d19a8913d 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -45,15 +45,6 @@ const char * const bch_cache_modes[] = { NULL }; -struct uuid_entry_v0 { - uint8_t uuid[16]; - uint8_t label[32]; - uint32_t first_reg; - uint32_t last_reg; - uint32_t invalidated; - uint32_t pad; -}; - static struct kobject *bcache_kobj; struct mutex bch_register_lock; LIST_HEAD(bch_cache_sets); @@ -562,7 +553,7 @@ void bch_prio_write(struct cache *ca) } p->next_bucket = ca->prio_buckets[i + 1]; - p->magic = pset_magic(ca); + p->magic = pset_magic(&ca->sb); p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, true); @@ -613,7 +604,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8)) pr_warn("bad csum reading priorities"); - if (p->magic != pset_magic(ca)) + if (p->magic != pset_magic(&ca->sb)) pr_warn("bad magic reading priorities"); bucket = p->next_bucket; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index ea345c6896f4..38ae7a4ce928 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -27,16 +27,6 @@ struct closure; #endif -#define BITMASK(name, type, field, offset, size) \ -static inline uint64_t name(const type *k) \ -{ return (k->field >> offset) & ~(((uint64_t) ~0) << size); } \ - \ -static inline void SET_##name(type *k, uint64_t v) \ -{ \ - k->field &= ~(~((uint64_t) ~0 << size) << offset); \ - k->field |= v << offset; \ -} - #define DECLARE_HEAP(type, name) \ struct { \ 
size_t size, used; \ diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h new file mode 100644 index 000000000000..164a7e263988 --- /dev/null +++ b/include/uapi/linux/bcache.h @@ -0,0 +1,373 @@ +#ifndef _LINUX_BCACHE_H +#define _LINUX_BCACHE_H + +/* + * Bcache on disk data structures + */ + +#include + +#define BITMASK(name, type, field, offset, size) \ +static inline __u64 name(const type *k) \ +{ return (k->field >> offset) & ~(~0ULL << size); } \ + \ +static inline void SET_##name(type *k, __u64 v) \ +{ \ + k->field &= ~(~(~0ULL << size) << offset); \ + k->field |= (v & ~(~0ULL << size)) << offset; \ +} + +/* Btree keys - all units are in sectors */ + +struct bkey { + __u64 high; + __u64 low; + __u64 ptr[]; +}; + +#define KEY_FIELD(name, field, offset, size) \ + BITMASK(name, struct bkey, field, offset, size) + +#define PTR_FIELD(name, offset, size) \ +static inline __u64 name(const struct bkey *k, unsigned i) \ +{ return (k->ptr[i] >> offset) & ~(~0ULL << size); } \ + \ +static inline void SET_##name(struct bkey *k, unsigned i, __u64 v) \ +{ \ + k->ptr[i] &= ~(~(~0ULL << size) << offset); \ + k->ptr[i] |= (v & ~(~0ULL << size)) << offset; \ +} + +#define KEY_SIZE_BITS 16 + +KEY_FIELD(KEY_PTRS, high, 60, 3) +KEY_FIELD(HEADER_SIZE, high, 58, 2) +KEY_FIELD(KEY_CSUM, high, 56, 2) +KEY_FIELD(KEY_PINNED, high, 55, 1) +KEY_FIELD(KEY_DIRTY, high, 36, 1) + +KEY_FIELD(KEY_SIZE, high, 20, KEY_SIZE_BITS) +KEY_FIELD(KEY_INODE, high, 0, 20) + +/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */ + +static inline __u64 KEY_OFFSET(const struct bkey *k) +{ + return k->low; +} + +static inline void SET_KEY_OFFSET(struct bkey *k, __u64 v) +{ + k->low = v; +} + +/* + * The high bit being set is a relic from when we used it to do binary + * searches - it told you where a key started. It's not used anymore, + * and can probably be safely dropped. 
+ */ +#define KEY(inode, offset, size) \ +((struct bkey) { \ + .high = (1ULL << 63) | ((__u64) (size) << 20) | (inode), \ + .low = (offset) \ +}) + +#define ZERO_KEY KEY(0, 0, 0) + +#define MAX_KEY_INODE (~(~0 << 20)) +#define MAX_KEY_OFFSET (~0ULL >> 1) +#define MAX_KEY KEY(MAX_KEY_INODE, MAX_KEY_OFFSET, 0) + +#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) +#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) + +#define PTR_DEV_BITS 12 + +PTR_FIELD(PTR_DEV, 51, PTR_DEV_BITS) +PTR_FIELD(PTR_OFFSET, 8, 43) +PTR_FIELD(PTR_GEN, 0, 8) + +#define PTR_CHECK_DEV ((1 << PTR_DEV_BITS) - 1) + +#define PTR(gen, offset, dev) \ + ((((__u64) dev) << 51) | ((__u64) offset) << 8 | gen) + +/* Bkey utility code */ + +static inline unsigned long bkey_u64s(const struct bkey *k) +{ + return (sizeof(struct bkey) / sizeof(__u64)) + KEY_PTRS(k); +} + +static inline unsigned long bkey_bytes(const struct bkey *k) +{ + return bkey_u64s(k) * sizeof(__u64); +} + +#define bkey_copy(_dest, _src) memcpy(_dest, _src, bkey_bytes(_src)) + +static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src) +{ + SET_KEY_INODE(dest, KEY_INODE(src)); + SET_KEY_OFFSET(dest, KEY_OFFSET(src)); +} + +static inline struct bkey *bkey_next(const struct bkey *k) +{ + __u64 *d = (void *) k; + return (struct bkey *) (d + bkey_u64s(k)); +} + +static inline struct bkey *bkey_last(const struct bkey *k, unsigned nr_keys) +{ + __u64 *d = (void *) k; + return (struct bkey *) (d + nr_keys); +} +/* Enough for a key with 6 pointers */ +#define BKEY_PAD 8 + +#define BKEY_PADDED(key) \ + union { struct bkey key; __u64 key ## _pad[BKEY_PAD]; } + +/* Superblock */ + +/* Version 0: Cache device + * Version 1: Backing device + * Version 2: Seed pointer into btree node checksum + * Version 3: Cache device with new UUID format + * Version 4: Backing device with data offset + */ +#define BCACHE_SB_VERSION_CDEV 0 +#define BCACHE_SB_VERSION_BDEV 1 +#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 +#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 +#define BCACHE_SB_MAX_VERSION 4 + +#define SB_SECTOR 8 +#define SB_SIZE 4096 +#define SB_LABEL_SIZE 32 +#define SB_JOURNAL_BUCKETS 256U +/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ +#define MAX_CACHES_PER_SET 8 + +#define BDEV_DATA_START_DEFAULT 16 /* sectors */ + +struct cache_sb { + __u64 csum; + __u64 offset; /* sector where this sb was written */ + __u64 version; + + __u8 magic[16]; + + __u8 uuid[16]; + union { + __u8 set_uuid[16]; + __u64 set_magic; + }; + __u8 label[SB_LABEL_SIZE]; + + __u64 flags; + __u64 seq; + __u64 pad[8]; + + union { + struct { + /* Cache devices */ + __u64 nbuckets; /* device size */ + + __u16 block_size; /* sectors */ + __u16 bucket_size; /* sectors */ + + __u16 nr_in_set; + __u16 nr_this_dev; + }; + struct { + /* Backing devices */ + __u64 data_offset; + + /* + * block_size from the cache device section is still used by + * backing devices, so don't add anything here until we fix + * things to not need it for backing devices anymore + */ + }; + }; + + __u32 last_mount; /* time_t */ + + __u16 first_bucket; + union { + __u16 njournal_buckets; + __u16 keys; + }; + __u64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ +}; + +static inline _Bool SB_IS_BDEV(const struct cache_sb *sb) +{ + return sb->version == BCACHE_SB_VERSION_BDEV + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; +} + +BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); +BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); +BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); +#define 
CACHE_REPLACEMENT_LRU 0U +#define CACHE_REPLACEMENT_FIFO 1U +#define CACHE_REPLACEMENT_RANDOM 2U + +BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); +#define CACHE_MODE_WRITETHROUGH 0U +#define CACHE_MODE_WRITEBACK 1U +#define CACHE_MODE_WRITEAROUND 2U +#define CACHE_MODE_NONE 3U +BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); +#define BDEV_STATE_NONE 0U +#define BDEV_STATE_CLEAN 1U +#define BDEV_STATE_DIRTY 2U +#define BDEV_STATE_STALE 3U + +/* + * Magic numbers + * + * The various other data structures have their own magic numbers, which are + * xored with the first part of the cache set's UUID + */ + +#define JSET_MAGIC 0x245235c1a3625032ULL +#define PSET_MAGIC 0x6750e15f87337f91ULL +#define BSET_MAGIC 0x90135c78b99e07f5ULL + +static inline __u64 jset_magic(struct cache_sb *sb) +{ + return sb->set_magic ^ JSET_MAGIC; +} + +static inline __u64 pset_magic(struct cache_sb *sb) +{ + return sb->set_magic ^ PSET_MAGIC; +} + +static inline __u64 bset_magic(struct cache_sb *sb) +{ + return sb->set_magic ^ BSET_MAGIC; +} + +/* + * Journal + * + * On disk format for a journal entry: + * seq is monotonically increasing; every journal entry has its own unique + * sequence number. + * + * last_seq is the oldest journal entry that still has keys the btree hasn't + * flushed to disk yet. + * + * version is for on disk format changes. + */ + +#define BCACHE_JSET_VERSION_UUIDv1 1 +#define BCACHE_JSET_VERSION_UUID 1 /* Always latest UUID format */ +#define BCACHE_JSET_VERSION 1 + +struct jset { + __u64 csum; + __u64 magic; + __u64 seq; + __u32 version; + __u32 keys; + + __u64 last_seq; + + BKEY_PADDED(uuid_bucket); + BKEY_PADDED(btree_root); + __u16 btree_level; + __u16 pad[3]; + + __u64 prio_bucket[MAX_CACHES_PER_SET]; + + union { + struct bkey start[0]; + __u64 d[0]; + }; +}; + +/* Bucket prios/gens */ + +struct prio_set { + __u64 csum; + __u64 magic; + __u64 seq; + __u32 version; + __u32 pad; + + __u64 next_bucket; + + struct bucket_disk { + __u16 prio; + __u8 gen; + } __attribute((packed)) data[]; +}; + +/* UUIDS - per backing device/flash only volume metadata */ + +struct uuid_entry { + union { + struct { + __u8 uuid[16]; + __u8 label[32]; + __u32 first_reg; + __u32 last_reg; + __u32 invalidated; + + __u32 flags; + /* Size of flash only volumes */ + __u64 sectors; + }; + + __u8 pad[128]; + }; +}; + +BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1); + +/* Btree nodes */ + +/* Version 1: Seed pointer into btree node checksum + */ +#define BCACHE_BSET_CSUM 1 +#define BCACHE_BSET_VERSION 1 + +/* + * Btree nodes + * + * On disk a btree node is a list/log of these; within each set the keys are + * sorted + */ +struct bset { + __u64 csum; + __u64 magic; + __u64 seq; + __u32 version; + __u32 keys; + + union { + struct bkey start[0]; + __u64 d[0]; + }; +}; + +/* OBSOLETE */ + +/* UUIDS - per backing device/flash only volume metadata */ + +struct uuid_entry_v0 { + __u8 uuid[16]; + __u8 label[32]; + __u32 first_reg; + __u32 last_reg; + __u32 invalidated; + __u32 pad; +}; + +#endif /* _LINUX_BCACHE_H */ -- cgit v1.2.3 From 280481d06c8a683d9aaa26125476222e76b733c5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Oct 2013 16:36:03 -0700 Subject: bcache: Debug code improvements Couple changes: * Consolidate bch_check_keys() and bch_check_key_order(), and move the checks that only check_key_order() could do to bch_btree_iter_next(). 
* Get rid of CONFIG_BCACHE_EDEBUG - now, all that code is compiled in when CONFIG_BCACHE_DEBUG is enabled, and there's now a sysfs file to flip on the EDEBUG checks at runtime. * Dropped an old not terribly useful check in rw_unlock(), and refactored/improved a some of the other debug code. Signed-off-by: Kent Overstreet --- drivers/md/bcache/Kconfig | 11 +--- drivers/md/bcache/alloc.c | 5 +- drivers/md/bcache/bcache.h | 10 +--- drivers/md/bcache/bset.c | 112 ++++++++++++++++++++----------------- drivers/md/bcache/bset.h | 3 + drivers/md/bcache/btree.c | 8 ++- drivers/md/bcache/btree.h | 8 --- drivers/md/bcache/debug.c | 136 +++++++++++++++++++-------------------------- drivers/md/bcache/debug.h | 46 +++++++-------- drivers/md/bcache/sysfs.c | 5 ++ drivers/md/bcache/util.h | 4 +- 11 files changed, 162 insertions(+), 186 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index f950c9d29f3e..2638417b19aa 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -13,15 +13,8 @@ config BCACHE_DEBUG ---help--- Don't select this option unless you're a developer - Enables extra debugging tools (primarily a fuzz tester) - -config BCACHE_EDEBUG - bool "Extended runtime checks" - depends on BCACHE - ---help--- - Don't select this option unless you're a developer - - Enables extra runtime checks which significantly affect performance + Enables extra debugging tools, allows expensive runtime checks to be + turned on. config BCACHE_CLOSURES_DEBUG bool "Debug closures" diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 4970ddc6a7f6..ed5920b20c61 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -398,8 +398,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait) out: wake_up_process(ca->alloc_thread); -#ifdef CONFIG_BCACHE_EDEBUG - { + if (expensive_debug_checks(ca->set)) { size_t iter; long i; @@ -413,7 +412,7 @@ out: fifo_for_each(i, &ca->unused, iter) BUG_ON(i == r); } -#endif + b = ca->buckets + r; BUG_ON(atomic_read(&b->pin) != 1); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 045cb99f1ca6..d03bc6f66493 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -690,6 +690,7 @@ struct cache_set { unsigned short journal_delay_ms; unsigned verify:1; unsigned key_merging_disabled:1; + unsigned expensive_debug_checks:1; unsigned gc_always_rewrite:1; unsigned shrinker_disabled:1; unsigned copy_gc_enabled:1; @@ -698,15 +699,6 @@ struct cache_set { struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; }; -static inline bool key_merging_disabled(struct cache_set *c) -{ -#ifdef CONFIG_BCACHE_DEBUG - return c->key_merging_disabled; -#else - return 0; -#endif -} - struct bbio { unsigned submit_time_us; union { diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index f32216c75948..6bffde478926 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -106,6 +106,43 @@ bad: return true; } +static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k, + unsigned ptr) +{ + struct bucket *g = PTR_BUCKET(b->c, k, ptr); + char buf[80]; + + if (mutex_trylock(&b->c->bucket_lock)) { + if (b->level) { + if (KEY_DIRTY(k) || + g->prio != BTREE_PRIO || + (b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_METADATA)) + goto err; + + } else { + if (g->prio == BTREE_PRIO) + goto err; + + if (KEY_DIRTY(k) && + b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_DIRTY) + goto err; + } + 
mutex_unlock(&b->c->bucket_lock); + } + + return false; +err: + mutex_unlock(&b->c->bucket_lock); + bch_bkey_to_text(buf, sizeof(buf), k); + btree_bug(b, +"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", + buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin), + g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); + return true; +} + bool bch_ptr_bad(struct btree *b, const struct bkey *k) { struct bucket *g; @@ -133,46 +170,12 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k) if (stale) return true; -#ifdef CONFIG_BCACHE_EDEBUG - if (!mutex_trylock(&b->c->bucket_lock)) - continue; - - if (b->level) { - if (KEY_DIRTY(k) || - g->prio != BTREE_PRIO || - (b->c->gc_mark_valid && - GC_MARK(g) != GC_MARK_METADATA)) - goto bug; - - } else { - if (g->prio == BTREE_PRIO) - goto bug; - - if (KEY_DIRTY(k) && - b->c->gc_mark_valid && - GC_MARK(g) != GC_MARK_DIRTY) - goto bug; - } - mutex_unlock(&b->c->bucket_lock); -#endif + if (expensive_debug_checks(b->c) && + ptr_bad_expensive_checks(b, k, i)) + return true; } return false; -#ifdef CONFIG_BCACHE_EDEBUG -bug: - mutex_unlock(&b->c->bucket_lock); - - { - char buf[80]; - - bch_bkey_to_text(buf, sizeof(buf), k); - btree_bug(b, -"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", - buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), - g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); - } - return true; -#endif } /* Key/pointer manipulation */ @@ -821,16 +824,16 @@ struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, } else i = bset_search_write_set(b, t, search); -#ifdef CONFIG_BCACHE_EDEBUG - BUG_ON(bset_written(b, t) && - i.l != t->data->start && - bkey_cmp(tree_to_prev_bkey(t, - inorder_to_tree(bkey_to_cacheline(t, i.l), t)), - search) > 0); + if (expensive_debug_checks(b->c)) { + BUG_ON(bset_written(b, t) && + i.l != t->data->start && + bkey_cmp(tree_to_prev_bkey(t, + inorder_to_tree(bkey_to_cacheline(t, i.l), t)), + search) > 0); - BUG_ON(i.r != end(t->data) && - bkey_cmp(i.r, search) <= 0); -#endif + BUG_ON(i.r != end(t->data) && + bkey_cmp(i.r, search) <= 0); + } while (likely(i.l != i.r) && bkey_cmp(i.l, search) <= 0) @@ -871,12 +874,16 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, } struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, - struct bkey *search, struct bset_tree *start) + struct bkey *search, struct bset_tree *start) { struct bkey *ret = NULL; iter->size = ARRAY_SIZE(iter->data); iter->used = 0; +#ifdef CONFIG_BCACHE_DEBUG + iter->b = b; +#endif + for (; start <= &b->sets[b->nsets]; start++) { ret = bch_bset_search(b, start, search); bch_btree_iter_push(iter, ret, end(start->data)); @@ -891,6 +898,8 @@ struct bkey *bch_btree_iter_next(struct btree_iter *iter) struct bkey *ret = NULL; if (!btree_iter_end(iter)) { + bch_btree_iter_next_check(iter); + ret = iter->data->k; iter->data->k = bkey_next(iter->data->k); @@ -1002,7 +1011,6 @@ static void btree_mergesort(struct btree *b, struct bset *out, out->keys = last ? 
(uint64_t *) bkey_next(last) - out->d : 0; pr_debug("sorted %i keys", out->keys); - bch_check_key_order(b, out); } static void __btree_sort(struct btree *b, struct btree_iter *iter, @@ -1063,15 +1071,15 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, void bch_btree_sort_partial(struct btree *b, unsigned start) { - size_t oldsize = 0, order = b->page_order, keys = 0; + size_t order = b->page_order, keys = 0; struct btree_iter iter; + int oldsize = bch_count_data(b); + __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); BUG_ON(b->sets[b->nsets].data == write_block(b) && (b->sets[b->nsets].size || b->nsets)); - if (b->written) - oldsize = bch_count_data(b); if (start) { unsigned i; @@ -1087,7 +1095,7 @@ void bch_btree_sort_partial(struct btree *b, unsigned start) __btree_sort(b, &iter, start, order, false); - EBUG_ON(b->written && bch_count_data(b) != oldsize); + EBUG_ON(b->written && oldsize >= 0 && bch_count_data(b) != oldsize); } void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 5cd90565dfe2..a043a92d4dc9 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -148,6 +148,9 @@ struct btree_iter { size_t size, used; +#ifdef CONFIG_BCACHE_DEBUG + struct btree *b; +#endif struct btree_iter_set { struct bkey *k, *end; } data[MAX_BSETS]; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index aba787d954e5..fa4d0b1f6d75 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -216,6 +216,10 @@ static void bch_btree_node_read_done(struct btree *b) iter->size = b->c->sb.bucket_size / b->c->sb.block_size; iter->used = 0; +#ifdef CONFIG_BCACHE_DEBUG + iter->b = b; +#endif + if (!i->seq) goto err; @@ -454,7 +458,7 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) BUG_ON(b->written >= btree_blocks(b)); BUG_ON(b->written && !i->keys); BUG_ON(b->sets->data->seq != i->seq); - bch_check_key_order(b, i); + bch_check_keys(b, "writing"); cancel_delayed_work(&b->work); @@ -1917,7 +1921,7 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, struct bkey *replace_key) { bool ret = false; - unsigned oldsize = bch_count_data(b); + int oldsize = bch_count_data(b); while (!bch_keylist_empty(insert_keys)) { struct bset *i = write_block(b); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 8fc1e8925399..27e90b189112 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -259,14 +259,6 @@ static inline void rw_lock(bool w, struct btree *b, int level) static inline void rw_unlock(bool w, struct btree *b) { -#ifdef CONFIG_BCACHE_EDEBUG - unsigned i; - - if (w && b->key.ptr[0]) - for (i = 0; i <= b->nsets; i++) - bch_check_key_order(b, b->sets[i].data); -#endif - if (w) b->seq++; (w ? up_write : up_read)(&b->lock); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index d9ccb3169aa2..e99e6b8852b2 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -76,29 +76,17 @@ int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) return out - buf; } -int bch_btree_to_text(char *buf, size_t size, const struct btree *b) -{ - return scnprintf(buf, size, "%zu level %i/%i", - PTR_BUCKET_NR(b->c, &b->key, 0), - b->level, b->c->root ? b->c->root->level : -1); -} - -#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) - -static bool skipped_backwards(struct btree *b, struct bkey *k) -{ - return bkey_cmp(k, (!b->level) - ? 
&START_KEY(bkey_next(k)) - : bkey_next(k)) > 0; -} +#ifdef CONFIG_BCACHE_DEBUG static void dump_bset(struct btree *b, struct bset *i) { - struct bkey *k; + struct bkey *k, *next; unsigned j; char buf[80]; - for (k = i->start; k < end(i); k = bkey_next(k)) { + for (k = i->start; k < end(i); k = next) { + next = bkey_next(k); + bch_bkey_to_text(buf, sizeof(buf), k); printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), (uint64_t *) k - i->d, i->keys, buf); @@ -114,15 +102,21 @@ static void dump_bset(struct btree *b, struct bset *i) printk(" %s\n", bch_ptr_status(b->c, k)); - if (bkey_next(k) < end(i) && - skipped_backwards(b, k)) + if (next < end(i) && + bkey_cmp(k, !b->level ? &START_KEY(next) : next) > 0) printk(KERN_ERR "Key skipped backwards\n"); } } -#endif +static void bch_dump_bucket(struct btree *b) +{ + unsigned i; -#ifdef CONFIG_BCACHE_DEBUG + console_lock(); + for (i = 0; i <= b->nsets; i++) + dump_bset(b, b->sets[i].data); + console_unlock(); +} void bch_btree_verify(struct btree *b, struct bset *new) { @@ -211,11 +205,7 @@ out_put: bio_put(check); } -#endif - -#ifdef CONFIG_BCACHE_EDEBUG - -unsigned bch_count_data(struct btree *b) +int __bch_count_data(struct btree *b) { unsigned ret = 0; struct btree_iter iter; @@ -227,72 +217,60 @@ unsigned bch_count_data(struct btree *b) return ret; } -static void vdump_bucket_and_panic(struct btree *b, const char *fmt, - va_list args) -{ - unsigned i; - char buf[80]; - - console_lock(); - - for (i = 0; i <= b->nsets; i++) - dump_bset(b, b->sets[i].data); - - vprintk(fmt, args); - - console_unlock(); - - bch_btree_to_text(buf, sizeof(buf), b); - panic("at %s\n", buf); -} - -void bch_check_key_order_msg(struct btree *b, struct bset *i, - const char *fmt, ...) -{ - struct bkey *k; - - if (!i->keys) - return; - - for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k)) - if (skipped_backwards(b, k)) { - va_list args; - va_start(args, fmt); - - vdump_bucket_and_panic(b, fmt, args); - va_end(args); - } -} - -void bch_check_keys(struct btree *b, const char *fmt, ...) +void __bch_check_keys(struct btree *b, const char *fmt, ...) { va_list args; struct bkey *k, *p = NULL; struct btree_iter iter; - - if (b->level) - return; + const char *err; for_each_key(b, k, &iter) { - if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) { - printk(KERN_ERR "Keys out of order:\n"); - goto bug; - } - - if (bch_ptr_invalid(b, k)) - continue; - - if (p && bkey_cmp(p, &START_KEY(k)) > 0) { - printk(KERN_ERR "Overlapping keys:\n"); - goto bug; + if (!b->level) { + err = "Keys out of order"; + if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) + goto bug; + + if (bch_ptr_invalid(b, k)) + continue; + + err = "Overlapping keys"; + if (p && bkey_cmp(p, &START_KEY(k)) > 0) + goto bug; + } else { + if (bch_ptr_bad(b, k)) + continue; + + err = "Duplicate keys"; + if (p && !bkey_cmp(p, k)) + goto bug; } p = k; } + + err = "Key larger than btree node key"; + if (p && bkey_cmp(p, &b->key) > 0) + goto bug; + return; bug: + bch_dump_bucket(b); + va_start(args, fmt); - vdump_bucket_and_panic(b, fmt, args); + vprintk(fmt, args); va_end(args); + + panic("bcache error: %s:\n", err); +} + +void bch_btree_iter_next_check(struct btree_iter *iter) +{ + struct bkey *k = iter->data->k, *next = bkey_next(k); + + if (next < iter->data->end && + bkey_cmp(k, iter->b->level ? 
next : &START_KEY(next)) > 0) { + bch_dump_bucket(iter->b); + panic("Key skipped backwards\n"); + } } #endif diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index 0f4b3440512c..7914ba0ff316 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -4,40 +4,42 @@ /* Btree/bkey debug printing */ int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); -int bch_btree_to_text(char *buf, size_t size, const struct btree *b); - -#ifdef CONFIG_BCACHE_EDEBUG - -unsigned bch_count_data(struct btree *); -void bch_check_key_order_msg(struct btree *, struct bset *, const char *, ...); -void bch_check_keys(struct btree *, const char *, ...); - -#define bch_check_key_order(b, i) \ - bch_check_key_order_msg(b, i, "keys out of order") -#define EBUG_ON(cond) BUG_ON(cond) - -#else /* EDEBUG */ - -#define bch_count_data(b) 0 -#define bch_check_key_order(b, i) do {} while (0) -#define bch_check_key_order_msg(b, i, ...) do {} while (0) -#define bch_check_keys(b, ...) do {} while (0) -#define EBUG_ON(cond) do {} while (0) - -#endif #ifdef CONFIG_BCACHE_DEBUG void bch_btree_verify(struct btree *, struct bset *); void bch_data_verify(struct cached_dev *, struct bio *); +int __bch_count_data(struct btree *); +void __bch_check_keys(struct btree *, const char *, ...); +void bch_btree_iter_next_check(struct btree_iter *); + +#define EBUG_ON(cond) BUG_ON(cond) +#define expensive_debug_checks(c) ((c)->expensive_debug_checks) +#define key_merging_disabled(c) ((c)->key_merging_disabled) #else /* DEBUG */ static inline void bch_btree_verify(struct btree *b, struct bset *i) {} -static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {}; +static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} +static inline int __bch_count_data(struct btree *b) { return -1; } +static inline void __bch_check_keys(struct btree *b, const char *fmt, ...) {} +static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} + +#define EBUG_ON(cond) do { if (cond); } while (0) +#define expensive_debug_checks(c) 0 +#define key_merging_disabled(c) 0 #endif +#define bch_count_data(b) \ + (expensive_debug_checks((b)->c) ? __bch_count_data(b) : -1) + +#define bch_check_keys(b, ...) 
\ +do { \ + if (expensive_debug_checks((b)->c)) \ + __bch_check_keys(b, __VA_ARGS__); \ +} while (0) + #ifdef CONFIG_DEBUG_FS void bch_debug_init_cache_set(struct cache_set *); #else diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index ab286b9b5e40..9687771ec6f3 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -102,6 +102,7 @@ rw_attribute(io_error_halflife); rw_attribute(verify); rw_attribute(key_merging_disabled); rw_attribute(gc_always_rewrite); +rw_attribute(expensive_debug_checks); rw_attribute(freelist_percent); rw_attribute(cache_replacement_policy); rw_attribute(btree_shrinker_disabled); @@ -517,6 +518,8 @@ lock_root: sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); sysfs_printf(verify, "%i", c->verify); sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); + sysfs_printf(expensive_debug_checks, + "%i", c->expensive_debug_checks); sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); @@ -599,6 +602,7 @@ STORE(__bch_cache_set) sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); sysfs_strtoul(verify, c->verify); sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); + sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks); sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); @@ -674,6 +678,7 @@ static struct attribute *bch_cache_set_internal_files[] = { #ifdef CONFIG_BCACHE_DEBUG &sysfs_verify, &sysfs_key_merging_disabled, + &sysfs_expensive_debug_checks, #endif &sysfs_gc_always_rewrite, &sysfs_btree_shrinker_disabled, diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 38ae7a4ce928..8ce5aab55962 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -15,12 +15,12 @@ struct closure; -#ifdef CONFIG_BCACHE_EDEBUG +#ifdef CONFIG_BCACHE_DEBUG #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) #define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) -#else /* EDEBUG */ +#else /* DEBUG */ #define atomic_dec_bug(v) atomic_dec(v) #define atomic_inc_bug(v, i) atomic_inc(v) -- cgit v1.2.3 From a1f0358b2bf69be216cb6e4ea40fe7ae4d38b8a6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 19:07:00 -0700 Subject: bcache: Incremental gc Big garbage collection rewrite; now, garbage collection uses the same mechanisms as used elsewhere for inserting/updating btree node pointers, instead of rewriting interior btree nodes in place. This makes the code significantly cleaner and less fragile, and means we can now make garbage collection incremental - it doesn't have to hold a write lock on the root of the btree for the entire duration of garbage collection. This means that there's less of a latency hit for doing garbage collection, which means we can gc more frequently (and do a better job of reclaiming from the cache), and we can coalesce across more btree nodes (improving our space efficiency). 
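The control flow this enables can be sketched outside the kernel: a GC pass walks the keyspace from a saved cursor, bails out with -EINTR whenever coalescing (or a node rewrite) invalidates its iterator, and the top level simply re-runs passes until one completes - which is what the new do { ... } while (ret) loop in bch_btree_gc() does. The small C program below is a self-contained illustration of that restart-from-cursor pattern only; the names (gc_cursor, gc_pass, BATCH) are invented for the example and are not bcache's actual API.

/*
 * Minimal sketch (not bcache code) of incremental GC: each pass resumes
 * from a cursor, does a bounded amount of work, and returns -EINTR when
 * it stops early; the caller keeps re-running passes until one finishes.
 */
#include <stdio.h>
#include <errno.h>

#define NKEYS 100
#define BATCH 16                       /* work quantum before yielding */

static int keys[NKEYS];
static int gc_cursor;                  /* plays the role of c->gc_done */

/* One GC pass: resume at gc_cursor, process at most BATCH keys. */
static int gc_pass(void)
{
        int done = 0;

        while (gc_cursor < NKEYS) {
                keys[gc_cursor] = 0;   /* "mark"/reclaim this key */
                gc_cursor++;

                if (++done == BATCH)
                        return -EINTR; /* yield; caller will restart us */
        }
        return 0;                      /* whole keyspace covered */
}

int main(void)
{
        int ret, passes = 0;

        do {
                ret = gc_pass();
                passes++;
                /* foreground IO could make progress here between passes */
        } while (ret == -EINTR);

        printf("gc finished after %d passes\n", passes);
        return 0;
}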
Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 1 - drivers/md/bcache/btree.c | 388 ++++++++++++++++++++++++++------------------- drivers/md/bcache/btree.h | 2 +- drivers/md/bcache/sysfs.c | 2 - 4 files changed, 226 insertions(+), 167 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d03bc6f66493..d6970a651e42 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -477,7 +477,6 @@ struct gc_stat { size_t nkeys; uint64_t data; /* sectors */ - uint64_t dirty; /* sectors */ unsigned in_use; /* percent */ }; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index a3f8ca4ee6e0..7d283d217438 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1176,12 +1176,10 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) -static int btree_gc_mark_node(struct btree *b, unsigned *keys, - struct gc_stat *gc) +static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) { uint8_t stale = 0; - unsigned last_dev = -1; - struct bcache_device *d = NULL; + unsigned keys = 0, good_keys = 0; struct bkey *k; struct btree_iter iter; struct bset_tree *t; @@ -1189,27 +1187,17 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys, gc->nodes++; for_each_key_filter(b, k, &iter, bch_ptr_invalid) { - if (last_dev != KEY_INODE(k)) { - last_dev = KEY_INODE(k); - - d = KEY_INODE(k) < b->c->nr_uuids - ? b->c->devices[last_dev] - : NULL; - } - stale = max(stale, btree_mark_key(b, k)); + keys++; if (bch_ptr_bad(b, k)) continue; - *keys += bkey_u64s(k); - gc->key_bytes += bkey_u64s(k); gc->nkeys++; + good_keys++; gc->data += KEY_SIZE(k); - if (KEY_DIRTY(k)) - gc->dirty += KEY_SIZE(k); } for (t = b->sets; t <= &b->sets[b->nsets]; t++) @@ -1218,79 +1206,74 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys, bkey_cmp(&b->key, &t->end) < 0, b, "found short btree key in gc"); - return stale; -} + if (b->c->gc_always_rewrite) + return true; -static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k) -{ - /* - * We block priorities from being written for the duration of garbage - * collection, so we can't sleep in btree_alloc() -> - * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it - * our closure. - */ - struct btree *n = btree_node_alloc_replacement(b); - - if (!IS_ERR_OR_NULL(n)) { - swap(b, n); + if (stale > 10) + return true; - memcpy(k->ptr, b->key.ptr, - sizeof(uint64_t) * KEY_PTRS(&b->key)); - - btree_node_free(n); - up_write(&n->lock); - } + if ((keys - good_keys) * 2 > keys) + return true; - return b; + return false; } -/* - * Leaving this at 2 until we've got incremental garbage collection done; it - * could be higher (and has been tested with 4) except that garbage collection - * could take much longer, adversely affecting latency. 
- */ -#define GC_MERGE_NODES 2U +#define GC_MERGE_NODES 4U struct gc_merge_info { struct btree *b; - struct bkey *k; unsigned keys; }; -static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc, - struct gc_merge_info *r) +static int bch_btree_insert_node(struct btree *, struct btree_op *, + struct keylist *, atomic_t *, struct bkey *); + +static int btree_gc_coalesce(struct btree *b, struct btree_op *op, + struct keylist *keylist, struct gc_stat *gc, + struct gc_merge_info *r) { - unsigned nodes = 0, keys = 0, blocks; - int i; + unsigned i, nodes = 0, keys = 0, blocks; + struct btree *new_nodes[GC_MERGE_NODES]; struct closure cl; + struct bkey *k; + memset(new_nodes, 0, sizeof(new_nodes)); closure_init_stack(&cl); - while (nodes < GC_MERGE_NODES && r[nodes].b) + while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) keys += r[nodes++].keys; blocks = btree_default_blocks(b->c) * 2 / 3; if (nodes < 2 || __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) - return; - - for (i = nodes - 1; i >= 0; --i) { - if (r[i].b->written) - r[i].b = btree_gc_alloc(r[i].b, r[i].k); + return 0; - if (r[i].b->written) - return; + for (i = 0; i < nodes; i++) { + new_nodes[i] = btree_node_alloc_replacement(r[i].b); + if (IS_ERR_OR_NULL(new_nodes[i])) + goto out_nocoalesce; } for (i = nodes - 1; i > 0; --i) { - struct bset *n1 = r[i].b->sets->data; - struct bset *n2 = r[i - 1].b->sets->data; + struct bset *n1 = new_nodes[i]->sets->data; + struct bset *n2 = new_nodes[i - 1]->sets->data; struct bkey *k, *last = NULL; keys = 0; - if (i == 1) { + if (i > 1) { + for (k = n2->start; + k < end(n2); + k = bkey_next(k)) { + if (__set_blocks(n1, n1->keys + keys + + bkey_u64s(k), b->c) > blocks) + break; + + last = k; + keys += bkey_u64s(k); + } + } else { /* * Last node we're not getting rid of - we're getting * rid of the node at r[0]. 
Have to try and fit all of @@ -1299,37 +1282,27 @@ static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc, * length keys (shouldn't be possible in practice, * though) */ - if (__set_blocks(n1, n1->keys + r->keys, - b->c) > btree_blocks(r[i].b)) - return; + if (__set_blocks(n1, n1->keys + n2->keys, + b->c) > btree_blocks(new_nodes[i])) + goto out_nocoalesce; keys = n2->keys; + /* Take the key of the node we're getting rid of */ last = &r->b->key; - } else - for (k = n2->start; - k < end(n2); - k = bkey_next(k)) { - if (__set_blocks(n1, n1->keys + keys + - bkey_u64s(k), b->c) > blocks) - break; - - last = k; - keys += bkey_u64s(k); - } + } BUG_ON(__set_blocks(n1, n1->keys + keys, - b->c) > btree_blocks(r[i].b)); + b->c) > btree_blocks(new_nodes[i])); - if (last) { - bkey_copy_key(&r[i].b->key, last); - bkey_copy_key(r[i].k, last); - } + if (last) + bkey_copy_key(&new_nodes[i]->key, last); memcpy(end(n1), n2->start, (void *) node(n2, keys) - (void *) n2->start); n1->keys += keys; + r[i].keys = n1->keys; memmove(n2->start, node(n2, keys), @@ -1337,93 +1310,175 @@ static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc, n2->keys -= keys; - r[i].keys = n1->keys; - r[i - 1].keys = n2->keys; + if (bch_keylist_realloc(keylist, + KEY_PTRS(&new_nodes[i]->key), b->c)) + goto out_nocoalesce; + + bch_btree_node_write(new_nodes[i], &cl); + bch_keylist_add(keylist, &new_nodes[i]->key); } - btree_node_free(r->b); - up_write(&r->b->lock); + for (i = 0; i < nodes; i++) { + if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c)) + goto out_nocoalesce; - trace_bcache_btree_gc_coalesce(nodes); + make_btree_freeing_key(r[i].b, keylist->top); + bch_keylist_push(keylist); + } + + /* We emptied out this node */ + BUG_ON(new_nodes[0]->sets->data->keys); + btree_node_free(new_nodes[0]); + rw_unlock(true, new_nodes[0]); + closure_sync(&cl); + + for (i = 0; i < nodes; i++) { + btree_node_free(r[i].b); + rw_unlock(true, r[i].b); + + r[i].b = new_nodes[i]; + } + + bch_btree_insert_node(b, op, keylist, NULL, NULL); + BUG_ON(!bch_keylist_empty(keylist)); + + memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); + r[nodes - 1].b = ERR_PTR(-EINTR); + + trace_bcache_btree_gc_coalesce(nodes); gc->nodes--; - nodes--; - memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); - memset(&r[nodes], 0, sizeof(struct gc_merge_info)); + /* Invalidated our iterator */ + return -EINTR; + +out_nocoalesce: + closure_sync(&cl); + + while ((k = bch_keylist_pop(keylist))) + if (!bkey_cmp(k, &ZERO_KEY)) + atomic_dec(&b->c->prio_blocked); + + for (i = 0; i < nodes; i++) + if (!IS_ERR_OR_NULL(new_nodes[i])) { + btree_node_free(new_nodes[i]); + rw_unlock(true, new_nodes[i]); + } + return 0; } -static int btree_gc_recurse(struct btree *b, struct btree_op *op, - struct closure *writes, struct gc_stat *gc) +static unsigned btree_gc_count_keys(struct btree *b) { - void write(struct btree *r) - { - if (!r->written || btree_node_dirty(r)) - bch_btree_node_write(r, writes); + struct bkey *k; + struct btree_iter iter; + unsigned ret = 0; - up_write(&r->lock); - } + for_each_key_filter(b, k, &iter, bch_ptr_bad) + ret += bkey_u64s(k); - int ret = 0, stale; + return ret; +} + +static int btree_gc_recurse(struct btree *b, struct btree_op *op, + struct closure *writes, struct gc_stat *gc) +{ unsigned i; + int ret = 0; + bool should_rewrite; + struct btree *n; + struct bkey *k; + struct keylist keys; + struct btree_iter iter; struct gc_merge_info r[GC_MERGE_NODES]; + struct gc_merge_info *last = r + GC_MERGE_NODES - 1; - memset(r, 0, 
sizeof(r)); + bch_keylist_init(&keys); + bch_btree_iter_init(b, &iter, &b->c->gc_done); - while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { - r->b = bch_btree_node_get(b->c, r->k, b->level - 1, true); + for (i = 0; i < GC_MERGE_NODES; i++) + r[i].b = ERR_PTR(-EINTR); - if (IS_ERR(r->b)) { - ret = PTR_ERR(r->b); - break; + while (1) { + k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); + if (k) { + r->b = bch_btree_node_get(b->c, k, b->level - 1, true); + if (IS_ERR(r->b)) { + ret = PTR_ERR(r->b); + break; + } + + r->keys = btree_gc_count_keys(r->b); + + ret = btree_gc_coalesce(b, op, &keys, gc, r); + if (ret) + break; } - r->keys = 0; - stale = btree_gc_mark_node(r->b, &r->keys, gc); + if (!last->b) + break; - if (!b->written && - (r->b->level || stale > 10 || - b->c->gc_always_rewrite)) - r->b = btree_gc_alloc(r->b, r->k); + if (!IS_ERR(last->b)) { + should_rewrite = btree_gc_mark_node(last->b, gc); + if (should_rewrite) { + n = btree_node_alloc_replacement(last->b); - if (r->b->level) - ret = btree_gc_recurse(r->b, op, writes, gc); + if (!IS_ERR_OR_NULL(n)) { + bch_btree_node_write_sync(n); + bch_keylist_add(&keys, &n->key); - if (ret) { - write(r->b); - break; - } + make_btree_freeing_key(last->b, + keys.top); + bch_keylist_push(&keys); + + btree_node_free(last->b); - bkey_copy_key(&b->c->gc_done, r->k); + bch_btree_insert_node(b, op, &keys, + NULL, NULL); + BUG_ON(!bch_keylist_empty(&keys)); - if (!b->written) - btree_gc_coalesce(b, gc, r); + rw_unlock(true, last->b); + last->b = n; + + /* Invalidated our iterator */ + ret = -EINTR; + break; + } + } - if (r[GC_MERGE_NODES - 1].b) - write(r[GC_MERGE_NODES - 1].b); + if (last->b->level) { + ret = btree_gc_recurse(last->b, op, writes, gc); + if (ret) + break; + } - memmove(&r[1], &r[0], - sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); + bkey_copy_key(&b->c->gc_done, &last->b->key); + + /* + * Must flush leaf nodes before gc ends, since replace + * operations aren't journalled + */ + if (btree_node_dirty(last->b)) + bch_btree_node_write(last->b, writes); + rw_unlock(true, last->b); + } + + memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); + r->b = NULL; - /* When we've got incremental GC working, we'll want to do - * if (should_resched()) - * return -EAGAIN; - */ - cond_resched(); -#if 0 if (need_resched()) { ret = -EAGAIN; break; } -#endif } - for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) - write(r[i].b); + for (i = 0; i < GC_MERGE_NODES; i++) + if (!IS_ERR_OR_NULL(r[i].b)) { + if (btree_node_dirty(r[i].b)) + bch_btree_node_write(r[i].b, writes); + rw_unlock(true, r[i].b); + } - /* Might have freed some children, must remove their keys */ - if (!b->written) - bch_btree_sort(b); + bch_keylist_free(&keys); return ret; } @@ -1432,27 +1487,31 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, struct closure *writes, struct gc_stat *gc) { struct btree *n = NULL; - unsigned keys = 0; - int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); + int ret = 0; + bool should_rewrite; - if (b->level || stale > 10) + should_rewrite = btree_gc_mark_node(b, gc); + if (should_rewrite) { n = btree_node_alloc_replacement(b); - if (!IS_ERR_OR_NULL(n)) - swap(b, n); - - if (b->level) - ret = btree_gc_recurse(b, op, writes, gc); + if (!IS_ERR_OR_NULL(n)) { + bch_btree_node_write_sync(n); + bch_btree_set_root(n); + btree_node_free(b); + rw_unlock(true, n); - if (!b->written || btree_node_dirty(b)) - bch_btree_node_write_sync(b); + return -EINTR; + } + } - if (!IS_ERR_OR_NULL(n)) { - bch_btree_set_root(b); - 
btree_node_free(n); - rw_unlock(true, b); + if (b->level) { + ret = btree_gc_recurse(b, op, writes, gc); + if (ret) + return ret; } + bkey_copy_key(&b->c->gc_done, &b->key); + return ret; } @@ -1550,29 +1609,20 @@ static void bch_btree_gc(struct cache_set *c) btree_gc_start(c); - atomic_inc(&c->prio_blocked); - - ret = btree_root(gc_root, c, &op, &writes, &stats); - closure_sync(&writes); - - if (ret) { - pr_warn("gc failed!"); - return; - } + do { + ret = btree_root(gc_root, c, &op, &writes, &stats); + closure_sync(&writes); - /* Possibly wait for new UUIDs or whatever to hit disk */ - bch_journal_meta(c, &writes); - closure_sync(&writes); + if (ret && ret != -EAGAIN) + pr_warn("gc failed!"); + } while (ret); available = bch_btree_gc_finish(c); - - atomic_dec(&c->prio_blocked); wake_up_allocators(c); bch_time_stats_update(&c->btree_gc_time, start_time); stats.key_bytes *= sizeof(uint64_t); - stats.dirty <<= 9; stats.data <<= 9; stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); @@ -1585,14 +1635,28 @@ static void bch_btree_gc(struct cache_set *c) static int bch_gc_thread(void *arg) { struct cache_set *c = arg; + struct cache *ca; + unsigned i; while (1) { +again: bch_btree_gc(c); set_current_state(TASK_INTERRUPTIBLE); if (kthread_should_stop()) break; + mutex_lock(&c->bucket_lock); + + for_each_cache(ca, c, i) + if (ca->invalidate_needs_gc) { + mutex_unlock(&c->bucket_lock); + set_current_state(TASK_RUNNING); + goto again; + } + + mutex_unlock(&c->bucket_lock); + try_to_freeze(); schedule(); } @@ -2083,8 +2147,6 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op, bch_keylist_init(&split_keys); - BUG_ON(b->level); - do { BUG_ON(b->level && replace_key); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index e11bb8571d24..b5a46affe8eb 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -201,7 +201,7 @@ static inline bool bkey_written(struct btree *b, struct bkey *k) static inline void set_gc_sectors(struct cache_set *c) { - atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8); + atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); } static inline struct bkey *bch_btree_iter_init(struct btree *b, diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 9687771ec6f3..c5f73e34d016 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -489,7 +489,6 @@ lock_root: sysfs_print(btree_used_percent, btree_used(c)); sysfs_print(btree_nodes, c->gc_stats.nodes); - sysfs_hprint(dirty_data, c->gc_stats.dirty); sysfs_hprint(average_key_size, average_key_size(c)); sysfs_print(cache_read_races, @@ -642,7 +641,6 @@ static struct attribute *bch_cache_set_files[] = { &sysfs_cache_available_percent, &sysfs_average_key_size, - &sysfs_dirty_data, &sysfs_errors, &sysfs_io_error_limit, -- cgit v1.2.3 From 8aee122071a69ca6fa3314da7713bdf0b61dc07c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Jul 2013 22:34:40 -0700 Subject: bcache: Kill sequential_merge option It never really made sense to expose this, so just kill it. 
Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 1 - drivers/md/bcache/request.c | 43 ++++++++++++++++++------------------------- drivers/md/bcache/super.c | 1 - drivers/md/bcache/sysfs.c | 4 ---- 4 files changed, 18 insertions(+), 31 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d6970a651e42..322735547eab 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -364,7 +364,6 @@ struct cached_dev { unsigned sequential_cutoff; unsigned readahead; - unsigned sequential_merge:1; unsigned verify:1; unsigned partial_stripes_expensive:1; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 932300f18973..f645da61189a 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -510,6 +510,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) unsigned mode = cache_mode(dc, bio); unsigned sectors, congested = bch_get_congested(c); struct task_struct *task = current; + struct io *i; if (atomic_read(&dc->disk.detaching) || c->gc_stats.in_use > CUTOFF_CACHE_ADD || @@ -536,38 +537,30 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) (bio->bi_rw & REQ_SYNC)) goto rescale; - if (dc->sequential_merge) { - struct io *i; + spin_lock(&dc->io_lock); - spin_lock(&dc->io_lock); + hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) + if (i->last == bio->bi_sector && + time_before(jiffies, i->jiffies)) + goto found; - hlist_for_each_entry(i, iohash(dc, bio->bi_sector), hash) - if (i->last == bio->bi_sector && - time_before(jiffies, i->jiffies)) - goto found; + i = list_first_entry(&dc->io_lru, struct io, lru); - i = list_first_entry(&dc->io_lru, struct io, lru); - - add_sequential(task); - i->sequential = 0; + add_sequential(task); + i->sequential = 0; found: - if (i->sequential + bio->bi_size > i->sequential) - i->sequential += bio->bi_size; - - i->last = bio_end_sector(bio); - i->jiffies = jiffies + msecs_to_jiffies(5000); - task->sequential_io = i->sequential; + if (i->sequential + bio->bi_size > i->sequential) + i->sequential += bio->bi_size; - hlist_del(&i->hash); - hlist_add_head(&i->hash, iohash(dc, i->last)); - list_move_tail(&i->lru, &dc->io_lru); + i->last = bio_end_sector(bio); + i->jiffies = jiffies + msecs_to_jiffies(5000); + task->sequential_io = i->sequential; - spin_unlock(&dc->io_lock); - } else { - task->sequential_io = bio->bi_size; + hlist_del(&i->hash); + hlist_add_head(&i->hash, iohash(dc, i->last)); + list_move_tail(&i->lru, &dc->io_lru); - add_sequential(task); - } + spin_unlock(&dc->io_lock); sectors = max(task->sequential_io, task->sequential_io_avg) >> 9; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e21200e98da6..041dd9d1d882 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1079,7 +1079,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) spin_lock_init(&dc->io_lock); bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); - dc->sequential_merge = true; dc->sequential_cutoff = 4 << 20; for (io = dc->io; io < dc->io + RECENT_IO; io++) { diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index c5f73e34d016..4b672449ffaf 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -72,7 +72,6 @@ rw_attribute(congested_read_threshold_us); rw_attribute(congested_write_threshold_us); rw_attribute(sequential_cutoff); -rw_attribute(sequential_merge); rw_attribute(data_csum); 
rw_attribute(cache_mode); rw_attribute(writeback_metadata); @@ -161,7 +160,6 @@ SHOW(__bch_cached_dev) sysfs_hprint(stripe_size, dc->disk.stripe_size << 9); var_printf(partial_stripes_expensive, "%u"); - var_printf(sequential_merge, "%i"); var_hprint(sequential_cutoff); var_hprint(readahead); @@ -207,7 +205,6 @@ STORE(__cached_dev) dc->writeback_rate_p_term_inverse, 1, INT_MAX); d_strtoul(writeback_rate_d_smooth); - d_strtoul(sequential_merge); d_strtoi_h(sequential_cutoff); d_strtoi_h(readahead); @@ -319,7 +316,6 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_stripe_size, &sysfs_partial_stripes_expensive, &sysfs_sequential_cutoff, - &sysfs_sequential_merge, &sysfs_clear_stats, &sysfs_running, &sysfs_state, -- cgit v1.2.3 From 65d22e911bfc4f46cda4751f1b1926b43c316c14 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 31 Jul 2013 00:03:54 -0700 Subject: bcache: Move spinlock into struct time_stats Minor cleanup. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 2 -- drivers/md/bcache/bset.c | 7 +------ drivers/md/bcache/btree.c | 3 --- drivers/md/bcache/super.c | 8 ++++++-- drivers/md/bcache/util.c | 12 +++++++++--- drivers/md/bcache/util.h | 1 + 6 files changed, 17 insertions(+), 16 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 322735547eab..816d07958fac 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -666,11 +666,9 @@ struct cache_set { unsigned congested_read_threshold_us; unsigned congested_write_threshold_us; - spinlock_t sort_time_lock; struct time_stats sort_time; struct time_stats btree_gc_time; struct time_stats btree_split_time; - spinlock_t btree_read_time_lock; struct time_stats btree_read_time; struct time_stats try_harder_time; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index b0fe0dc59ee6..14573391206b 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1077,11 +1077,8 @@ static void __btree_sort(struct btree *b, struct btree_iter *iter, if (b->written) bset_build_written_tree(b); - if (!start) { - spin_lock(&b->c->sort_time_lock); + if (!start) bch_time_stats_update(&b->c->sort_time, start_time); - spin_unlock(&b->c->sort_time_lock); - } } void bch_btree_sort_partial(struct btree *b, unsigned start) @@ -1128,9 +1125,7 @@ void bch_btree_sort_into(struct btree *b, struct btree *new) btree_mergesort(b, new->sets->data, &iter, false, true); - spin_lock(&b->c->sort_time_lock); bch_time_stats_update(&b->c->sort_time, start_time); - spin_unlock(&b->c->sort_time_lock); bkey_copy_key(&new->key, &b->key); new->sets->size = 0; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index adc5bb0d8e92..1a7530cd1407 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -317,10 +317,7 @@ void bch_btree_node_read(struct btree *b) goto err; bch_btree_node_read_done(b); - - spin_lock(&b->c->btree_read_time_lock); bch_time_stats_update(&b->c->btree_read_time, start_time); - spin_unlock(&b->c->btree_read_time_lock); return; err: diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 041dd9d1d882..4813ef67cef5 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1435,9 +1435,13 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) init_waitqueue_head(&c->try_wait); init_waitqueue_head(&c->bucket_wait); closure_init_unlocked(&c->uuid_write); - spin_lock_init(&c->sort_time_lock); mutex_init(&c->sort_lock); - 
spin_lock_init(&c->btree_read_time_lock); + + spin_lock_init(&c->sort_time.lock); + spin_lock_init(&c->btree_gc_time.lock); + spin_lock_init(&c->btree_split_time.lock); + spin_lock_init(&c->btree_read_time.lock); + spin_lock_init(&c->try_harder_time.lock); bch_moving_init_cache_set(c); diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 420dad545c7d..462214eeacbe 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -168,10 +168,14 @@ int bch_parse_uuid(const char *s, char *uuid) void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) { - uint64_t now = local_clock(); - uint64_t duration = time_after64(now, start_time) + uint64_t now, duration, last; + + spin_lock(&stats->lock); + + now = local_clock(); + duration = time_after64(now, start_time) ? now - start_time : 0; - uint64_t last = time_after64(now, stats->last) + last = time_after64(now, stats->last) ? now - stats->last : 0; stats->max_duration = max(stats->max_duration, duration); @@ -188,6 +192,8 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) } stats->last = now ?: 1; + + spin_unlock(&stats->lock); } /** diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 8ce5aab55962..362c4b3f8b4a 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -378,6 +378,7 @@ ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[ ssize_t bch_read_string_list(const char *buf, const char * const list[]); struct time_stats { + spinlock_t lock; /* * all fields are in nanoseconds, averages are ewmas stored left shifted * by 8 -- cgit v1.2.3 From 48a915a87f0bd98c3d68d029acf223a2e5116f07 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 31 Oct 2013 15:43:22 -0700 Subject: bcache: Better full stripe scanning The old scanning-by-stripe code burned too much CPU, this should be better. 
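The mechanism behind this patch is a bitmap with one bit per completely dirty stripe (full_dirty_stripes), maintained as sectors are marked dirty, so refill_full_stripes() can skip clean regions a whole word at a time with find_next_bit() instead of reading every per-stripe atomic counter. The following is a rough, self-contained sketch of that idea; the sizes and names (mark_dirty, next_full_stripe) are invented for illustration and only approximate what the kernel code does.

/*
 * Illustrative sketch (not kernel code): keep a per-stripe dirty-sector
 * count plus a bitmap of stripes that are completely dirty, and scan the
 * bitmap word-at-a-time to find candidates for writeback.
 */
#include <stdio.h>
#include <stdint.h>

#define STRIPE_SIZE   1024             /* sectors per stripe (made up) */
#define NR_STRIPES    256
#define BITS_PER_WORD 64

static unsigned dirty_sectors[NR_STRIPES];
static uint64_t full_bitmap[NR_STRIPES / BITS_PER_WORD];

static void mark_dirty(unsigned stripe, unsigned sectors)
{
        dirty_sectors[stripe] += sectors;
        if (dirty_sectors[stripe] >= STRIPE_SIZE)
                full_bitmap[stripe / BITS_PER_WORD] |=
                        1ULL << (stripe % BITS_PER_WORD);
}

/* Find the next fully dirty stripe at or after 'start', or NR_STRIPES. */
static unsigned next_full_stripe(unsigned start)
{
        for (unsigned s = start; s < NR_STRIPES; ) {
                uint64_t word = full_bitmap[s / BITS_PER_WORD] >>
                                (s % BITS_PER_WORD);
                if (word)                      /* a set bit in this word */
                        return s + __builtin_ctzll(word);
                /* skip the rest of this word (up to 64 stripes) at once */
                s = (s / BITS_PER_WORD + 1) * BITS_PER_WORD;
        }
        return NR_STRIPES;
}

int main(void)
{
        mark_dirty(3, STRIPE_SIZE);            /* stripe 3 becomes full */
        mark_dirty(130, STRIPE_SIZE);          /* stripe 130 becomes full */
        mark_dirty(200, 10);                   /* partially dirty: skipped */

        for (unsigned s = next_full_stripe(0); s < NR_STRIPES;
             s = next_full_stripe(s + 1))
                printf("full stripe: %u\n", s);
        return 0;
}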
Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 5 ++- drivers/md/bcache/btree.c | 19 +++++---- drivers/md/bcache/super.c | 17 +++++++- drivers/md/bcache/writeback.c | 94 ++++++++++++++++++++++++++----------------- drivers/md/bcache/writeback.h | 21 ++++++---- include/trace/events/bcache.h | 29 +++++++++++++ 6 files changed, 128 insertions(+), 57 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 816d07958fac..ab0b2150fed6 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -237,7 +237,7 @@ struct keybuf { struct rb_root keys; -#define KEYBUF_NR 100 +#define KEYBUF_NR 500 DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR); }; @@ -273,9 +273,10 @@ struct bcache_device { atomic_t detaching; int flush_done; - uint64_t nr_stripes; + unsigned nr_stripes; unsigned stripe_size; atomic_t *stripe_sectors_dirty; + unsigned long *full_dirty_stripes; unsigned long sectors_dirty_last; long sectors_dirty_derivative; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 6def7c9a1228..5e2765aadce1 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2378,6 +2378,7 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, struct refill { struct btree_op op; + unsigned nr_found; struct keybuf *buf; struct bkey *end; keybuf_pred_fn *pred; @@ -2414,6 +2415,8 @@ static int refill_keybuf_fn(struct btree_op *op, struct btree *b, if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) array_free(&buf->freelist, w); + else + refill->nr_found++; if (array_freelist_empty(&buf->freelist)) ret = MAP_DONE; @@ -2434,18 +2437,18 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, cond_resched(); bch_btree_op_init(&refill.op, -1); - refill.buf = buf; - refill.end = end; - refill.pred = pred; + refill.nr_found = 0; + refill.buf = buf; + refill.end = end; + refill.pred = pred; bch_btree_map_keys(&refill.op, c, &buf->last_scanned, refill_keybuf_fn, MAP_END_KEY); - pr_debug("found %s keys from %llu:%llu to %llu:%llu", - RB_EMPTY_ROOT(&buf->keys) ? "no" : - array_freelist_empty(&buf->freelist) ? 
"some" : "a few", - KEY_INODE(&start), KEY_OFFSET(&start), - KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned)); + trace_bcache_keyscan(refill.nr_found, + KEY_INODE(&start), KEY_OFFSET(&start), + KEY_INODE(&buf->last_scanned), + KEY_OFFSET(&buf->last_scanned)); spin_lock(&buf->lock); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 4813ef67cef5..43fcfe38be11 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -738,6 +738,10 @@ static void bcache_device_free(struct bcache_device *d) mempool_destroy(d->unaligned_bvec); if (d->bio_split) bioset_free(d->bio_split); + if (is_vmalloc_addr(d->full_dirty_stripes)) + vfree(d->full_dirty_stripes); + else + kfree(d->full_dirty_stripes); if (is_vmalloc_addr(d->stripe_sectors_dirty)) vfree(d->stripe_sectors_dirty); else @@ -757,8 +761,12 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size); - if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) + if (!d->nr_stripes || + d->nr_stripes > INT_MAX || + d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) { + pr_err("nr_stripes too large"); return -ENOMEM; + } n = d->nr_stripes * sizeof(atomic_t); d->stripe_sectors_dirty = n < PAGE_SIZE << 6 @@ -767,6 +775,13 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, if (!d->stripe_sectors_dirty) return -ENOMEM; + n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long); + d->full_dirty_stripes = n < PAGE_SIZE << 6 + ? kzalloc(n, GFP_KERNEL) + : vzalloc(n); + if (!d->full_dirty_stripes) + return -ENOMEM; + if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, sizeof(struct bio_vec) * BIO_MAX_PAGES)) || diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index ab0f6b449111..22e21dc9a037 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -292,14 +292,12 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, uint64_t offset, int nr_sectors) { struct bcache_device *d = c->devices[inode]; - unsigned stripe_offset; - uint64_t stripe = offset; + unsigned stripe_offset, stripe, sectors_dirty; if (!d) return; - do_div(stripe, d->stripe_size); - + stripe = offset_to_stripe(d, offset); stripe_offset = offset & (d->stripe_size - 1); while (nr_sectors) { @@ -309,7 +307,16 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, if (nr_sectors < 0) s = -s; - atomic_add(s, d->stripe_sectors_dirty + stripe); + if (stripe >= d->nr_stripes) + return; + + sectors_dirty = atomic_add_return(s, + d->stripe_sectors_dirty + stripe); + if (sectors_dirty == d->stripe_size) + set_bit(stripe, d->full_dirty_stripes); + else + clear_bit(stripe, d->full_dirty_stripes); + nr_sectors -= s; stripe_offset = 0; stripe++; @@ -321,59 +328,70 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k) return KEY_DIRTY(k); } -static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) +static void refill_full_stripes(struct cached_dev *dc) { - uint64_t stripe = KEY_START(k); - unsigned nr_sectors = KEY_SIZE(k); - struct cached_dev *dc = container_of(buf, struct cached_dev, - writeback_keys); + struct keybuf *buf = &dc->writeback_keys; + unsigned start_stripe, stripe, next_stripe; + bool wrapped = false; + + stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned)); - if (!KEY_DIRTY(k)) - return false; + if (stripe >= dc->disk.nr_stripes) + stripe = 0; 
- do_div(stripe, dc->disk.stripe_size); + start_stripe = stripe; while (1) { - if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) == - dc->disk.stripe_size) - return true; + stripe = find_next_bit(dc->disk.full_dirty_stripes, + dc->disk.nr_stripes, stripe); - if (nr_sectors <= dc->disk.stripe_size) - return false; + if (stripe == dc->disk.nr_stripes) + goto next; - nr_sectors -= dc->disk.stripe_size; - stripe++; + next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes, + dc->disk.nr_stripes, stripe); + + buf->last_scanned = KEY(dc->disk.id, + stripe * dc->disk.stripe_size, 0); + + bch_refill_keybuf(dc->disk.c, buf, + &KEY(dc->disk.id, + next_stripe * dc->disk.stripe_size, 0), + dirty_pred); + + if (array_freelist_empty(&buf->freelist)) + return; + + stripe = next_stripe; +next: + if (wrapped && stripe > start_stripe) + return; + + if (stripe == dc->disk.nr_stripes) { + stripe = 0; + wrapped = true; + } } } static bool refill_dirty(struct cached_dev *dc) { struct keybuf *buf = &dc->writeback_keys; - bool searched_from_start = false; struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); + bool searched_from_start = false; + + if (dc->partial_stripes_expensive) { + refill_full_stripes(dc); + if (array_freelist_empty(&buf->freelist)) + return false; + } if (bkey_cmp(&buf->last_scanned, &end) >= 0) { buf->last_scanned = KEY(dc->disk.id, 0, 0); searched_from_start = true; } - if (dc->partial_stripes_expensive) { - uint64_t i; - - for (i = 0; i < dc->disk.nr_stripes; i++) - if (atomic_read(dc->disk.stripe_sectors_dirty + i) == - dc->disk.stripe_size) - goto full_stripes; - - goto normal_refill; -full_stripes: - searched_from_start = false; /* not searching entire btree */ - bch_refill_keybuf(dc->disk.c, buf, &end, - dirty_full_stripe_pred); - } else { -normal_refill: - bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); - } + bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start; } diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 60516bfa6052..fe7d9d56492b 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -14,22 +14,27 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } -static inline bool bcache_dev_stripe_dirty(struct bcache_device *d, +static inline unsigned offset_to_stripe(struct bcache_device *d, + uint64_t offset) +{ + do_div(offset, d->stripe_size); + return offset; +} + +static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, uint64_t offset, unsigned nr_sectors) { - uint64_t stripe = offset; - - do_div(stripe, d->stripe_size); + unsigned stripe = offset_to_stripe(&dc->disk, offset); while (1) { - if (atomic_read(d->stripe_sectors_dirty + stripe)) + if (atomic_read(dc->disk.stripe_sectors_dirty + stripe)) return true; - if (nr_sectors <= d->stripe_size) + if (nr_sectors <= dc->disk.stripe_size) return false; - nr_sectors -= d->stripe_size; + nr_sectors -= dc->disk.stripe_size; stripe++; } } @@ -45,7 +50,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, return false; if (dc->partial_stripes_expensive && - bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector, + bcache_dev_stripe_dirty(dc, bio->bi_sector, bio_sectors(bio))) return true; diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 32c89b33c391..e2b9576d00e2 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -368,6 +368,35 @@ DEFINE_EVENT(btree_node, 
bcache_btree_set_root, TP_ARGS(b) ); +TRACE_EVENT(bcache_keyscan, + TP_PROTO(unsigned nr_found, + unsigned start_inode, uint64_t start_offset, + unsigned end_inode, uint64_t end_offset), + TP_ARGS(nr_found, + start_inode, start_offset, + end_inode, end_offset), + + TP_STRUCT__entry( + __field(__u32, nr_found ) + __field(__u32, start_inode ) + __field(__u64, start_offset ) + __field(__u32, end_inode ) + __field(__u64, end_offset ) + ), + + TP_fast_assign( + __entry->nr_found = nr_found; + __entry->start_inode = start_inode; + __entry->start_offset = start_offset; + __entry->end_inode = end_inode; + __entry->end_offset = end_offset; + ), + + TP_printk("found %u keys from %u:%llu to %u:%llu", __entry->nr_found, + __entry->start_inode, __entry->start_offset, + __entry->end_inode, __entry->end_offset) +); + /* Allocator */ TRACE_EVENT(bcache_alloc_invalidate, -- cgit v1.2.3 From c4d951ddb66fe1d087447b0ba65c4fa4446f1083 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Aug 2013 17:49:09 -0700 Subject: bcache: Fix sysfs splat on shutdown with flash only devs Whoops. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 10 ++++------ drivers/md/bcache/request.c | 2 +- drivers/md/bcache/super.c | 41 ++++++++++++++++++++--------------------- drivers/md/bcache/sysfs.c | 2 +- drivers/md/bcache/writeback.c | 6 +++--- drivers/md/bcache/writeback.h | 2 +- 6 files changed, 30 insertions(+), 33 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index ab0b2150fed6..97ef126b68bb 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -266,12 +266,10 @@ struct bcache_device { struct gendisk *disk; - /* If nonzero, we're closing */ - atomic_t closing; - - /* If nonzero, we're detaching/unregistering from cache set */ - atomic_t detaching; - int flush_done; + unsigned long flags; +#define BCACHE_DEV_CLOSING 0 +#define BCACHE_DEV_DETACHING 1 +#define BCACHE_DEV_UNLINK_DONE 2 unsigned nr_stripes; unsigned stripe_size; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index f645da61189a..9f5a1386f77a 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -512,7 +512,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) struct task_struct *task = current; struct io *i; - if (atomic_read(&dc->disk.detaching) || + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || c->gc_stats.in_use > CUTOFF_CACHE_ADD || (bio->bi_rw & REQ_DISCARD)) goto skip; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 43fcfe38be11..fa1d53087f88 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -621,7 +621,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) static int open_dev(struct block_device *b, fmode_t mode) { struct bcache_device *d = b->bd_disk->private_data; - if (atomic_read(&d->closing)) + if (test_bit(BCACHE_DEV_CLOSING, &d->flags)) return -ENXIO; closure_get(&d->cl); @@ -650,20 +650,24 @@ static const struct block_device_operations bcache_ops = { void bcache_device_stop(struct bcache_device *d) { - if (!atomic_xchg(&d->closing, 1)) + if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags)) closure_queue(&d->cl); } static void bcache_device_unlink(struct bcache_device *d) { - unsigned i; - struct cache *ca; + lockdep_assert_held(&bch_register_lock); - sysfs_remove_link(&d->c->kobj, d->name); - sysfs_remove_link(&d->kobj, "cache"); + if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) 
{ + unsigned i; + struct cache *ca; - for_each_cache(ca, d->c, i) - bd_unlink_disk_holder(ca->bdev, d->disk); + sysfs_remove_link(&d->c->kobj, d->name); + sysfs_remove_link(&d->kobj, "cache"); + + for_each_cache(ca, d->c, i) + bd_unlink_disk_holder(ca->bdev, d->disk); + } } static void bcache_device_link(struct bcache_device *d, struct cache_set *c, @@ -687,19 +691,16 @@ static void bcache_device_detach(struct bcache_device *d) { lockdep_assert_held(&bch_register_lock); - if (atomic_read(&d->detaching)) { + if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) { struct uuid_entry *u = d->c->uuids + d->id; SET_UUID_FLASH_ONLY(u, 0); memcpy(u->uuid, invalid_uuid, 16); u->invalidated = cpu_to_le32(get_seconds()); bch_uuid_write(d->c); - - atomic_set(&d->detaching, 0); } - if (!d->flush_done) - bcache_device_unlink(d); + bcache_device_unlink(d); d->c->devices[d->id] = NULL; closure_put(&d->c->caching); @@ -879,7 +880,7 @@ static void cached_dev_detach_finish(struct work_struct *w) struct closure cl; closure_init_stack(&cl); - BUG_ON(!atomic_read(&dc->disk.detaching)); + BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); BUG_ON(atomic_read(&dc->count)); mutex_lock(&bch_register_lock); @@ -893,6 +894,8 @@ static void cached_dev_detach_finish(struct work_struct *w) bcache_device_detach(&dc->disk); list_move(&dc->list, &uncached_devices); + clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags); + mutex_unlock(&bch_register_lock); pr_info("Caching disabled for %s", bdevname(dc->bdev, buf)); @@ -905,10 +908,10 @@ void bch_cached_dev_detach(struct cached_dev *dc) { lockdep_assert_held(&bch_register_lock); - if (atomic_read(&dc->disk.closing)) + if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) return; - if (atomic_xchg(&dc->disk.detaching, 1)) + if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) return; /* @@ -1064,11 +1067,7 @@ static void cached_dev_flush(struct closure *cl) struct bcache_device *d = &dc->disk; mutex_lock(&bch_register_lock); - d->flush_done = 1; - - if (d->c) - bcache_device_unlink(d); - + bcache_device_unlink(d); mutex_unlock(&bch_register_lock); bch_cache_accounting_destroy(&dc->accounting); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 4b672449ffaf..194d43782ea4 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -370,7 +370,7 @@ STORE(__bch_flash_dev) } if (attr == &sysfs_unregister) { - atomic_set(&d->detaching, 1); + set_bit(BCACHE_DEV_DETACHING, &d->flags); bcache_device_stop(d); } diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 22e21dc9a037..99053b1251be 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -89,7 +89,7 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) { uint64_t ret; - if (atomic_read(&dc->disk.detaching) || + if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || !dc->writeback_percent) return 0; @@ -404,7 +404,7 @@ static int bch_writeback_thread(void *arg) while (!kthread_should_stop()) { down_write(&dc->writeback_lock); if (!atomic_read(&dc->has_dirty) || - (!atomic_read(&dc->disk.detaching) && + (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && !dc->writeback_running)) { up_write(&dc->writeback_lock); set_current_state(TASK_INTERRUPTIBLE); @@ -437,7 +437,7 @@ static int bch_writeback_thread(void *arg) while (delay && !kthread_should_stop() && - !atomic_read(&dc->disk.detaching)) + !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) delay = schedule_timeout_interruptible(delay); } } diff --git 
a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index fe7d9d56492b..c9ddcf4614b9 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -45,7 +45,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, unsigned in_use = dc->disk.c->gc_stats.in_use; if (cache_mode != CACHE_MODE_WRITEBACK || - atomic_read(&dc->disk.detaching) || + test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || in_use > CUTOFF_WRITEBACK_SYNC) return false; -- cgit v1.2.3 From 5ceaaad7047745c1c02150c39d3fb623b7948d48 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Sep 2013 14:27:42 -0700 Subject: bcache: Bypass torture test More testing ftw! Also, now verify mode doesn't break if you read dirty data. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/debug.c | 15 ++++++++------- drivers/md/bcache/debug.h | 2 ++ drivers/md/bcache/request.c | 14 +++++++++++++- drivers/md/bcache/sysfs.c | 4 ++++ 5 files changed, 28 insertions(+), 8 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 97ef126b68bb..4beb55a0ff30 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -364,6 +364,7 @@ struct cached_dev { unsigned readahead; unsigned verify:1; + unsigned bypass_torture_test:1; unsigned partial_stripes_expensive:1; unsigned writeback_metadata:1; diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index e99e6b8852b2..264fcfbd6290 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -189,13 +189,14 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) void *p1 = kmap_atomic(bv->bv_page); void *p2 = page_address(check->bi_io_vec[i].bv_page); - if (memcmp(p1 + bv->bv_offset, - p2 + bv->bv_offset, - bv->bv_len)) - printk(KERN_ERR - "bcache (%s): verify failed at sector %llu\n", - bdevname(dc->bdev, name), - (uint64_t) bio->bi_sector); + cache_set_err_on(memcmp(p1 + bv->bv_offset, + p2 + bv->bv_offset, + bv->bv_len), + dc->disk.c, + "verify failed at dev %s sector %llu", + bdevname(dc->bdev, name), + (uint64_t) bio->bi_sector); + kunmap_atomic(p1); } diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index 7914ba0ff316..2ede60e31874 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -16,6 +16,7 @@ void bch_btree_iter_next_check(struct btree_iter *); #define EBUG_ON(cond) BUG_ON(cond) #define expensive_debug_checks(c) ((c)->expensive_debug_checks) #define key_merging_disabled(c) ((c)->key_merging_disabled) +#define bypass_torture_test(d) ((d)->bypass_torture_test) #else /* DEBUG */ @@ -28,6 +29,7 @@ static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} #define EBUG_ON(cond) do { if (cond); } while (0) #define expensive_debug_checks(c) 0 #define key_merging_disabled(c) 0 +#define bypass_torture_test(d) 0 #endif diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 9f5a1386f77a..fbcc851ed5a5 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -528,6 +528,13 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) goto skip; } + if (bypass_torture_test(dc)) { + if ((get_random_int() & 3) == 3) + goto skip; + else + goto rescale; + } + if (!congested && !dc->sequential_cutoff) goto rescale; @@ -601,6 +608,7 @@ struct search { unsigned recoverable:1; unsigned unaligned_bvec:1; unsigned write:1; + unsigned read_dirty_data:1; unsigned long start_time; @@ -669,6 +677,9 
@@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; + if (KEY_DIRTY(k)) + s->read_dirty_data = true; + n = bch_bio_split(bio, min_t(uint64_t, INT_MAX, KEY_OFFSET(k) - bio->bi_sector), GFP_NOIO, s->d->bio_split); @@ -894,7 +905,8 @@ static void cached_dev_read_done(struct closure *cl) s->cache_miss = NULL; } - if (verify(dc, &s->bio.bio) && s->recoverable && !s->unaligned_bvec) + if (verify(dc, &s->bio.bio) && s->recoverable && + !s->unaligned_bvec && !s->read_dirty_data) bch_data_verify(dc, s->orig_bio); bio_complete(s); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 194d43782ea4..80d4c2bee18a 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -99,6 +99,7 @@ rw_attribute(errors); rw_attribute(io_error_limit); rw_attribute(io_error_halflife); rw_attribute(verify); +rw_attribute(bypass_torture_test); rw_attribute(key_merging_disabled); rw_attribute(gc_always_rewrite); rw_attribute(expensive_debug_checks); @@ -123,6 +124,7 @@ SHOW(__bch_cached_dev) sysfs_printf(data_csum, "%i", dc->disk.data_csum); var_printf(verify, "%i"); + var_printf(bypass_torture_test, "%i"); var_printf(writeback_metadata, "%i"); var_printf(writeback_running, "%i"); var_print(writeback_delay); @@ -191,6 +193,7 @@ STORE(__cached_dev) sysfs_strtoul(data_csum, dc->disk.data_csum); d_strtoul(verify); + d_strtoul(bypass_torture_test); d_strtoul(writeback_metadata); d_strtoul(writeback_running); d_strtoul(writeback_delay); @@ -323,6 +326,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_readahead, #ifdef CONFIG_BCACHE_DEBUG &sysfs_verify, + &sysfs_bypass_torture_test, #endif NULL }; -- cgit v1.2.3 From 981aa8c091e164ea51dd1e81b71a1f3852bbcceb Mon Sep 17 00:00:00 2001 From: Nicholas Swenson Date: Thu, 7 Nov 2013 17:53:19 -0800 Subject: bcache: bugfix - moving_gc now moves only correct buckets Removed gc_move_threshold because picking buckets only by threshold could lead to moving extra buckets (i.e. if there are buckets at the threshold that aren't supposed to be moved due to space considerations). This is replaced by a GC_MOVE bit in the gc_mark bitmask. Now only marked buckets get moved. Signed-off-by: Nicholas Swenson Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 2 ++ drivers/md/bcache/bcache.h | 6 +++--- drivers/md/bcache/movinggc.c | 8 +++----- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 2b46bf1d7e40..4c9852d92b0a 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -421,9 +421,11 @@ out: if (watermark <= WATERMARK_METADATA) { SET_GC_MARK(b, GC_MARK_METADATA); + SET_GC_MOVE(b, 0); b->prio = BTREE_PRIO; } else { SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + SET_GC_MOVE(b, 0); b->prio = INITIAL_PRIO; } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 4beb55a0ff30..a7b1a7631ed2 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -197,7 +197,7 @@ struct bucket { uint8_t disk_gen; uint8_t last_gc; /* Most out of date gen in the btree */ uint8_t gc_gen; - uint16_t gc_mark; + uint16_t gc_mark; /* Bitfield used by GC.
See below for field */ }; /* @@ -209,7 +209,8 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); #define GC_MARK_RECLAIMABLE 0 #define GC_MARK_DIRTY 1 #define GC_MARK_METADATA 2 -BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14); +BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 13); +BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1); #include "journal.h" #include "stats.h" @@ -445,7 +446,6 @@ struct cache { * call prio_write() to keep gens from wrapping. */ uint8_t need_save_prio; - unsigned gc_move_threshold; /* * If nonzero, we know we aren't going to find any buckets to invalidate diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 46c952379fab..30f347d4e609 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -25,10 +25,9 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k) unsigned i; for (i = 0; i < KEY_PTRS(k); i++) { - struct cache *ca = PTR_CACHE(c, k, i); struct bucket *g = PTR_BUCKET(c, k, i); - if (GC_SECTORS_USED(g) < ca->gc_move_threshold) + if (GC_MOVE(g)) return true; } @@ -227,9 +226,8 @@ void bch_moving_gc(struct cache_set *c) sectors_to_move -= GC_SECTORS_USED(b); } - ca->gc_move_threshold = bucket_heap_top(ca); - - pr_debug("threshold %u", ca->gc_move_threshold); + while (heap_pop(&ca->heap, b, bucket_cmp)) + SET_GC_MOVE(b, 1); } mutex_unlock(&c->bucket_lock); -- cgit v1.2.3 From 16749c23c00c686ed168471963e3ddb0f3fcd855 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Nov 2013 13:58:34 -0800 Subject: bcache: New writeback PD controller The old writeback PD controller could get into states where it had throttled all the way down and take way too long to recover - it was too complicated to really understand what it was doing. This rewrites a good chunk of it to hopefully be simpler and make more sense, and it also pays more attention to units which should make the behaviour a bit easier to understand. 
Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 6 +++--- drivers/md/bcache/sysfs.c | 50 +++++++++++++++++++++++++------------------ drivers/md/bcache/util.c | 8 ++++++- drivers/md/bcache/writeback.c | 47 ++++++++++++++++++++-------------------- 4 files changed, 62 insertions(+), 49 deletions(-) (limited to 'drivers/md/bcache/bcache.h') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index a7b1a7631ed2..754f43177483 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -373,14 +373,14 @@ struct cached_dev { unsigned char writeback_percent; unsigned writeback_delay; - int writeback_rate_change; - int64_t writeback_rate_derivative; uint64_t writeback_rate_target; + int64_t writeback_rate_proportional; + int64_t writeback_rate_derivative; + int64_t writeback_rate_change; unsigned writeback_rate_update_seconds; unsigned writeback_rate_d_term; unsigned writeback_rate_p_term_inverse; - unsigned writeback_rate_d_smooth; }; enum alloc_watermarks { diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 80d4c2bee18a..a1f85612f0b3 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -83,7 +83,6 @@ rw_attribute(writeback_rate); rw_attribute(writeback_rate_update_seconds); rw_attribute(writeback_rate_d_term); rw_attribute(writeback_rate_p_term_inverse); -rw_attribute(writeback_rate_d_smooth); read_attribute(writeback_rate_debug); read_attribute(stripe_size); @@ -129,31 +128,41 @@ SHOW(__bch_cached_dev) var_printf(writeback_running, "%i"); var_print(writeback_delay); var_print(writeback_percent); - sysfs_print(writeback_rate, dc->writeback_rate.rate); + sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); var_print(writeback_rate_update_seconds); var_print(writeback_rate_d_term); var_print(writeback_rate_p_term_inverse); - var_print(writeback_rate_d_smooth); if (attr == &sysfs_writeback_rate_debug) { + char rate[20]; char dirty[20]; - char derivative[20]; char target[20]; - bch_hprint(dirty, - bcache_dev_sectors_dirty(&dc->disk) << 9); - bch_hprint(derivative, dc->writeback_rate_derivative << 9); + char proportional[20]; + char derivative[20]; + char change[20]; + s64 next_io; + + bch_hprint(rate, dc->writeback_rate.rate << 9); + bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); bch_hprint(target, dc->writeback_rate_target << 9); + bch_hprint(proportional,dc->writeback_rate_proportional << 9); + bch_hprint(derivative, dc->writeback_rate_derivative << 9); + bch_hprint(change, dc->writeback_rate_change << 9); + + next_io = div64_s64(dc->writeback_rate.next - local_clock(), + NSEC_PER_MSEC); return sprintf(buf, - "rate:\t\t%u\n" - "change:\t\t%i\n" + "rate:\t\t%s/sec\n" "dirty:\t\t%s\n" + "target:\t\t%s\n" + "proportional:\t%s\n" "derivative:\t%s\n" - "target:\t\t%s\n", - dc->writeback_rate.rate, - dc->writeback_rate_change, - dirty, derivative, target); + "change:\t\t%s/sec\n" + "next io:\t%llims\n", + rate, dirty, target, proportional, + derivative, change, next_io); } sysfs_hprint(dirty_data, @@ -189,6 +198,7 @@ STORE(__cached_dev) struct kobj_uevent_env *env; #define d_strtoul(var) sysfs_strtoul(var, dc->var) +#define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX) #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) sysfs_strtoul(data_csum, dc->disk.data_csum); @@ -197,16 +207,15 @@ STORE(__cached_dev) d_strtoul(writeback_metadata); d_strtoul(writeback_running); d_strtoul(writeback_delay); - sysfs_strtoul_clamp(writeback_rate, - dc->writeback_rate.rate, 1, 1000000); + 
sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40); - d_strtoul(writeback_rate_update_seconds); + sysfs_strtoul_clamp(writeback_rate, + dc->writeback_rate.rate, 1, INT_MAX); + + d_strtoul_nonzero(writeback_rate_update_seconds); d_strtoul(writeback_rate_d_term); - d_strtoul(writeback_rate_p_term_inverse); - sysfs_strtoul_clamp(writeback_rate_p_term_inverse, - dc->writeback_rate_p_term_inverse, 1, INT_MAX); - d_strtoul(writeback_rate_d_smooth); + d_strtoul_nonzero(writeback_rate_p_term_inverse); d_strtoi_h(sequential_cutoff); d_strtoi_h(readahead); @@ -313,7 +322,6 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_rate_update_seconds, &sysfs_writeback_rate_d_term, &sysfs_writeback_rate_p_term_inverse, - &sysfs_writeback_rate_d_smooth, &sysfs_writeback_rate_debug, &sysfs_dirty_data, &sysfs_stripe_size, diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 462214eeacbe..bb37618e7664 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -209,7 +209,13 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) { uint64_t now = local_clock(); - d->next += div_u64(done, d->rate); + d->next += div_u64(done * NSEC_PER_SEC, d->rate); + + if (time_before64(now + NSEC_PER_SEC, d->next)) + d->next = now + NSEC_PER_SEC; + + if (time_after64(now - NSEC_PER_SEC * 2, d->next)) + d->next = now - NSEC_PER_SEC * 2; return time_after64(d->next, now) ? div_u64(d->next - now, NSEC_PER_SEC / HZ) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 3cd931d3f26c..6c44fe059c27 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -30,38 +30,40 @@ static void __update_writeback_rate(struct cached_dev *dc) /* PD controller */ - int change = 0; - int64_t error; int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); int64_t derivative = dirty - dc->disk.sectors_dirty_last; + int64_t proportional = dirty - target; + int64_t change; dc->disk.sectors_dirty_last = dirty; - derivative *= dc->writeback_rate_d_term; - derivative = clamp(derivative, -dirty, dirty); + /* Scale to sectors per second */ - derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, - dc->writeback_rate_d_smooth, 0); + proportional *= dc->writeback_rate_update_seconds; + proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse); - /* Avoid divide by zero */ - if (!target) - goto out; + derivative = div_s64(derivative, dc->writeback_rate_update_seconds); - error = div64_s64((dirty + derivative - target) << 8, target); + derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, + (dc->writeback_rate_d_term / + dc->writeback_rate_update_seconds) ?: 1, 0); + + derivative *= dc->writeback_rate_d_term; + derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse); - change = div_s64((dc->writeback_rate.rate * error) >> 8, - dc->writeback_rate_p_term_inverse); + change = proportional + derivative; /* Don't increase writeback rate if the device isn't keeping up */ if (change > 0 && time_after64(local_clock(), - dc->writeback_rate.next + 10 * NSEC_PER_MSEC)) + dc->writeback_rate.next + NSEC_PER_MSEC)) change = 0; dc->writeback_rate.rate = - clamp_t(int64_t, dc->writeback_rate.rate + change, + clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change, 1, NSEC_PER_MSEC); -out: + + dc->writeback_rate_proportional = proportional; dc->writeback_rate_derivative = derivative; dc->writeback_rate_change = change; dc->writeback_rate_target = target; @@ -87,15 +89,11 @@ static void 
update_writeback_rate(struct work_struct *work) static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) { - uint64_t ret; - if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || !dc->writeback_percent) return 0; - ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); - - return min_t(uint64_t, ret, HZ); + return bch_next_delay(&dc->writeback_rate, sectors); } struct dirty_io { @@ -476,6 +474,8 @@ void bch_sectors_dirty_init(struct cached_dev *dc) bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0), sectors_dirty_init_fn, 0); + + dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk); } int bch_cached_dev_writeback_init(struct cached_dev *dc) @@ -490,10 +490,9 @@ int bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_delay = 30; dc->writeback_rate.rate = 1024; - dc->writeback_rate_update_seconds = 30; - dc->writeback_rate_d_term = 16; - dc->writeback_rate_p_term_inverse = 64; - dc->writeback_rate_d_smooth = 8; + dc->writeback_rate_update_seconds = 5; + dc->writeback_rate_d_term = 30; + dc->writeback_rate_p_term_inverse = 6000; dc->writeback_thread = kthread_create(bch_writeback_thread, dc, "bcache_writeback"); -- cgit v1.2.3
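To make the unit handling in the rewritten controller concrete, here is a minimal user-space sketch of the arithmetic it performs on each update. It mirrors the proportional and derivative terms from __update_writeback_rate() above and uses the new default tunables from bch_cached_dev_writeback_init() (5 second update interval, d_term 30, p_term_inverse 6000, initial rate 1024 sectors/sec), but the workload numbers are invented and the EWMA smoothing and the "device not keeping up" check are left out - treat it as an illustration of the patch, not as the kernel code itself.

/*
 * Stand-alone sketch of the PD controller arithmetic above.
 * Everything is in sectors and sectors/sec; the clamp limits match
 * the ones used in the patch (1 .. 1000000 sectors/sec).
 */
#include <stdint.h>
#include <stdio.h>

#define UPDATE_SECONDS	5	/* writeback_rate_update_seconds */
#define D_TERM		30	/* writeback_rate_d_term */
#define P_TERM_INVERSE	6000	/* writeback_rate_p_term_inverse */

static int64_t clamp64(int64_t v, int64_t lo, int64_t hi)
{
	return v < lo ? lo : v > hi ? hi : v;
}

int main(void)
{
	int64_t rate = 1024;		/* sectors/sec, initial value */
	int64_t target = 1 << 20;	/* example dirty target, in sectors */
	int64_t dirty = 4 << 20;	/* example current dirty sectors */
	int64_t dirty_last = dirty;

	for (int tick = 0; tick < 10; tick++) {
		/* proportional term: dirty error, scaled to sectors/sec */
		int64_t proportional =
			(dirty - target) * UPDATE_SECONDS / P_TERM_INVERSE;

		/* derivative term: rate of change of dirty data, weighted */
		int64_t derivative =
			(dirty - dirty_last) / UPDATE_SECONDS * D_TERM / P_TERM_INVERSE;

		rate = clamp64(rate + proportional + derivative, 1, 1000000);
		dirty_last = dirty;

		/* pretend writeback drains exactly "rate" for one interval */
		dirty = clamp64(dirty - rate * UPDATE_SECONDS, target, INT64_MAX);

		printf("tick %d: rate %lld sectors/sec, dirty %lld sectors\n",
		       tick, (long long)rate, (long long)dirty);
	}
	return 0;
}

With the sysfs changes in the same patch, the corresponding quantities can be watched on a live system through the writeback_rate_debug attribute (typically /sys/block/bcacheN/bcache/writeback_rate_debug), which now reports rate, dirty, target, proportional, derivative, change and the time until the next writeback IO.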