From cafe563591446cf80bfbc2fe3bc72a2e36cf1060 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Mar 2013 16:11:31 -0700 Subject: bcache: A block layer cache Does writethrough and writeback caching, handles unclean shutdown, and has a bunch of other nifty features motivated by real world usage. See the wiki at http://bcache.evilpiepirate.org for more. Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 583 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 583 insertions(+) create mode 100644 drivers/md/bcache/alloc.c (limited to 'drivers/md/bcache/alloc.c') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c new file mode 100644 index 000000000000..ed18115e078e --- /dev/null +++ b/drivers/md/bcache/alloc.c @@ -0,0 +1,583 @@ +/* + * Primary bucket allocation code + * + * Copyright 2012 Google, Inc. + * + * Allocation in bcache is done in terms of buckets: + * + * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in + * btree pointers - they must match for the pointer to be considered valid. + * + * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a + * bucket simply by incrementing its gen. + * + * The gens (along with the priorities; it's really the gens are important but + * the code is named as if it's the priorities) are written in an arbitrary list + * of buckets on disk, with a pointer to them in the journal header. + * + * When we invalidate a bucket, we have to write its new gen to disk and wait + * for that write to complete before we use it - otherwise after a crash we + * could have pointers that appeared to be good but pointed to data that had + * been overwritten. + * + * Since the gens and priorities are all stored contiguously on disk, we can + * batch this up: We fill up the free_inc list with freshly invalidated buckets, + * call prio_write(), and when prio_write() finishes we pull buckets off the + * free_inc list and optionally discard them. + * + * free_inc isn't the only freelist - if it was, we'd often to sleep while + * priorities and gens were being written before we could allocate. c->free is a + * smaller freelist, and buckets on that list are always ready to be used. + * + * If we've got discards enabled, that happens when a bucket moves from the + * free_inc list to the free list. + * + * There is another freelist, because sometimes we have buckets that we know + * have nothing pointing into them - these we can reuse without waiting for + * priorities to be rewritten. These come from freed btree nodes and buckets + * that garbage collection discovered no longer had valid keys pointing into + * them (because they were overwritten). That's the unused list - buckets on the + * unused list move to the free list, optionally being discarded in the process. + * + * It's also important to ensure that gens don't wrap around - with respect to + * either the oldest gen in the btree or the gen on disk. This is quite + * difficult to do in practice, but we explicitly guard against it anyways - if + * a bucket is in danger of wrapping around we simply skip invalidating it that + * time around, and we garbage collect or rewrite the priorities sooner than we + * would have otherwise. + * + * bch_bucket_alloc() allocates a single bucket from a specific cache. + * + * bch_bucket_alloc_set() allocates one or more buckets from different caches + * out of a cache set. + * + * free_some_buckets() drives all the processes described above. It's called + * from bch_bucket_alloc() and a few other places that need to make sure free + * buckets are ready. + * + * invalidate_buckets_(lru|fifo)() find buckets that are available to be + * invalidated, and then invalidate them and stick them on the free_inc list - + * in either lru or fifo order. + */ + +#include "bcache.h" +#include "btree.h" + +#include + +#define MAX_IN_FLIGHT_DISCARDS 8U + +/* Bucket heap / gen */ + +uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) +{ + uint8_t ret = ++b->gen; + + ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b)); + WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX); + + if (CACHE_SYNC(&ca->set->sb)) { + ca->need_save_prio = max(ca->need_save_prio, + bucket_disk_gen(b)); + WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX); + } + + return ret; +} + +void bch_rescale_priorities(struct cache_set *c, int sectors) +{ + struct cache *ca; + struct bucket *b; + unsigned next = c->nbuckets * c->sb.bucket_size / 1024; + unsigned i; + int r; + + atomic_sub(sectors, &c->rescale); + + do { + r = atomic_read(&c->rescale); + + if (r >= 0) + return; + } while (atomic_cmpxchg(&c->rescale, r, r + next) != r); + + mutex_lock(&c->bucket_lock); + + c->min_prio = USHRT_MAX; + + for_each_cache(ca, c, i) + for_each_bucket(b, ca) + if (b->prio && + b->prio != BTREE_PRIO && + !atomic_read(&b->pin)) { + b->prio--; + c->min_prio = min(c->min_prio, b->prio); + } + + mutex_unlock(&c->bucket_lock); +} + +/* Discard/TRIM */ + +struct discard { + struct list_head list; + struct work_struct work; + struct cache *ca; + long bucket; + + struct bio bio; + struct bio_vec bv; +}; + +static void discard_finish(struct work_struct *w) +{ + struct discard *d = container_of(w, struct discard, work); + struct cache *ca = d->ca; + char buf[BDEVNAME_SIZE]; + + if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) { + pr_notice("discard error on %s, disabling", + bdevname(ca->bdev, buf)); + d->ca->discard = 0; + } + + mutex_lock(&ca->set->bucket_lock); + + fifo_push(&ca->free, d->bucket); + list_add(&d->list, &ca->discards); + atomic_dec(&ca->discards_in_flight); + + mutex_unlock(&ca->set->bucket_lock); + + closure_wake_up(&ca->set->bucket_wait); + wake_up(&ca->set->alloc_wait); + + closure_put(&ca->set->cl); +} + +static void discard_endio(struct bio *bio, int error) +{ + struct discard *d = container_of(bio, struct discard, bio); + schedule_work(&d->work); +} + +static void do_discard(struct cache *ca, long bucket) +{ + struct discard *d = list_first_entry(&ca->discards, + struct discard, list); + + list_del(&d->list); + d->bucket = bucket; + + atomic_inc(&ca->discards_in_flight); + closure_get(&ca->set->cl); + + bio_init(&d->bio); + + d->bio.bi_sector = bucket_to_sector(ca->set, d->bucket); + d->bio.bi_bdev = ca->bdev; + d->bio.bi_rw = REQ_WRITE|REQ_DISCARD; + d->bio.bi_max_vecs = 1; + d->bio.bi_io_vec = d->bio.bi_inline_vecs; + d->bio.bi_size = bucket_bytes(ca); + d->bio.bi_end_io = discard_endio; + bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + + submit_bio(0, &d->bio); +} + +/* Allocation */ + +static inline bool can_inc_bucket_gen(struct bucket *b) +{ + return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX && + bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX; +} + +bool bch_bucket_add_unused(struct cache *ca, struct bucket *b) +{ + BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); + + if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] && + CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) + return false; + + b->prio = 0; + + if (can_inc_bucket_gen(b) && + fifo_push(&ca->unused, b - ca->buckets)) { + atomic_inc(&b->pin); + return true; + } + + return false; +} + +static bool can_invalidate_bucket(struct cache *ca, struct bucket *b) +{ + return GC_MARK(b) == GC_MARK_RECLAIMABLE && + !atomic_read(&b->pin) && + can_inc_bucket_gen(b); +} + +static void invalidate_one_bucket(struct cache *ca, struct bucket *b) +{ + bch_inc_gen(ca, b); + b->prio = INITIAL_PRIO; + atomic_inc(&b->pin); + fifo_push(&ca->free_inc, b - ca->buckets); +} + +static void invalidate_buckets_lru(struct cache *ca) +{ + unsigned bucket_prio(struct bucket *b) + { + return ((unsigned) (b->prio - ca->set->min_prio)) * + GC_SECTORS_USED(b); + } + + bool bucket_max_cmp(struct bucket *l, struct bucket *r) + { + return bucket_prio(l) < bucket_prio(r); + } + + bool bucket_min_cmp(struct bucket *l, struct bucket *r) + { + return bucket_prio(l) > bucket_prio(r); + } + + struct bucket *b; + ssize_t i; + + ca->heap.used = 0; + + for_each_bucket(b, ca) { + if (!can_invalidate_bucket(ca, b)) + continue; + + if (!GC_SECTORS_USED(b)) { + if (!bch_bucket_add_unused(ca, b)) + return; + } else { + if (!heap_full(&ca->heap)) + heap_add(&ca->heap, b, bucket_max_cmp); + else if (bucket_max_cmp(b, heap_peek(&ca->heap))) { + ca->heap.data[0] = b; + heap_sift(&ca->heap, 0, bucket_max_cmp); + } + } + } + + if (ca->heap.used * 2 < ca->heap.size) + bch_queue_gc(ca->set); + + for (i = ca->heap.used / 2 - 1; i >= 0; --i) + heap_sift(&ca->heap, i, bucket_min_cmp); + + while (!fifo_full(&ca->free_inc)) { + if (!heap_pop(&ca->heap, b, bucket_min_cmp)) { + /* We don't want to be calling invalidate_buckets() + * multiple times when it can't do anything + */ + ca->invalidate_needs_gc = 1; + bch_queue_gc(ca->set); + return; + } + + invalidate_one_bucket(ca, b); + } +} + +static void invalidate_buckets_fifo(struct cache *ca) +{ + struct bucket *b; + size_t checked = 0; + + while (!fifo_full(&ca->free_inc)) { + if (ca->fifo_last_bucket < ca->sb.first_bucket || + ca->fifo_last_bucket >= ca->sb.nbuckets) + ca->fifo_last_bucket = ca->sb.first_bucket; + + b = ca->buckets + ca->fifo_last_bucket++; + + if (can_invalidate_bucket(ca, b)) + invalidate_one_bucket(ca, b); + + if (++checked >= ca->sb.nbuckets) { + ca->invalidate_needs_gc = 1; + bch_queue_gc(ca->set); + return; + } + } +} + +static void invalidate_buckets_random(struct cache *ca) +{ + struct bucket *b; + size_t checked = 0; + + while (!fifo_full(&ca->free_inc)) { + size_t n; + get_random_bytes(&n, sizeof(n)); + + n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket); + n += ca->sb.first_bucket; + + b = ca->buckets + n; + + if (can_invalidate_bucket(ca, b)) + invalidate_one_bucket(ca, b); + + if (++checked >= ca->sb.nbuckets / 2) { + ca->invalidate_needs_gc = 1; + bch_queue_gc(ca->set); + return; + } + } +} + +static void invalidate_buckets(struct cache *ca) +{ + if (ca->invalidate_needs_gc) + return; + + switch (CACHE_REPLACEMENT(&ca->sb)) { + case CACHE_REPLACEMENT_LRU: + invalidate_buckets_lru(ca); + break; + case CACHE_REPLACEMENT_FIFO: + invalidate_buckets_fifo(ca); + break; + case CACHE_REPLACEMENT_RANDOM: + invalidate_buckets_random(ca); + break; + } +} + +#define allocator_wait(ca, cond) \ +do { \ + DEFINE_WAIT(__wait); \ + \ + while (!(cond)) { \ + prepare_to_wait(&ca->set->alloc_wait, \ + &__wait, TASK_INTERRUPTIBLE); \ + \ + mutex_unlock(&(ca)->set->bucket_lock); \ + if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \ + finish_wait(&ca->set->alloc_wait, &__wait); \ + closure_return(cl); \ + } \ + \ + schedule(); \ + __set_current_state(TASK_RUNNING); \ + mutex_lock(&(ca)->set->bucket_lock); \ + } \ + \ + finish_wait(&ca->set->alloc_wait, &__wait); \ +} while (0) + +void bch_allocator_thread(struct closure *cl) +{ + struct cache *ca = container_of(cl, struct cache, alloc); + + mutex_lock(&ca->set->bucket_lock); + + while (1) { + while (1) { + long bucket; + + if ((!atomic_read(&ca->set->prio_blocked) || + !CACHE_SYNC(&ca->set->sb)) && + !fifo_empty(&ca->unused)) + fifo_pop(&ca->unused, bucket); + else if (!fifo_empty(&ca->free_inc)) + fifo_pop(&ca->free_inc, bucket); + else + break; + + allocator_wait(ca, (int) fifo_free(&ca->free) > + atomic_read(&ca->discards_in_flight)); + + if (ca->discard) { + allocator_wait(ca, !list_empty(&ca->discards)); + do_discard(ca, bucket); + } else { + fifo_push(&ca->free, bucket); + closure_wake_up(&ca->set->bucket_wait); + } + } + + allocator_wait(ca, ca->set->gc_mark_valid); + invalidate_buckets(ca); + + allocator_wait(ca, !atomic_read(&ca->set->prio_blocked) || + !CACHE_SYNC(&ca->set->sb)); + + if (CACHE_SYNC(&ca->set->sb) && + (!fifo_empty(&ca->free_inc) || + ca->need_save_prio > 64)) { + bch_prio_write(ca); + } + } +} + +long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl) +{ + long r = -1; +again: + wake_up(&ca->set->alloc_wait); + + if (fifo_used(&ca->free) > ca->watermark[watermark] && + fifo_pop(&ca->free, r)) { + struct bucket *b = ca->buckets + r; +#ifdef CONFIG_BCACHE_EDEBUG + size_t iter; + long i; + + for (iter = 0; iter < prio_buckets(ca) * 2; iter++) + BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); + + fifo_for_each(i, &ca->free, iter) + BUG_ON(i == r); + fifo_for_each(i, &ca->free_inc, iter) + BUG_ON(i == r); + fifo_for_each(i, &ca->unused, iter) + BUG_ON(i == r); +#endif + BUG_ON(atomic_read(&b->pin) != 1); + + SET_GC_SECTORS_USED(b, ca->sb.bucket_size); + + if (watermark <= WATERMARK_METADATA) { + SET_GC_MARK(b, GC_MARK_METADATA); + b->prio = BTREE_PRIO; + } else { + SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + b->prio = INITIAL_PRIO; + } + + return r; + } + + pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu", + atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free), + fifo_used(&ca->free_inc), fifo_used(&ca->unused)); + + if (cl) { + closure_wait(&ca->set->bucket_wait, cl); + + if (closure_blocking(cl)) { + mutex_unlock(&ca->set->bucket_lock); + closure_sync(cl); + mutex_lock(&ca->set->bucket_lock); + goto again; + } + } + + return -1; +} + +void bch_bucket_free(struct cache_set *c, struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) { + struct bucket *b = PTR_BUCKET(c, k, i); + + SET_GC_MARK(b, 0); + SET_GC_SECTORS_USED(b, 0); + bch_bucket_add_unused(PTR_CACHE(c, k, i), b); + } +} + +int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, + struct bkey *k, int n, struct closure *cl) +{ + int i; + + lockdep_assert_held(&c->bucket_lock); + BUG_ON(!n || n > c->caches_loaded || n > 8); + + bkey_init(k); + + /* sort by free space/prio of oldest data in caches */ + + for (i = 0; i < n; i++) { + struct cache *ca = c->cache_by_alloc[i]; + long b = bch_bucket_alloc(ca, watermark, cl); + + if (b == -1) + goto err; + + k->ptr[i] = PTR(ca->buckets[b].gen, + bucket_to_sector(c, b), + ca->sb.nr_this_dev); + + SET_KEY_PTRS(k, i + 1); + } + + return 0; +err: + bch_bucket_free(c, k); + __bkey_put(c, k); + return -1; +} + +int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, + struct bkey *k, int n, struct closure *cl) +{ + int ret; + mutex_lock(&c->bucket_lock); + ret = __bch_bucket_alloc_set(c, watermark, k, n, cl); + mutex_unlock(&c->bucket_lock); + return ret; +} + +/* Init */ + +void bch_cache_allocator_exit(struct cache *ca) +{ + struct discard *d; + + while (!list_empty(&ca->discards)) { + d = list_first_entry(&ca->discards, struct discard, list); + cancel_work_sync(&d->work); + list_del(&d->list); + kfree(d); + } +} + +int bch_cache_allocator_init(struct cache *ca) +{ + unsigned i; + + /* + * Reserve: + * Prio/gen writes first + * Then 8 for btree allocations + * Then half for the moving garbage collector + */ + + ca->watermark[WATERMARK_PRIO] = 0; + + ca->watermark[WATERMARK_METADATA] = prio_buckets(ca); + + ca->watermark[WATERMARK_MOVINGGC] = 8 + + ca->watermark[WATERMARK_METADATA]; + + ca->watermark[WATERMARK_NONE] = ca->free.size / 2 + + ca->watermark[WATERMARK_MOVINGGC]; + + for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) { + struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL); + if (!d) + return -ENOMEM; + + d->ca = ca; + INIT_WORK(&d->work, discard_finish); + list_add(&d->list, &ca->discards); + } + + return 0; +} -- cgit v1.2.3