diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-02-11 01:05:11 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-02-11 01:05:11 +0300 |
commit | 9454473c9dccb7b9d25e5baf915a082bfd490b33 (patch) | |
tree | 46f7f1a8886088e2f0184f1cf0e47c8ac12d4849 /drivers | |
parent | cc5cb5af3a3363bc6f0530703895bf9c5fa2f159 (diff) | |
parent | 8525e5ff456592effe83640ea1702525e35b0363 (diff) | |
download | linux-9454473c9dccb7b9d25e5baf915a082bfd490b33.tar.xz |
Merge tag 'for-linus-20180210' of git://git.kernel.dk/linux-block
Pull block fixes from Jens Axboe:
"A few fixes to round off the merge window on the block side:
- a set of bcache fixes by way of Michael Lyle, from the usual bcache
suspects.
- add a simple-to-hook-into function for bpf EIO error injection.
- fix blk-wbt that mischarectized flushes as reads. Improve the logic
so that flushes and writes are accounted as writes, and only reads
as reads. From me.
- fix requeue crash in BFQ, from Paolo"
* tag 'for-linus-20180210' of git://git.kernel.dk/linux-block:
block, bfq: add requeue-request hook
bcache: fix for data collapse after re-attaching an attached device
bcache: return attach error when no cache set exist
bcache: set writeback_rate_update_seconds in range [1, 60] seconds
bcache: fix for allocator and register thread race
bcache: set error_limit correctly
bcache: properly set task state in bch_writeback_thread()
bcache: fix high CPU occupancy during journal
bcache: add journal statistic
block: Add should_fail_bio() for bpf error injection
blk-wbt: account flush requests correctly
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/md/bcache/alloc.c | 4 | ||||
-rw-r--r-- | drivers/md/bcache/bcache.h | 9 | ||||
-rw-r--r-- | drivers/md/bcache/btree.c | 9 | ||||
-rw-r--r-- | drivers/md/bcache/journal.c | 52 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 25 | ||||
-rw-r--r-- | drivers/md/bcache/sysfs.c | 34 | ||||
-rw-r--r-- | drivers/md/bcache/util.h | 2 | ||||
-rw-r--r-- | drivers/md/bcache/writeback.c | 9 | ||||
-rw-r--r-- | drivers/md/bcache/writeback.h | 3 |
9 files changed, 111 insertions, 36 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 6cc6c0f9c3a9..458e1d38577d 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -287,8 +287,10 @@ do { \ break; \ \ mutex_unlock(&(ca)->set->bucket_lock); \ - if (kthread_should_stop()) \ + if (kthread_should_stop()) { \ + set_current_state(TASK_RUNNING); \ return 0; \ + } \ \ schedule(); \ mutex_lock(&(ca)->set->bucket_lock); \ diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 5e2d4e80198e..12e5197f186c 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -658,10 +658,15 @@ struct cache_set { atomic_long_t writeback_keys_done; atomic_long_t writeback_keys_failed; + atomic_long_t reclaim; + atomic_long_t flush_write; + atomic_long_t retry_flush_write; + enum { ON_ERROR_UNREGISTER, ON_ERROR_PANIC, } on_error; +#define DEFAULT_IO_ERROR_LIMIT 8 unsigned error_limit; unsigned error_decay; @@ -675,6 +680,8 @@ struct cache_set { #define BUCKET_HASH_BITS 12 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; + + DECLARE_HEAP(struct btree *, flush_btree); }; struct bbio { @@ -917,7 +924,7 @@ void bcache_write_super(struct cache_set *); int bch_flash_dev_create(struct cache_set *c, uint64_t size); -int bch_cached_dev_attach(struct cached_dev *, struct cache_set *); +int bch_cached_dev_attach(struct cached_dev *, struct cache_set *, uint8_t *); void bch_cached_dev_detach(struct cached_dev *); void bch_cached_dev_run(struct cached_dev *); void bcache_device_stop(struct bcache_device *); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index bf3a48aa9a9a..fad9fe8817eb 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1869,14 +1869,17 @@ void bch_initial_gc_finish(struct cache_set *c) */ for_each_cache(ca, c, i) { for_each_bucket(b, ca) { - if (fifo_full(&ca->free[RESERVE_PRIO])) + if (fifo_full(&ca->free[RESERVE_PRIO]) && + fifo_full(&ca->free[RESERVE_BTREE])) break; if (bch_can_invalidate_bucket(ca, b) && !GC_MARK(b)) { __bch_invalidate_one_bucket(ca, b); - fifo_push(&ca->free[RESERVE_PRIO], - b - ca->buckets); + if (!fifo_push(&ca->free[RESERVE_PRIO], + b - ca->buckets)) + fifo_push(&ca->free[RESERVE_BTREE], + b - ca->buckets); } } } diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index a87165c1d8e5..1b736b860739 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -368,6 +368,12 @@ err: } /* Journalling */ +#define journal_max_cmp(l, r) \ + (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \ + fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) +#define journal_min_cmp(l, r) \ + (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \ + fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) static void btree_flush_write(struct cache_set *c) { @@ -375,28 +381,41 @@ static void btree_flush_write(struct cache_set *c) * Try to find the btree node with that references the oldest journal * entry, best is our current candidate and is locked if non NULL: */ - struct btree *b, *best; - unsigned i; + struct btree *b; + int i; + + atomic_long_inc(&c->flush_write); + retry: - best = NULL; - - for_each_cached_btree(b, c, i) - if (btree_current_write(b)->journal) { - if (!best) - best = b; - else if (journal_pin_cmp(c, - btree_current_write(best)->journal, - btree_current_write(b)->journal)) { - best = b; + spin_lock(&c->journal.lock); + if (heap_empty(&c->flush_btree)) { + for_each_cached_btree(b, c, i) + if (btree_current_write(b)->journal) { + if (!heap_full(&c->flush_btree)) + heap_add(&c->flush_btree, b, + journal_max_cmp); + else if (journal_max_cmp(b, + heap_peek(&c->flush_btree))) { + c->flush_btree.data[0] = b; + heap_sift(&c->flush_btree, 0, + journal_max_cmp); + } } - } - b = best; + for (i = c->flush_btree.used / 2 - 1; i >= 0; --i) + heap_sift(&c->flush_btree, i, journal_min_cmp); + } + + b = NULL; + heap_pop(&c->flush_btree, b, journal_min_cmp); + spin_unlock(&c->journal.lock); + if (b) { mutex_lock(&b->write_lock); if (!btree_current_write(b)->journal) { mutex_unlock(&b->write_lock); /* We raced */ + atomic_long_inc(&c->retry_flush_write); goto retry; } @@ -476,6 +495,8 @@ static void journal_reclaim(struct cache_set *c) unsigned iter, n = 0; atomic_t p; + atomic_long_inc(&c->reclaim); + while (!atomic_read(&fifo_front(&c->journal.pin))) fifo_pop(&c->journal.pin, p); @@ -819,7 +840,8 @@ int bch_journal_alloc(struct cache_set *c) j->w[0].c = c; j->w[1].c = c; - if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || + if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) || + !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) return -ENOMEM; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 133b81225ea9..312895788036 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -957,7 +957,8 @@ void bch_cached_dev_detach(struct cached_dev *dc) cached_dev_put(dc); } -int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) +int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, + uint8_t *set_uuid) { uint32_t rtime = cpu_to_le32(get_seconds()); struct uuid_entry *u; @@ -965,7 +966,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) bdevname(dc->bdev, buf); - if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)) + if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) || + (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))) return -ENOENT; if (dc->disk.c) { @@ -1194,7 +1196,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page, list_add(&dc->list, &uncached_devices); list_for_each_entry(c, &bch_cache_sets, list) - bch_cached_dev_attach(dc, c); + bch_cached_dev_attach(dc, c, NULL); if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE || BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) @@ -1553,7 +1555,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->congested_read_threshold_us = 2000; c->congested_write_threshold_us = 20000; - c->error_limit = 8 << IO_ERROR_SHIFT; + c->error_limit = DEFAULT_IO_ERROR_LIMIT; return c; err: @@ -1716,7 +1718,7 @@ static void run_cache_set(struct cache_set *c) bcache_write_super(c); list_for_each_entry_safe(dc, t, &uncached_devices, list) - bch_cached_dev_attach(dc, c); + bch_cached_dev_attach(dc, c, NULL); flash_devs_run(c); @@ -1833,6 +1835,7 @@ void bch_cache_release(struct kobject *kobj) static int cache_alloc(struct cache *ca) { size_t free; + size_t btree_buckets; struct bucket *b; __module_get(THIS_MODULE); @@ -1840,9 +1843,19 @@ static int cache_alloc(struct cache *ca) bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8); + /* + * when ca->sb.njournal_buckets is not zero, journal exists, + * and in bch_journal_replay(), tree node may split, + * so bucket of RESERVE_BTREE type is needed, + * the worst situation is all journal buckets are valid journal, + * and all the keys need to replay, + * so the number of RESERVE_BTREE type buckets should be as much + * as journal buckets + */ + btree_buckets = ca->sb.njournal_buckets ?: 8; free = roundup_pow_of_two(ca->sb.nbuckets) >> 10; - if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) || + if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets, GFP_KERNEL) || !init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) || diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index b4184092c727..78cd7bd50fdd 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -65,6 +65,9 @@ read_attribute(bset_tree_stats); read_attribute(state); read_attribute(cache_read_races); +read_attribute(reclaim); +read_attribute(flush_write); +read_attribute(retry_flush_write); read_attribute(writeback_keys_done); read_attribute(writeback_keys_failed); read_attribute(io_errors); @@ -195,7 +198,7 @@ STORE(__cached_dev) { struct cached_dev *dc = container_of(kobj, struct cached_dev, disk.kobj); - ssize_t v = size; + ssize_t v; struct cache_set *c; struct kobj_uevent_env *env; @@ -215,7 +218,9 @@ STORE(__cached_dev) sysfs_strtoul_clamp(writeback_rate, dc->writeback_rate.rate, 1, INT_MAX); - d_strtoul_nonzero(writeback_rate_update_seconds); + sysfs_strtoul_clamp(writeback_rate_update_seconds, + dc->writeback_rate_update_seconds, + 1, WRITEBACK_RATE_UPDATE_SECS_MAX); d_strtoul(writeback_rate_i_term_inverse); d_strtoul_nonzero(writeback_rate_p_term_inverse); @@ -267,17 +272,20 @@ STORE(__cached_dev) } if (attr == &sysfs_attach) { - if (bch_parse_uuid(buf, dc->sb.set_uuid) < 16) + uint8_t set_uuid[16]; + + if (bch_parse_uuid(buf, set_uuid) < 16) return -EINVAL; + v = -ENOENT; list_for_each_entry(c, &bch_cache_sets, list) { - v = bch_cached_dev_attach(dc, c); + v = bch_cached_dev_attach(dc, c, set_uuid); if (!v) return size; } pr_err("Can't attach %s: cache set not found", buf); - size = v; + return v; } if (attr == &sysfs_detach && dc->disk.c) @@ -545,6 +553,15 @@ SHOW(__bch_cache_set) sysfs_print(cache_read_races, atomic_long_read(&c->cache_read_races)); + sysfs_print(reclaim, + atomic_long_read(&c->reclaim)); + + sysfs_print(flush_write, + atomic_long_read(&c->flush_write)); + + sysfs_print(retry_flush_write, + atomic_long_read(&c->retry_flush_write)); + sysfs_print(writeback_keys_done, atomic_long_read(&c->writeback_keys_done)); sysfs_print(writeback_keys_failed, @@ -556,7 +573,7 @@ SHOW(__bch_cache_set) /* See count_io_errors for why 88 */ sysfs_print(io_error_halflife, c->error_decay * 88); - sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); + sysfs_print(io_error_limit, c->error_limit); sysfs_hprint(congested, ((uint64_t) bch_get_congested(c)) << 9); @@ -656,7 +673,7 @@ STORE(__bch_cache_set) } if (attr == &sysfs_io_error_limit) - c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; + c->error_limit = strtoul_or_return(buf); /* See count_io_errors() for why 88 */ if (attr == &sysfs_io_error_halflife) @@ -731,6 +748,9 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_bset_tree_stats, &sysfs_cache_read_races, + &sysfs_reclaim, + &sysfs_flush_write, + &sysfs_retry_flush_write, &sysfs_writeback_keys_done, &sysfs_writeback_keys_failed, diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 4df4c5c1cab2..a6763db7f061 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -112,6 +112,8 @@ do { \ #define heap_full(h) ((h)->used == (h)->size) +#define heap_empty(h) ((h)->used == 0) + #define DECLARE_FIFO(type, name) \ struct { \ size_t front, back, size, mask; \ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 51306a19ab03..f1d2fc15abcc 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -564,18 +564,21 @@ static int bch_writeback_thread(void *arg) while (!kthread_should_stop()) { down_write(&dc->writeback_lock); + set_current_state(TASK_INTERRUPTIBLE); if (!atomic_read(&dc->has_dirty) || (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) && !dc->writeback_running)) { up_write(&dc->writeback_lock); - set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) + if (kthread_should_stop()) { + set_current_state(TASK_RUNNING); return 0; + } schedule(); continue; } + set_current_state(TASK_RUNNING); searched_full_index = refill_dirty(dc); @@ -652,7 +655,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_rate.rate = 1024; dc->writeback_rate_minimum = 8; - dc->writeback_rate_update_seconds = 5; + dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; dc->writeback_rate_p_term_inverse = 40; dc->writeback_rate_i_term_inverse = 10000; diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 66f1c527fa24..587b25599856 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -8,6 +8,9 @@ #define MAX_WRITEBACKS_IN_PASS 5 #define MAX_WRITESIZE_IN_PASS 5000 /* *512b */ +#define WRITEBACK_RATE_UPDATE_SECS_MAX 60 +#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5 + /* * 14 (16384ths) is chosen here as something that each backing device * should be a reasonable fraction of the share, and not to blow up |