summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-12-24 22:06:03 +0400
committerLinus Torvalds <torvalds@linux-foundation.org>2013-12-24 22:06:03 +0400
commitc5fdd531b593205a0e8c1ab253dd85b764fd0ecb (patch)
tree9117fff70e04077117d447f3eafd9b9d0d1a1041 /drivers/md
parent70e672fa7376d45e262119bda3e1a301e519d4c3 (diff)
parentfc1bc35443741e132dd0118e8dbac53f69a6f76e (diff)
downloadlinux-c5fdd531b593205a0e8c1ab253dd85b764fd0ecb.tar.xz
Merge branch 'for-linus' of git://git.kernel.dk/linux-block
Pull block fixes from Jens Axboe: - fix for a memory leak on certain unplug events - a collection of bcache fixes from Kent and Nicolas - a few null_blk fixes and updates form Matias - a marking of static of functions in the stec pci-e driver * 'for-linus' of git://git.kernel.dk/linux-block: null_blk: support submit_queues on use_per_node_hctx null_blk: set use_per_node_hctx param to false null_blk: corrections to documentation null_blk: warning on ignored submit_queues param null_blk: refactor init and init errors code paths null_blk: documentation null_blk: mem garbage on NUMA systems during init drivers: block: Mark the functions as static in skd_main.c bcache: New writeback PD controller bcache: bugfix for race between moving_gc and bucket_invalidate bcache: fix for gc and writeback race bcache: bugfix - moving_gc now moves only correct buckets bcache: fix for gc crashing when no sectors are used bcache: Fix heap_peek() macro bcache: Fix for can_attach_cache() bcache: Fix dirty_data accounting bcache: Use uninterruptible sleep in writeback bcache: kthread don't set writeback task to INTERUPTIBLE block: fix memory leaks on unplugging block device bcache: fix sparse non static symbol warning
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bcache/alloc.c2
-rw-r--r--drivers/md/bcache/bcache.h12
-rw-r--r--drivers/md/bcache/btree.c27
-rw-r--r--drivers/md/bcache/movinggc.c21
-rw-r--r--drivers/md/bcache/super.c2
-rw-r--r--drivers/md/bcache/sysfs.c50
-rw-r--r--drivers/md/bcache/util.c8
-rw-r--r--drivers/md/bcache/util.h2
-rw-r--r--drivers/md/bcache/writeback.c53
9 files changed, 111 insertions, 66 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 2b46bf1d7e40..4c9852d92b0a 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -421,9 +421,11 @@ out:
if (watermark <= WATERMARK_METADATA) {
SET_GC_MARK(b, GC_MARK_METADATA);
+ SET_GC_MOVE(b, 0);
b->prio = BTREE_PRIO;
} else {
SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
+ SET_GC_MOVE(b, 0);
b->prio = INITIAL_PRIO;
}
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 4beb55a0ff30..754f43177483 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -197,7 +197,7 @@ struct bucket {
uint8_t disk_gen;
uint8_t last_gc; /* Most out of date gen in the btree */
uint8_t gc_gen;
- uint16_t gc_mark;
+ uint16_t gc_mark; /* Bitfield used by GC. See below for field */
};
/*
@@ -209,7 +209,8 @@ BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2);
#define GC_MARK_RECLAIMABLE 0
#define GC_MARK_DIRTY 1
#define GC_MARK_METADATA 2
-BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14);
+BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 13);
+BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1);
#include "journal.h"
#include "stats.h"
@@ -372,14 +373,14 @@ struct cached_dev {
unsigned char writeback_percent;
unsigned writeback_delay;
- int writeback_rate_change;
- int64_t writeback_rate_derivative;
uint64_t writeback_rate_target;
+ int64_t writeback_rate_proportional;
+ int64_t writeback_rate_derivative;
+ int64_t writeback_rate_change;
unsigned writeback_rate_update_seconds;
unsigned writeback_rate_d_term;
unsigned writeback_rate_p_term_inverse;
- unsigned writeback_rate_d_smooth;
};
enum alloc_watermarks {
@@ -445,7 +446,6 @@ struct cache {
* call prio_write() to keep gens from wrapping.
*/
uint8_t need_save_prio;
- unsigned gc_move_threshold;
/*
* If nonzero, we know we aren't going to find any buckets to invalidate
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 5e2765aadce1..31bb53fcc67a 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1561,6 +1561,28 @@ size_t bch_btree_gc_finish(struct cache_set *c)
SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
GC_MARK_METADATA);
+ /* don't reclaim buckets to which writeback keys point */
+ rcu_read_lock();
+ for (i = 0; i < c->nr_uuids; i++) {
+ struct bcache_device *d = c->devices[i];
+ struct cached_dev *dc;
+ struct keybuf_key *w, *n;
+ unsigned j;
+
+ if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
+ continue;
+ dc = container_of(d, struct cached_dev, disk);
+
+ spin_lock(&dc->writeback_keys.lock);
+ rbtree_postorder_for_each_entry_safe(w, n,
+ &dc->writeback_keys.keys, node)
+ for (j = 0; j < KEY_PTRS(&w->key); j++)
+ SET_GC_MARK(PTR_BUCKET(c, &w->key, j),
+ GC_MARK_DIRTY);
+ spin_unlock(&dc->writeback_keys.lock);
+ }
+ rcu_read_unlock();
+
for_each_cache(ca, c, i) {
uint64_t *i;
@@ -1817,7 +1839,8 @@ static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
if (KEY_START(k) > KEY_START(insert) + sectors_found)
goto check_failed;
- if (KEY_PTRS(replace_key) != KEY_PTRS(k))
+ if (KEY_PTRS(k) != KEY_PTRS(replace_key) ||
+ KEY_DIRTY(k) != KEY_DIRTY(replace_key))
goto check_failed;
/* skip past gen */
@@ -2217,7 +2240,7 @@ struct btree_insert_op {
struct bkey *replace_key;
};
-int btree_insert_fn(struct btree_op *b_op, struct btree *b)
+static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
{
struct btree_insert_op *op = container_of(b_op,
struct btree_insert_op, op);
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 7c1275e66025..f2f0998c4a91 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -25,10 +25,9 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k)
unsigned i;
for (i = 0; i < KEY_PTRS(k); i++) {
- struct cache *ca = PTR_CACHE(c, k, i);
struct bucket *g = PTR_BUCKET(c, k, i);
- if (GC_SECTORS_USED(g) < ca->gc_move_threshold)
+ if (GC_MOVE(g))
return true;
}
@@ -65,11 +64,16 @@ static void write_moving_finish(struct closure *cl)
static void read_moving_endio(struct bio *bio, int error)
{
+ struct bbio *b = container_of(bio, struct bbio, bio);
struct moving_io *io = container_of(bio->bi_private,
struct moving_io, cl);
if (error)
io->op.error = error;
+ else if (!KEY_DIRTY(&b->key) &&
+ ptr_stale(io->op.c, &b->key, 0)) {
+ io->op.error = -EINTR;
+ }
bch_bbio_endio(io->op.c, bio, error, "reading data to move");
}
@@ -141,6 +145,11 @@ static void read_moving(struct cache_set *c)
if (!w)
break;
+ if (ptr_stale(c, &w->key, 0)) {
+ bch_keybuf_del(&c->moving_gc_keys, w);
+ continue;
+ }
+
io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
* DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
GFP_KERNEL);
@@ -184,7 +193,8 @@ static bool bucket_cmp(struct bucket *l, struct bucket *r)
static unsigned bucket_heap_top(struct cache *ca)
{
- return GC_SECTORS_USED(heap_peek(&ca->heap));
+ struct bucket *b;
+ return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0;
}
void bch_moving_gc(struct cache_set *c)
@@ -226,9 +236,8 @@ void bch_moving_gc(struct cache_set *c)
sectors_to_move -= GC_SECTORS_USED(b);
}
- ca->gc_move_threshold = bucket_heap_top(ca);
-
- pr_debug("threshold %u", ca->gc_move_threshold);
+ while (heap_pop(&ca->heap, b, bucket_cmp))
+ SET_GC_MOVE(b, 1);
}
mutex_unlock(&c->bucket_lock);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index dec15cd2d797..c57bfa071a57 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1676,7 +1676,7 @@ err:
static bool can_attach_cache(struct cache *ca, struct cache_set *c)
{
return ca->sb.block_size == c->sb.block_size &&
- ca->sb.bucket_size == c->sb.block_size &&
+ ca->sb.bucket_size == c->sb.bucket_size &&
ca->sb.nr_in_set == c->sb.nr_in_set;
}
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 80d4c2bee18a..a1f85612f0b3 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -83,7 +83,6 @@ rw_attribute(writeback_rate);
rw_attribute(writeback_rate_update_seconds);
rw_attribute(writeback_rate_d_term);
rw_attribute(writeback_rate_p_term_inverse);
-rw_attribute(writeback_rate_d_smooth);
read_attribute(writeback_rate_debug);
read_attribute(stripe_size);
@@ -129,31 +128,41 @@ SHOW(__bch_cached_dev)
var_printf(writeback_running, "%i");
var_print(writeback_delay);
var_print(writeback_percent);
- sysfs_print(writeback_rate, dc->writeback_rate.rate);
+ sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9);
var_print(writeback_rate_update_seconds);
var_print(writeback_rate_d_term);
var_print(writeback_rate_p_term_inverse);
- var_print(writeback_rate_d_smooth);
if (attr == &sysfs_writeback_rate_debug) {
+ char rate[20];
char dirty[20];
- char derivative[20];
char target[20];
- bch_hprint(dirty,
- bcache_dev_sectors_dirty(&dc->disk) << 9);
- bch_hprint(derivative, dc->writeback_rate_derivative << 9);
+ char proportional[20];
+ char derivative[20];
+ char change[20];
+ s64 next_io;
+
+ bch_hprint(rate, dc->writeback_rate.rate << 9);
+ bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9);
bch_hprint(target, dc->writeback_rate_target << 9);
+ bch_hprint(proportional,dc->writeback_rate_proportional << 9);
+ bch_hprint(derivative, dc->writeback_rate_derivative << 9);
+ bch_hprint(change, dc->writeback_rate_change << 9);
+
+ next_io = div64_s64(dc->writeback_rate.next - local_clock(),
+ NSEC_PER_MSEC);
return sprintf(buf,
- "rate:\t\t%u\n"
- "change:\t\t%i\n"
+ "rate:\t\t%s/sec\n"
"dirty:\t\t%s\n"
+ "target:\t\t%s\n"
+ "proportional:\t%s\n"
"derivative:\t%s\n"
- "target:\t\t%s\n",
- dc->writeback_rate.rate,
- dc->writeback_rate_change,
- dirty, derivative, target);
+ "change:\t\t%s/sec\n"
+ "next io:\t%llims\n",
+ rate, dirty, target, proportional,
+ derivative, change, next_io);
}
sysfs_hprint(dirty_data,
@@ -189,6 +198,7 @@ STORE(__cached_dev)
struct kobj_uevent_env *env;
#define d_strtoul(var) sysfs_strtoul(var, dc->var)
+#define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX)
#define d_strtoi_h(var) sysfs_hatoi(var, dc->var)
sysfs_strtoul(data_csum, dc->disk.data_csum);
@@ -197,16 +207,15 @@ STORE(__cached_dev)
d_strtoul(writeback_metadata);
d_strtoul(writeback_running);
d_strtoul(writeback_delay);
- sysfs_strtoul_clamp(writeback_rate,
- dc->writeback_rate.rate, 1, 1000000);
+
sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
- d_strtoul(writeback_rate_update_seconds);
+ sysfs_strtoul_clamp(writeback_rate,
+ dc->writeback_rate.rate, 1, INT_MAX);
+
+ d_strtoul_nonzero(writeback_rate_update_seconds);
d_strtoul(writeback_rate_d_term);
- d_strtoul(writeback_rate_p_term_inverse);
- sysfs_strtoul_clamp(writeback_rate_p_term_inverse,
- dc->writeback_rate_p_term_inverse, 1, INT_MAX);
- d_strtoul(writeback_rate_d_smooth);
+ d_strtoul_nonzero(writeback_rate_p_term_inverse);
d_strtoi_h(sequential_cutoff);
d_strtoi_h(readahead);
@@ -313,7 +322,6 @@ static struct attribute *bch_cached_dev_files[] = {
&sysfs_writeback_rate_update_seconds,
&sysfs_writeback_rate_d_term,
&sysfs_writeback_rate_p_term_inverse,
- &sysfs_writeback_rate_d_smooth,
&sysfs_writeback_rate_debug,
&sysfs_dirty_data,
&sysfs_stripe_size,
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 462214eeacbe..bb37618e7664 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -209,7 +209,13 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
{
uint64_t now = local_clock();
- d->next += div_u64(done, d->rate);
+ d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+
+ if (time_before64(now + NSEC_PER_SEC, d->next))
+ d->next = now + NSEC_PER_SEC;
+
+ if (time_after64(now - NSEC_PER_SEC * 2, d->next))
+ d->next = now - NSEC_PER_SEC * 2;
return time_after64(d->next, now)
? div_u64(d->next - now, NSEC_PER_SEC / HZ)
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 362c4b3f8b4a..1030c6020e98 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -110,7 +110,7 @@ do { \
_r; \
})
-#define heap_peek(h) ((h)->size ? (h)->data[0] : NULL)
+#define heap_peek(h) ((h)->used ? (h)->data[0] : NULL)
#define heap_full(h) ((h)->used == (h)->size)
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 99053b1251be..6c44fe059c27 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -30,38 +30,40 @@ static void __update_writeback_rate(struct cached_dev *dc)
/* PD controller */
- int change = 0;
- int64_t error;
int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
int64_t derivative = dirty - dc->disk.sectors_dirty_last;
+ int64_t proportional = dirty - target;
+ int64_t change;
dc->disk.sectors_dirty_last = dirty;
- derivative *= dc->writeback_rate_d_term;
- derivative = clamp(derivative, -dirty, dirty);
+ /* Scale to sectors per second */
- derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
- dc->writeback_rate_d_smooth, 0);
+ proportional *= dc->writeback_rate_update_seconds;
+ proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);
- /* Avoid divide by zero */
- if (!target)
- goto out;
+ derivative = div_s64(derivative, dc->writeback_rate_update_seconds);
- error = div64_s64((dirty + derivative - target) << 8, target);
+ derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
+ (dc->writeback_rate_d_term /
+ dc->writeback_rate_update_seconds) ?: 1, 0);
+
+ derivative *= dc->writeback_rate_d_term;
+ derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);
- change = div_s64((dc->writeback_rate.rate * error) >> 8,
- dc->writeback_rate_p_term_inverse);
+ change = proportional + derivative;
/* Don't increase writeback rate if the device isn't keeping up */
if (change > 0 &&
time_after64(local_clock(),
- dc->writeback_rate.next + 10 * NSEC_PER_MSEC))
+ dc->writeback_rate.next + NSEC_PER_MSEC))
change = 0;
dc->writeback_rate.rate =
- clamp_t(int64_t, dc->writeback_rate.rate + change,
+ clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
1, NSEC_PER_MSEC);
-out:
+
+ dc->writeback_rate_proportional = proportional;
dc->writeback_rate_derivative = derivative;
dc->writeback_rate_change = change;
dc->writeback_rate_target = target;
@@ -87,15 +89,11 @@ static void update_writeback_rate(struct work_struct *work)
static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
{
- uint64_t ret;
-
if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
!dc->writeback_percent)
return 0;
- ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
-
- return min_t(uint64_t, ret, HZ);
+ return bch_next_delay(&dc->writeback_rate, sectors);
}
struct dirty_io {
@@ -241,7 +239,7 @@ static void read_dirty(struct cached_dev *dc)
if (KEY_START(&w->key) != dc->last_read ||
jiffies_to_msecs(delay) > 50)
while (!kthread_should_stop() && delay)
- delay = schedule_timeout_interruptible(delay);
+ delay = schedule_timeout_uninterruptible(delay);
dc->last_read = KEY_OFFSET(&w->key);
@@ -438,7 +436,7 @@ static int bch_writeback_thread(void *arg)
while (delay &&
!kthread_should_stop() &&
!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
- delay = schedule_timeout_interruptible(delay);
+ delay = schedule_timeout_uninterruptible(delay);
}
}
@@ -476,6 +474,8 @@ void bch_sectors_dirty_init(struct cached_dev *dc)
bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
sectors_dirty_init_fn, 0);
+
+ dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
}
int bch_cached_dev_writeback_init(struct cached_dev *dc)
@@ -490,18 +490,15 @@ int bch_cached_dev_writeback_init(struct cached_dev *dc)
dc->writeback_delay = 30;
dc->writeback_rate.rate = 1024;
- dc->writeback_rate_update_seconds = 30;
- dc->writeback_rate_d_term = 16;
- dc->writeback_rate_p_term_inverse = 64;
- dc->writeback_rate_d_smooth = 8;
+ dc->writeback_rate_update_seconds = 5;
+ dc->writeback_rate_d_term = 30;
+ dc->writeback_rate_p_term_inverse = 6000;
dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
"bcache_writeback");
if (IS_ERR(dc->writeback_thread))
return PTR_ERR(dc->writeback_thread);
- set_task_state(dc->writeback_thread, TASK_INTERRUPTIBLE);
-
INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
schedule_delayed_work(&dc->writeback_rate_update,
dc->writeback_rate_update_seconds * HZ);