author | Kent Overstreet <kent.overstreet@gmail.com> | 2017-03-17 09:18:50 +0300
committer | Kent Overstreet <kent.overstreet@linux.dev> | 2023-10-23 00:08:07 +0300
commit | 1c6fdbd8f2465ddfb73a01ec620cbf3d14044e1a (patch)
tree | 9192de91a00908ee898bc331ac8b0544d6fc030a /fs/bcachefs/rebalance.c
parent | 0d29a833b7b1800bd2759bbc064b5ada4729caf5 (diff)
download | linux-1c6fdbd8f2465ddfb73a01ec620cbf3d14044e1a.tar.xz
bcachefs: Initial commit
Initially forked from drivers/md/bcache, bcachefs is a new copy-on-write
filesystem with every feature you could possibly want.
Website: https://bcachefs.org
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Diffstat (limited to 'fs/bcachefs/rebalance.c')
-rw-r--r-- | fs/bcachefs/rebalance.c | 342 |
1 file changed, 342 insertions, 0 deletions
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
new file mode 100644
index 000000000000..04824f667693
--- /dev/null
+++ b/fs/bcachefs/rebalance.c
@@ -0,0 +1,342 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "clock.h"
+#include "disk_groups.h"
+#include "extents.h"
+#include "io.h"
+#include "move.h"
+#include "rebalance.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/sched/cputime.h>
+
+static inline bool rebalance_ptr_pred(struct bch_fs *c,
+                                      const struct bch_extent_ptr *ptr,
+                                      struct bch_extent_crc_unpacked crc,
+                                      struct bch_io_opts *io_opts)
+{
+        if (io_opts->background_target &&
+            !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) &&
+            !ptr->cached)
+                return true;
+
+        if (io_opts->background_compression &&
+            crc.compression_type !=
+            bch2_compression_opt_to_type[io_opts->background_compression])
+                return true;
+
+        return false;
+}
+
+void bch2_rebalance_add_key(struct bch_fs *c,
+                            struct bkey_s_c k,
+                            struct bch_io_opts *io_opts)
+{
+        const struct bch_extent_ptr *ptr;
+        struct bch_extent_crc_unpacked crc;
+        struct bkey_s_c_extent e;
+
+        if (!bkey_extent_is_data(k.k))
+                return;
+
+        if (!io_opts->background_target &&
+            !io_opts->background_compression)
+                return;
+
+        e = bkey_s_c_to_extent(k);
+
+        extent_for_each_ptr_crc(e, ptr, crc)
+                if (rebalance_ptr_pred(c, ptr, crc, io_opts)) {
+                        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+                        if (atomic64_add_return(crc.compressed_size,
+                                                &ca->rebalance_work) ==
+                            crc.compressed_size)
+                                rebalance_wakeup(c);
+                }
+}
+
+void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
+{
+        if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
+            sectors)
+                rebalance_wakeup(c);
+}
+
+static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
+                                    enum bkey_type type,
+                                    struct bkey_s_c_extent e,
+                                    struct bch_io_opts *io_opts,
+                                    struct data_opts *data_opts)
+{
+        const struct bch_extent_ptr *ptr;
+        struct bch_extent_crc_unpacked crc;
+
+        /* Make sure we have room to add a new pointer: */
+        if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
+            BKEY_EXTENT_VAL_U64s_MAX)
+                return DATA_SKIP;
+
+        extent_for_each_ptr_crc(e, ptr, crc)
+                if (rebalance_ptr_pred(c, ptr, crc, io_opts))
+                        goto found;
+
+        return DATA_SKIP;
+found:
+        data_opts->target = io_opts->background_target;
+        data_opts->btree_insert_flags = 0;
+        return DATA_ADD_REPLICAS;
+}
+
+struct rebalance_work {
+        int dev_most_full_idx;
+        unsigned dev_most_full_percent;
+        u64 dev_most_full_work;
+        u64 dev_most_full_capacity;
+        u64 total_work;
+};
+
+static void rebalance_work_accumulate(struct rebalance_work *w,
+                u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
+{
+        unsigned percent_full;
+        u64 work = dev_work + unknown_dev;
+
+        if (work < dev_work || work < unknown_dev)
+                work = U64_MAX;
+        work = min(work, capacity);
+
+        percent_full = div_u64(work * 100, capacity);
+
+        if (percent_full >= w->dev_most_full_percent) {
+                w->dev_most_full_idx = idx;
+                w->dev_most_full_percent = percent_full;
+                w->dev_most_full_work = work;
+                w->dev_most_full_capacity = capacity;
+        }
+
+        if (w->total_work + dev_work >= w->total_work &&
+            w->total_work + dev_work >= dev_work)
+                w->total_work += dev_work;
+}
+
+static struct rebalance_work rebalance_work(struct bch_fs *c)
+{
+        struct bch_dev *ca;
+        struct rebalance_work ret = { .dev_most_full_idx = -1 };
+        u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
+        unsigned i;
+
+        for_each_online_member(ca, c, i)
+                rebalance_work_accumulate(&ret,
+                        atomic64_read(&ca->rebalance_work),
+                        unknown_dev,
+                        bucket_to_sector(ca, ca->mi.nbuckets -
+                                         ca->mi.first_bucket),
+                        i);
+
+        rebalance_work_accumulate(&ret,
+                unknown_dev, 0, c->capacity, -1);
+
+        return ret;
+}
+
+static void rebalance_work_reset(struct bch_fs *c)
+{
+        struct bch_dev *ca;
+        unsigned i;
+
+        for_each_online_member(ca, c, i)
+                atomic64_set(&ca->rebalance_work, 0);
+
+        atomic64_set(&c->rebalance.work_unknown_dev, 0);
+}
+
+static unsigned long curr_cputime(void)
+{
+        u64 utime, stime;
+
+        task_cputime_adjusted(current, &utime, &stime);
+        return nsecs_to_jiffies(utime + stime);
+}
+
+static int bch2_rebalance_thread(void *arg)
+{
+        struct bch_fs *c = arg;
+        struct bch_fs_rebalance *r = &c->rebalance;
+        struct io_clock *clock = &c->io_clock[WRITE];
+        struct rebalance_work w, p;
+        unsigned long start, prev_start;
+        unsigned long prev_run_time, prev_run_cputime;
+        unsigned long cputime, prev_cputime;
+        unsigned long io_start;
+        long throttle;
+
+        set_freezable();
+
+        io_start = atomic_long_read(&clock->now);
+        p = rebalance_work(c);
+        prev_start = jiffies;
+        prev_cputime = curr_cputime();
+
+        while (!kthread_wait_freezable(r->enabled)) {
+                start = jiffies;
+                cputime = curr_cputime();
+
+                prev_run_time = start - prev_start;
+                prev_run_cputime = cputime - prev_cputime;
+
+                w = rebalance_work(c);
+                BUG_ON(!w.dev_most_full_capacity);
+
+                if (!w.total_work) {
+                        r->state = REBALANCE_WAITING;
+                        kthread_wait_freezable(rebalance_work(c).total_work);
+                        continue;
+                }
+
+                /*
+                 * If there isn't much work to do, throttle cpu usage:
+                 */
+                throttle = prev_run_cputime * 100 /
+                        max(1U, w.dev_most_full_percent) -
+                        prev_run_time;
+
+                if (w.dev_most_full_percent < 20 && throttle > 0) {
+                        r->state = REBALANCE_THROTTLED;
+                        r->throttled_until_iotime = io_start +
+                                div_u64(w.dev_most_full_capacity *
+                                        (20 - w.dev_most_full_percent),
+                                        50);
+                        r->throttled_until_cputime = start + throttle;
+
+                        bch2_kthread_io_clock_wait(clock,
+                                r->throttled_until_iotime,
+                                throttle);
+                        continue;
+                }
+
+                /* minimum 1 mb/sec: */
+                r->pd.rate.rate =
+                        max_t(u64, 1 << 11,
+                              r->pd.rate.rate *
+                              max(p.dev_most_full_percent, 1U) /
+                              max(w.dev_most_full_percent, 1U));
+
+                io_start = atomic_long_read(&clock->now);
+                p = w;
+                prev_start = start;
+                prev_cputime = cputime;
+
+                r->state = REBALANCE_RUNNING;
+                memset(&r->move_stats, 0, sizeof(r->move_stats));
+                rebalance_work_reset(c);
+
+                bch2_move_data(c,
+                               /* ratelimiting disabled for now */
+                               NULL, /* &r->pd.rate, */
+                               writepoint_ptr(&c->rebalance_write_point),
+                               POS_MIN, POS_MAX,
+                               rebalance_pred, NULL,
+                               &r->move_stats);
+        }
+
+        return 0;
+}
+
+ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf)
+{
+        char *out = buf, *end = out + PAGE_SIZE;
+        struct bch_fs_rebalance *r = &c->rebalance;
+        struct rebalance_work w = rebalance_work(c);
+        char h1[21], h2[21];
+
+        bch2_hprint(h1, w.dev_most_full_work << 9);
+        bch2_hprint(h2, w.dev_most_full_capacity << 9);
+        out += scnprintf(out, end - out,
+                         "fullest_dev (%i):\t%s/%s\n",
+                         w.dev_most_full_idx, h1, h2);
+
+        bch2_hprint(h1, w.total_work << 9);
+        bch2_hprint(h2, c->capacity << 9);
+        out += scnprintf(out, end - out,
+                         "total work:\t\t%s/%s\n",
+                         h1, h2);
+
+        out += scnprintf(out, end - out,
+                         "rate:\t\t\t%u\n",
+                         r->pd.rate.rate);
+
+        switch (r->state) {
+        case REBALANCE_WAITING:
+                out += scnprintf(out, end - out, "waiting\n");
+                break;
+        case REBALANCE_THROTTLED:
+                bch2_hprint(h1,
+                            (r->throttled_until_iotime -
+                             atomic_long_read(&c->io_clock[WRITE].now)) << 9);
+                out += scnprintf(out, end - out,
+                                 "throttled for %lu sec or %s io\n",
+                                 (r->throttled_until_cputime - jiffies) / HZ,
+                                 h1);
+                break;
+        case REBALANCE_RUNNING:
+                out += scnprintf(out, end - out, "running\n");
+                out += scnprintf(out, end - out, "pos %llu:%llu\n",
+                                 r->move_stats.iter.pos.inode,
+                                 r->move_stats.iter.pos.offset);
+                break;
+        }
+
+        return out - buf;
+}
+
+void bch2_rebalance_stop(struct bch_fs *c)
+{
+        struct task_struct *p;
+
+        c->rebalance.pd.rate.rate = UINT_MAX;
+        bch2_ratelimit_reset(&c->rebalance.pd.rate);
+
+        p = rcu_dereference_protected(c->rebalance.thread, 1);
+        c->rebalance.thread = NULL;
+
+        if (p) {
+                /* for sychronizing with rebalance_wakeup() */
+                synchronize_rcu();
+
+                kthread_stop(p);
+                put_task_struct(p);
+        }
+}
+
+int bch2_rebalance_start(struct bch_fs *c)
+{
+        struct task_struct *p;
+
+        if (c->opts.nochanges)
+                return 0;
+
+        p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
+        if (IS_ERR(p))
+                return PTR_ERR(p);
+
+        get_task_struct(p);
+        rcu_assign_pointer(c->rebalance.thread, p);
+        wake_up_process(p);
+        return 0;
+}
+
+void bch2_fs_rebalance_init(struct bch_fs *c)
+{
+        bch2_pd_controller_init(&c->rebalance.pd);
+
+        atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
+}
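The CPU-throttling heuristic in bch2_rebalance_thread() above can be read in isolation: the thread measures the CPU time it consumed during the previous interval, scales it by the fullest device's work percentage, and sleeps for the difference whenever there is little rebalance work pending, so its CPU share tracks roughly that percentage. Below is a small, self-contained sketch of that arithmetic in plain C; it is not kernel code, and the names used here (throttle_input, compute_throttle) are illustrative only and do not appear in the patch.

```
/*
 * Standalone illustration of the throttle calculation used by the
 * rebalance thread in the patch above. Units are jiffies; all names
 * are hypothetical and for demonstration only.
 */
#include <stdio.h>

/* Snapshot of one scheduling interval. */
struct throttle_input {
        unsigned long run_time;          /* wall-clock time since last check */
        unsigned long run_cputime;       /* CPU time consumed in that window */
        unsigned dev_most_full_percent;  /* work/capacity of fullest device */
};

/*
 * Mirrors: throttle = prev_run_cputime * 100 /
 *                     max(1U, w.dev_most_full_percent) - prev_run_time;
 *
 * Scaling the consumed CPU time by 100/percent targets a CPU share of
 * roughly dev_most_full_percent %; anything beyond that becomes sleep time.
 */
static long compute_throttle(const struct throttle_input *in)
{
        unsigned pct = in->dev_most_full_percent ? in->dev_most_full_percent : 1;

        return (long)(in->run_cputime * 100 / pct) - (long)in->run_time;
}

int main(void)
{
        /* Example: fullest device is 5% "dirty", and the previous pass
         * burned 50 jiffies of CPU over a 200-jiffy window. */
        struct throttle_input in = {
                .run_time = 200,
                .run_cputime = 50,
                .dev_most_full_percent = 5,
        };
        long throttle = compute_throttle(&in);

        /* The patch only sleeps when the fullest device is < 20% full
         * and the computed throttle is positive. */
        if (in.dev_most_full_percent < 20 && throttle > 0)
                printf("throttle for %ld jiffies\n", throttle);
        else
                printf("keep running\n");

        return 0;
}
```

With the example numbers, the sleep comes out to 50 * 100 / 5 - 200 = 800 jiffies: sleeping that long after a 200-jiffy, 50-jiffy-CPU run keeps the thread's CPU share near the 5% fill level of the fullest device, which is the intent of the throttling branch in the patch.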