diff options
Diffstat (limited to 'fs/bcachefs/btree_journal_iter.c')
-rw-r--r-- | fs/bcachefs/btree_journal_iter.c | 635 |
1 files changed, 635 insertions, 0 deletions
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c new file mode 100644 index 000000000000..c1657182c275 --- /dev/null +++ b/fs/bcachefs/btree_journal_iter.c @@ -0,0 +1,635 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_buf.h" +#include "bset.h" +#include "btree_cache.h" +#include "btree_journal_iter.h" +#include "journal_io.h" + +#include <linux/sort.h> + +/* + * For managing keys we read from the journal: until journal replay works normal + * btree lookups need to be able to find and return keys from the journal where + * they overwrite what's in the btree, so we have a special iterator and + * operations for the regular btree iter code to use: + */ + +static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) +{ + size_t gap_size = keys->size - keys->nr; + + if (idx >= keys->gap) + idx += gap_size; + return idx; +} + +static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) +{ + return keys->data + idx_to_pos(keys, idx); +} + +static size_t __bch2_journal_key_search(struct journal_keys *keys, + enum btree_id id, unsigned level, + struct bpos pos) +{ + size_t l = 0, r = keys->nr, m; + + while (l < r) { + m = l + ((r - l) >> 1); + if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) + l = m + 1; + else + r = m; + } + + BUG_ON(l < keys->nr && + __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); + + BUG_ON(l && + __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); + + return l; +} + +static size_t bch2_journal_key_search(struct journal_keys *keys, + enum btree_id id, unsigned level, + struct bpos pos) +{ + return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); +} + +/* Returns first non-overwritten key >= search key: */ +struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos, + struct bpos end_pos, size_t *idx) +{ + struct journal_keys *keys = &c->journal_keys; + unsigned iters = 0; + struct journal_key *k; + + BUG_ON(*idx > keys->nr); +search: + if (!*idx) + *idx = __bch2_journal_key_search(keys, btree_id, level, pos); + + while (*idx && + __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) { + --(*idx); + iters++; + if (iters == 10) { + *idx = 0; + goto search; + } + } + + while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { + if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) + return NULL; + + if (k->overwritten) { + (*idx)++; + continue; + } + + if (__journal_key_cmp(btree_id, level, pos, k) <= 0) + return k->k; + + (*idx)++; + iters++; + if (iters == 10) { + *idx = 0; + goto search; + } + } + + return NULL; +} + +struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos) +{ + size_t idx = 0; + + return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); +} + +static void journal_iter_verify(struct journal_iter *iter) +{ + struct journal_keys *keys = iter->keys; + size_t gap_size = keys->size - keys->nr; + + BUG_ON(iter->idx >= keys->gap && + iter->idx < keys->gap + gap_size); + + if (iter->idx < keys->size) { + struct journal_key *k = keys->data + iter->idx; + + int cmp = cmp_int(k->btree_id, iter->btree_id) ?: + cmp_int(k->level, iter->level); + BUG_ON(cmp < 0); + } +} + +static void journal_iters_fix(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + /* The key we just inserted is immediately before the gap: */ + size_t gap_end = keys->gap + (keys->size - keys->nr); + struct journal_key *new_key = &keys->data[keys->gap - 1]; + struct journal_iter *iter; + + /* + * If an iterator points one after the key we just inserted, decrement + * the iterator so it points at the key we just inserted - if the + * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will + * handle that: + */ + list_for_each_entry(iter, &c->journal_iters, list) { + journal_iter_verify(iter); + if (iter->idx == gap_end && + new_key->btree_id == iter->btree_id && + new_key->level == iter->level) + iter->idx = keys->gap - 1; + journal_iter_verify(iter); + } +} + +static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) +{ + struct journal_keys *keys = &c->journal_keys; + struct journal_iter *iter; + size_t gap_size = keys->size - keys->nr; + + list_for_each_entry(iter, &c->journal_iters, list) { + if (iter->idx > old_gap) + iter->idx -= gap_size; + if (iter->idx >= new_gap) + iter->idx += gap_size; + } +} + +int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) +{ + struct journal_key n = { + .btree_id = id, + .level = level, + .k = k, + .allocated = true, + /* + * Ensure these keys are done last by journal replay, to unblock + * journal reclaim: + */ + .journal_seq = U32_MAX, + }; + struct journal_keys *keys = &c->journal_keys; + size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); + + BUG_ON(test_bit(BCH_FS_rw, &c->flags)); + + if (idx < keys->size && + journal_key_cmp(&n, &keys->data[idx]) == 0) { + if (keys->data[idx].allocated) + kfree(keys->data[idx].k); + keys->data[idx] = n; + return 0; + } + + if (idx > keys->gap) + idx -= keys->size - keys->nr; + + size_t old_gap = keys->gap; + + if (keys->nr == keys->size) { + journal_iters_move_gap(c, old_gap, keys->size); + old_gap = keys->size; + + struct journal_keys new_keys = { + .nr = keys->nr, + .size = max_t(size_t, keys->size, 8) * 2, + }; + + new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL); + if (!new_keys.data) { + bch_err(c, "%s: error allocating new key array (size %zu)", + __func__, new_keys.size); + return -BCH_ERR_ENOMEM_journal_key_insert; + } + + /* Since @keys was full, there was no gap: */ + memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr); + kvfree(keys->data); + keys->data = new_keys.data; + keys->nr = new_keys.nr; + keys->size = new_keys.size; + + /* And now the gap is at the end: */ + keys->gap = keys->nr; + } + + journal_iters_move_gap(c, old_gap, idx); + + move_gap(keys, idx); + + keys->nr++; + keys->data[keys->gap++] = n; + + journal_iters_fix(c); + + return 0; +} + +/* + * Can only be used from the recovery thread while we're still RO - can't be + * used once we've got RW, as journal_keys is at that point used by multiple + * threads: + */ +int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) +{ + struct bkey_i *n; + int ret; + + n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); + if (!n) + return -BCH_ERR_ENOMEM_journal_key_insert; + + bkey_copy(n, k); + ret = bch2_journal_key_insert_take(c, id, level, n); + if (ret) + kfree(n); + return ret; +} + +int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, + unsigned level, struct bpos pos) +{ + struct bkey_i whiteout; + + bkey_init(&whiteout.k); + whiteout.k.p = pos; + + return bch2_journal_key_insert(c, id, level, &whiteout); +} + +bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree, + unsigned level, struct bpos pos) +{ + struct journal_keys *keys = &trans->c->journal_keys; + size_t idx = bch2_journal_key_search(keys, btree, level, pos); + + if (!trans->journal_replay_not_finished) + return false; + + return (idx < keys->size && + keys->data[idx].btree_id == btree && + keys->data[idx].level == level && + bpos_eq(keys->data[idx].k->k.p, pos) && + bkey_deleted(&keys->data[idx].k->k)); +} + +void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, + unsigned level, struct bpos pos) +{ + struct journal_keys *keys = &c->journal_keys; + size_t idx = bch2_journal_key_search(keys, btree, level, pos); + + if (idx < keys->size && + keys->data[idx].btree_id == btree && + keys->data[idx].level == level && + bpos_eq(keys->data[idx].k->k.p, pos)) + keys->data[idx].overwritten = true; +} + +static void bch2_journal_iter_advance(struct journal_iter *iter) +{ + if (iter->idx < iter->keys->size) { + iter->idx++; + if (iter->idx == iter->keys->gap) + iter->idx += iter->keys->size - iter->keys->nr; + } +} + +static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) +{ + journal_iter_verify(iter); + + while (iter->idx < iter->keys->size) { + struct journal_key *k = iter->keys->data + iter->idx; + + int cmp = cmp_int(k->btree_id, iter->btree_id) ?: + cmp_int(k->level, iter->level); + if (cmp > 0) + break; + BUG_ON(cmp); + + if (!k->overwritten) + return bkey_i_to_s_c(k->k); + + bch2_journal_iter_advance(iter); + } + + return bkey_s_c_null; +} + +static void bch2_journal_iter_exit(struct journal_iter *iter) +{ + list_del(&iter->list); +} + +static void bch2_journal_iter_init(struct bch_fs *c, + struct journal_iter *iter, + enum btree_id id, unsigned level, + struct bpos pos) +{ + iter->btree_id = id; + iter->level = level; + iter->keys = &c->journal_keys; + iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); + + journal_iter_verify(iter); +} + +static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) +{ + return bch2_btree_node_iter_peek_unpack(&iter->node_iter, + iter->b, &iter->unpacked); +} + +static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) +{ + bch2_btree_node_iter_advance(&iter->node_iter, iter->b); +} + +void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) +{ + if (bpos_eq(iter->pos, SPOS_MAX)) + iter->at_end = true; + else + iter->pos = bpos_successor(iter->pos); +} + +static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter) +{ + struct btree_and_journal_iter iter = *_iter; + struct bch_fs *c = iter.trans->c; + unsigned level = iter.journal.level; + struct bkey_buf tmp; + unsigned nr = test_bit(BCH_FS_started, &c->flags) + ? (level > 1 ? 0 : 2) + : (level > 1 ? 1 : 16); + + iter.prefetch = false; + bch2_bkey_buf_init(&tmp); + + while (nr--) { + bch2_btree_and_journal_iter_advance(&iter); + struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter); + if (!k.k) + break; + + bch2_bkey_buf_reassemble(&tmp, c, k); + bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1); + } + + bch2_bkey_buf_exit(&tmp, c); +} + +struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) +{ + struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret; + + if (iter->prefetch && iter->journal.level) + btree_and_journal_iter_prefetch(iter); +again: + if (iter->at_end) + return bkey_s_c_null; + + while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && + bpos_lt(btree_k.k->p, iter->pos)) + bch2_journal_iter_advance_btree(iter); + + if (iter->trans->journal_replay_not_finished) + while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && + bpos_lt(journal_k.k->p, iter->pos)) + bch2_journal_iter_advance(&iter->journal); + + ret = journal_k.k && + (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) + ? journal_k + : btree_k; + + if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key)) + ret = bkey_s_c_null; + + if (ret.k) { + iter->pos = ret.k->p; + if (bkey_deleted(ret.k)) { + bch2_btree_and_journal_iter_advance(iter); + goto again; + } + } else { + iter->pos = SPOS_MAX; + iter->at_end = true; + } + + return ret; +} + +void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) +{ + bch2_journal_iter_exit(&iter->journal); +} + +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, + struct btree_and_journal_iter *iter, + struct btree *b, + struct btree_node_iter node_iter, + struct bpos pos) +{ + memset(iter, 0, sizeof(*iter)); + + iter->trans = trans; + iter->b = b; + iter->node_iter = node_iter; + iter->pos = b->data->min_key; + iter->at_end = false; + INIT_LIST_HEAD(&iter->journal.list); + + if (trans->journal_replay_not_finished) { + bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos); + if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags)) + list_add(&iter->journal.list, &trans->c->journal_iters); + } +} + +/* + * this version is used by btree_gc before filesystem has gone RW and + * multithreaded, so uses the journal_iters list: + */ +void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, + struct btree_and_journal_iter *iter, + struct btree *b) +{ + struct btree_node_iter node_iter; + + bch2_btree_node_iter_init_from_start(&node_iter, b); + __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key); +} + +/* sort and dedup all keys in the journal: */ + +void bch2_journal_entries_free(struct bch_fs *c) +{ + struct journal_replay **i; + struct genradix_iter iter; + + genradix_for_each(&c->journal_entries, iter, i) + kvfree(*i); + genradix_free(&c->journal_entries); +} + +/* + * When keys compare equal, oldest compares first: + */ +static int journal_sort_key_cmp(const void *_l, const void *_r) +{ + const struct journal_key *l = _l; + const struct journal_key *r = _r; + + return journal_key_cmp(l, r) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset); +} + +void bch2_journal_keys_put(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + + BUG_ON(atomic_read(&keys->ref) <= 0); + + if (!atomic_dec_and_test(&keys->ref)) + return; + + move_gap(keys, keys->nr); + + darray_for_each(*keys, i) + if (i->allocated) + kfree(i->k); + + kvfree(keys->data); + keys->data = NULL; + keys->nr = keys->gap = keys->size = 0; + + bch2_journal_entries_free(c); +} + +static void __journal_keys_sort(struct journal_keys *keys) +{ + sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL); + + cond_resched(); + + struct journal_key *dst = keys->data; + + darray_for_each(*keys, src) { + /* + * We don't accumulate accounting keys here because we have to + * compare each individual accounting key against the version in + * the btree during replay: + */ + if (src->k->k.type != KEY_TYPE_accounting && + src + 1 < &darray_top(*keys) && + !journal_key_cmp(src, src + 1)) + continue; + + *dst++ = *src; + } + + keys->nr = dst - keys->data; +} + +int bch2_journal_keys_sort(struct bch_fs *c) +{ + struct genradix_iter iter; + struct journal_replay *i, **_i; + struct journal_keys *keys = &c->journal_keys; + size_t nr_read = 0; + + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (journal_replay_ignore(i)) + continue; + + cond_resched(); + + for_each_jset_key(k, entry, &i->j) { + struct journal_key n = (struct journal_key) { + .btree_id = entry->btree_id, + .level = entry->level, + .k = k, + .journal_seq = le64_to_cpu(i->j.seq), + .journal_offset = k->_data - i->j._data, + }; + + if (darray_push(keys, n)) { + __journal_keys_sort(keys); + + if (keys->nr * 8 > keys->size * 7) { + bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu", + keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq)); + return -BCH_ERR_ENOMEM_journal_keys_sort; + } + + BUG_ON(darray_push(keys, n)); + } + + nr_read++; + } + } + + __journal_keys_sort(keys); + keys->gap = keys->nr; + + bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr); + return 0; +} + +void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree, + unsigned level_min, unsigned level_max, + struct bpos start, struct bpos end) +{ + struct journal_keys *keys = &c->journal_keys; + size_t dst = 0; + + move_gap(keys, keys->nr); + + darray_for_each(*keys, i) + if (!(i->btree_id == btree && + i->level >= level_min && + i->level <= level_max && + bpos_ge(i->k->k.p, start) && + bpos_le(i->k->k.p, end))) + keys->data[dst++] = *i; + keys->nr = keys->gap = dst; +} + +void bch2_journal_keys_dump(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + struct printbuf buf = PRINTBUF; + + pr_info("%zu keys:", keys->nr); + + move_gap(keys, keys->nr); + + darray_for_each(*keys, i) { + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); + pr_err("%s l=%u %s", bch2_btree_id_str(i->btree_id), i->level, buf.buf); + } + printbuf_exit(&buf); +} |