summaryrefslogtreecommitdiff
path: root/fs/bcachefs/btree_iter.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/bcachefs/btree_iter.c')
-rw-r--r--fs/bcachefs/btree_iter.c1313
1 files changed, 809 insertions, 504 deletions
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index eef9b89c561d..f8829b667ad3 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -16,6 +16,7 @@
#include "journal_io.h"
#include "replicas.h"
#include "snapshot.h"
+#include "super.h"
#include "trace.h"
#include <linux/random.h>
@@ -114,11 +115,9 @@ static inline bool btree_path_pos_in_node(struct btree_path *path,
!btree_path_pos_after_node(path, b);
}
-/* Btree iterator: */
+/* Debug: */
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-static void bch2_btree_path_verify_cached(struct btree_trans *trans,
+static void __bch2_btree_path_verify_cached(struct btree_trans *trans,
struct btree_path *path)
{
struct bkey_cached *ck;
@@ -135,7 +134,7 @@ static void bch2_btree_path_verify_cached(struct btree_trans *trans,
btree_node_unlock(trans, path, 0);
}
-static void bch2_btree_path_verify_level(struct btree_trans *trans,
+static void __bch2_btree_path_verify_level(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
struct btree_path_level *l;
@@ -147,16 +146,13 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans,
struct printbuf buf3 = PRINTBUF;
const char *msg;
- if (!bch2_debug_check_iterators)
- return;
-
l = &path->l[level];
tmp = l->iter;
locked = btree_node_locked(path, level);
if (path->cached) {
if (!level)
- bch2_btree_path_verify_cached(trans, path);
+ __bch2_btree_path_verify_cached(trans, path);
return;
}
@@ -217,7 +213,7 @@ err:
msg, level, buf1.buf, buf2.buf, buf3.buf);
}
-static void bch2_btree_path_verify(struct btree_trans *trans,
+static void __bch2_btree_path_verify(struct btree_trans *trans,
struct btree_path *path)
{
struct bch_fs *c = trans->c;
@@ -229,25 +225,23 @@ static void bch2_btree_path_verify(struct btree_trans *trans,
break;
}
- bch2_btree_path_verify_level(trans, path, i);
+ __bch2_btree_path_verify_level(trans, path, i);
}
- bch2_btree_path_verify_locks(path);
+ bch2_btree_path_verify_locks(trans, path);
}
-void bch2_trans_verify_paths(struct btree_trans *trans)
+void __bch2_trans_verify_paths(struct btree_trans *trans)
{
struct btree_path *path;
unsigned iter;
trans_for_each_path(trans, path, iter)
- bch2_btree_path_verify(trans, path);
+ __bch2_btree_path_verify(trans, path);
}
-static void bch2_btree_iter_verify(struct btree_iter *iter)
+static void __bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
-
BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);
BUG_ON((iter->flags & BTREE_ITER_is_extents) &&
@@ -258,11 +252,11 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
!btree_type_has_snapshot_field(iter->btree_id));
if (iter->update_path)
- bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
- bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
+ __bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
+ __bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
}
-static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
+static void __bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
{
BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) &&
!iter->pos.snapshot);
@@ -270,20 +264,19 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) &&
iter->pos.snapshot != iter->snapshot);
- BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
- bkey_gt(iter->pos, iter->k.p));
+ BUG_ON(iter->flags & BTREE_ITER_all_snapshots ? !bpos_eq(iter->pos, iter->k.p) :
+ !(iter->flags & BTREE_ITER_is_extents) ? !bkey_eq(iter->pos, iter->k.p) :
+ (bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
+ bkey_gt(iter->pos, iter->k.p)));
}
-static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
+static int __bch2_btree_iter_verify_ret(struct btree_trans *trans,
+ struct btree_iter *iter, struct bkey_s_c k)
{
- struct btree_trans *trans = iter->trans;
struct btree_iter copy;
struct bkey_s_c prev;
int ret = 0;
- if (!bch2_debug_check_iterators)
- return 0;
-
if (!(iter->flags & BTREE_ITER_filter_snapshots))
return 0;
@@ -297,7 +290,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k
bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
BTREE_ITER_nopreserve|
BTREE_ITER_all_snapshots);
- prev = bch2_btree_iter_prev(&copy);
+ prev = bch2_btree_iter_prev(trans, &copy);
if (!prev.k)
goto out;
@@ -324,10 +317,10 @@ out:
return ret;
}
-void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
+void __bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
struct bpos pos)
{
- bch2_trans_verify_not_unlocked(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
struct btree_path *path;
struct trans_for_each_path_inorder_iter iter;
@@ -357,17 +350,40 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf);
}
-#else
-
static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
- struct btree_path *path, unsigned l) {}
+ struct btree_path *path, unsigned l)
+{
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
+ __bch2_btree_path_verify_level(trans, path, l);
+}
+
static inline void bch2_btree_path_verify(struct btree_trans *trans,
- struct btree_path *path) {}
-static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
-static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
-static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
+ struct btree_path *path)
+{
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
+ __bch2_btree_path_verify(trans, path);
+}
-#endif
+static inline void bch2_btree_iter_verify(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
+ __bch2_btree_iter_verify(trans, iter);
+}
+
+static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
+{
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
+ __bch2_btree_iter_verify_entry_exit(iter);
+}
+
+static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ return static_branch_unlikely(&bch2_debug_check_iterators)
+ ? __bch2_btree_iter_verify_ret(trans, iter, k)
+ : 0;
+}
/* Btree path: fixups after btree updates */
@@ -521,7 +537,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans,
__bch2_btree_node_iter_fix(path, b, node_iter, t,
where, clobber_u64s, new_u64s);
- if (bch2_debug_check_iterators)
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
bch2_btree_node_iter_verify(node_iter, b);
}
@@ -560,20 +576,6 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
bch2_btree_node_iter_peek_all(&l->iter, l->b));
}
-static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_path_level *l,
- struct bkey *u)
-{
- struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
- bch2_btree_node_iter_peek(&l->iter, l->b));
-
- path->pos = k.k ? k.k->p : l->b->key.k.p;
- trans->paths_sorted = false;
- bch2_btree_path_verify_level(trans, path, l - path->l);
- return k;
-}
-
static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
struct btree_path *path,
struct btree_path_level *l,
@@ -697,6 +699,19 @@ void bch2_trans_node_add(struct btree_trans *trans,
bch2_trans_revalidate_updates_in_node(trans, b);
}
+void bch2_trans_node_drop(struct btree_trans *trans,
+ struct btree *b)
+{
+ struct btree_path *path;
+ unsigned i, level = b->c.level;
+
+ trans_for_each_path(trans, path, i)
+ if (path->l[level].b == b) {
+ btree_node_unlock(trans, path, level);
+ path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+ }
+}
+
/*
* A btree node has been modified in such a way as to invalidate iterators - fix
* them:
@@ -720,7 +735,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
- struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b;
+ struct btree_root *r = bch2_btree_id_root(c, path->btree_id);
enum six_lock_type lock_type;
unsigned i;
int ret;
@@ -728,7 +743,12 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
EBUG_ON(path->nodes_locked);
while (1) {
- b = READ_ONCE(*rootp);
+ struct btree *b = READ_ONCE(r->b);
+ if (unlikely(!b)) {
+ BUG_ON(!r->error);
+ return r->error;
+ }
+
path->level = READ_ONCE(b->c.level);
if (unlikely(path->level < depth_want)) {
@@ -748,14 +768,12 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
ret = btree_node_lock(trans, path, &b->c,
path->level, lock_type, trace_ip);
if (unlikely(ret)) {
- if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
- continue;
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;
BUG();
}
- if (likely(b == READ_ONCE(*rootp) &&
+ if (likely(b == READ_ONCE(r->b) &&
b->c.level == path->level &&
!race_fault())) {
for (i = 0; i < path->level; i++)
@@ -825,6 +843,8 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p
bch2_bkey_buf_init(&tmp);
+ jiter->fail_if_too_many_whiteouts = true;
+
while (nr-- && !ret) {
if (!bch2_btree_node_relock(trans, path, path->level))
break;
@@ -870,8 +890,7 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
struct btree_path *path,
- unsigned flags,
- struct bkey_buf *out)
+ unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_path_level *l = path_l(path);
@@ -895,7 +914,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
goto err;
}
- bch2_bkey_buf_reassemble(out, c, k);
+ bkey_reassemble(&trans->btree_path_down, k);
if ((flags & BTREE_ITER_prefetch) &&
c->opts.btree_node_prefetch)
@@ -906,6 +925,22 @@ err:
return ret;
}
+static noinline_for_stack int btree_node_missing_err(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "node not found at pos ");
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_str(&buf, " within parent node ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key));
+
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return bch_err_throw(c, btree_need_topology_repair);
+}
+
static __always_inline int btree_path_down(struct btree_trans *trans,
struct btree_path *path,
unsigned flags,
@@ -916,51 +951,38 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
struct btree *b;
unsigned level = path->level - 1;
enum six_lock_type lock_type = __btree_lock_want(path, level);
- struct bkey_buf tmp;
int ret;
EBUG_ON(!btree_node_locked(path, path->level));
- bch2_bkey_buf_init(&tmp);
-
if (unlikely(trans->journal_replay_not_finished)) {
- ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
+ ret = btree_node_iter_and_journal_peek(trans, path, flags);
if (ret)
- goto err;
+ return ret;
} else {
struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
- if (!k) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "node not found at pos ");
- bch2_bpos_to_text(&buf, path->pos);
- prt_str(&buf, " within parent node ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key));
-
- bch2_fs_fatal_error(c, "%s", buf.buf);
- printbuf_exit(&buf);
- ret = -BCH_ERR_btree_need_topology_repair;
- goto err;
- }
+ if (unlikely(!k))
+ return btree_node_missing_err(trans, path);
- bch2_bkey_buf_unpack(&tmp, c, l->b, k);
+ bch2_bkey_unpack(l->b, &trans->btree_path_down, k);
- if ((flags & BTREE_ITER_prefetch) &&
+ if (unlikely((flags & BTREE_ITER_prefetch)) &&
c->opts.btree_node_prefetch) {
ret = btree_path_prefetch(trans, path);
if (ret)
- goto err;
+ return ret;
}
}
- b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
+ b = bch2_btree_node_get(trans, path, &trans->btree_path_down,
+ level, lock_type, trace_ip);
ret = PTR_ERR_OR_ZERO(b);
if (unlikely(ret))
- goto err;
+ return ret;
- if (likely(!trans->journal_replay_not_finished &&
- tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
- unlikely(b != btree_node_mem_ptr(tmp.k)))
+ if (unlikely(b != btree_node_mem_ptr(&trans->btree_path_down)) &&
+ likely(!trans->journal_replay_not_finished &&
+ trans->btree_path_down.k.type == KEY_TYPE_btree_ptr_v2))
btree_node_mem_ptr_set(trans, path, level + 1, b);
if (btree_node_read_locked(path, level + 1))
@@ -971,10 +993,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
path->level = level;
bch2_btree_path_level_init(trans, path, b);
- bch2_btree_path_verify_locks(path);
-err:
- bch2_bkey_buf_exit(&tmp, c);
- return ret;
+ bch2_btree_path_verify_locks(trans, path);
+ return 0;
}
static int bch2_btree_path_traverse_all(struct btree_trans *trans)
@@ -986,7 +1006,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans)
int ret = 0;
if (trans->in_traverse_all)
- return -BCH_ERR_transaction_restart_in_traverse_all;
+ return bch_err_throw(c, transaction_restart_in_traverse_all);
trans->in_traverse_all = true;
retry_all:
@@ -1000,7 +1020,7 @@ retry_all:
bch2_trans_unlock(trans);
cond_resched();
- trans_set_locked(trans);
+ trans_set_locked(trans, false);
if (unlikely(trans->memory_allocation_failure)) {
struct closure cl;
@@ -1083,7 +1103,7 @@ static void btree_path_set_level_down(struct btree_trans *trans,
if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
btree_node_unlock(trans, path, l);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
bch2_btree_path_verify(trans, path);
}
@@ -1156,7 +1176,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
}
if (path->cached) {
- ret = bch2_btree_path_traverse_cached(trans, path, flags);
+ ret = bch2_btree_path_traverse_cached(trans, path_idx, flags);
goto out;
}
@@ -1267,7 +1287,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
{
int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos);
- bch2_trans_verify_not_in_restart(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
EBUG_ON(!trans->paths[path_idx].ref);
trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos);
@@ -1281,7 +1301,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
if (unlikely(path->cached)) {
btree_node_unlock(trans, path, 0);
path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
goto out;
}
@@ -1310,7 +1330,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
}
if (unlikely(level != path->level)) {
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
__bch2_btree_path_unlock(trans, path);
}
out:
@@ -1379,45 +1399,45 @@ static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_p
void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent)
{
- struct btree_path *path = trans->paths + path_idx, *dup;
+ struct btree_path *path = trans->paths + path_idx, *dup = NULL;
if (!__btree_path_put(trans, path, intent))
return;
+ if (!path->preserve && !path->should_be_locked)
+ goto free;
+
dup = path->preserve
? have_path_at_pos(trans, path)
: have_node_at_pos(trans, path);
-
- trace_btree_path_free(trans, path_idx, dup);
-
- if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
+ if (!dup)
return;
- if (path->should_be_locked && !trans->restarted) {
- if (!dup)
- return;
-
+ /*
+ * If we need this path locked, the duplicate also has te be locked
+ * before we free this one:
+ */
+ if (path->should_be_locked &&
+ !dup->should_be_locked &&
+ !trans->restarted) {
if (!(trans->locked
? bch2_btree_path_relock_norestart(trans, dup)
: bch2_btree_path_can_relock(trans, dup)))
return;
- }
- if (dup) {
- dup->preserve |= path->preserve;
- dup->should_be_locked |= path->should_be_locked;
+ dup->should_be_locked = true;
}
- __bch2_path_free(trans, path_idx);
-}
-
-static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path,
- bool intent)
-{
- if (!__btree_path_put(trans, trans->paths + path, intent))
- return;
+ BUG_ON(path->should_be_locked &&
+ !trans->restarted &&
+ trans->locked &&
+ !btree_node_locked(dup, dup->level));
- __bch2_path_free(trans, path);
+ path->should_be_locked = false;
+ dup->preserve |= path->preserve;
+free:
+ trace_btree_path_free(trans, path_idx, dup);
+ __bch2_path_free(trans, path_idx);
}
void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
@@ -1427,17 +1447,31 @@ void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_
(void *) trans->last_begin_ip);
}
-void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
+static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct printbuf buf = PRINTBUF;
+ bch2_prt_backtrace(&buf, &trans->last_restarted_trace);
+ panic("in transaction restart: %s, last restarted by\n%s",
+ bch2_err_str(trans->restarted),
+ buf.buf);
+#else
panic("in transaction restart: %s, last restarted by %pS\n",
bch2_err_str(trans->restarted),
(void *) trans->last_restarted_ip);
+#endif
}
-void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans)
+void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *trans)
{
- panic("trans should be locked, unlocked by %pS\n",
- (void *) trans->last_unlock_ip);
+ if (trans->restarted)
+ bch2_trans_in_restart_error(trans);
+
+ if (!trans->locked)
+ panic("trans should be locked, unlocked by %pS\n",
+ (void *) trans->last_unlock_ip);
+
+ BUG();
}
noinline __cold
@@ -1450,10 +1484,11 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
trans_for_each_update(trans, i) {
struct bkey_s_c old = { &i->old_k, i->old_v };
- prt_printf(buf, "update: btree=%s cached=%u %pS\n",
- bch2_btree_id_str(i->btree_id),
- i->cached,
- (void *) i->ip_allocated);
+ prt_str(buf, "update: btree=");
+ bch2_btree_id_to_text(buf, i->btree_id);
+ prt_printf(buf, " cached=%u %pS\n",
+ i->cached,
+ (void *) i->ip_allocated);
prt_printf(buf, " old ");
bch2_bkey_val_to_text(buf, trans->c, old);
@@ -1464,35 +1499,27 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
prt_newline(buf);
}
- for (struct jset_entry *e = trans->journal_entries;
+ for (struct jset_entry *e = btree_trans_journal_entries_start(trans);
e != btree_trans_journal_entries_top(trans);
- e = vstruct_next(e))
+ e = vstruct_next(e)) {
bch2_journal_entry_to_text(buf, trans->c, e);
+ prt_newline(buf);
+ }
printbuf_indent_sub(buf, 2);
}
-noinline __cold
-void bch2_dump_trans_updates(struct btree_trans *trans)
-{
- struct printbuf buf = PRINTBUF;
-
- bch2_trans_updates_to_text(&buf, trans);
- bch2_print_str(trans->c, buf.buf);
- printbuf_exit(&buf);
-}
-
static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
{
struct btree_path *path = trans->paths + path_idx;
- prt_printf(out, "path: idx %3u ref %u:%u %c %c %c btree=%s l=%u pos ",
+ prt_printf(out, "path: idx %3u ref %u:%u %c %c %c ",
path_idx, path->ref, path->intent_ref,
path->preserve ? 'P' : ' ',
path->should_be_locked ? 'S' : ' ',
- path->cached ? 'C' : 'B',
- bch2_btree_id_str(path->btree_id),
- path->level);
+ path->cached ? 'C' : 'B');
+ bch2_btree_id_level_to_text(out, path->btree_id, path->level);
+ prt_str(out, " pos ");
bch2_bpos_to_text(out, path->pos);
if (!path->cached && btree_node_locked(path, path->level)) {
@@ -1578,7 +1605,7 @@ void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
__bch2_trans_paths_to_text(&buf, trans, nosort);
bch2_trans_updates_to_text(&buf, trans);
- bch2_print_str(trans->c, buf.buf);
+ bch2_print_str(trans->c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
}
@@ -1717,12 +1744,15 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
struct trans_for_each_path_inorder_iter iter;
btree_path_idx_t path_pos = 0, path_idx;
- bch2_trans_verify_not_unlocked(trans);
- bch2_trans_verify_not_in_restart(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
bch2_trans_verify_locks(trans);
btree_trans_sort_paths(trans);
+ if (intent)
+ locks_want = max(locks_want, level + 1);
+ locks_want = min(locks_want, BTREE_MAX_DEPTH);
+
trans_for_each_path_inorder(trans, path, iter) {
if (__btree_path_cmp(path,
btree_id,
@@ -1737,7 +1767,8 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
if (path_pos &&
trans->paths[path_pos].cached == cached &&
trans->paths[path_pos].btree_id == btree_id &&
- trans->paths[path_pos].level == level) {
+ trans->paths[path_pos].level == level &&
+ bch2_btree_path_upgrade_norestart(trans, trans->paths + path_pos, locks_want)) {
trace_btree_path_get(trans, trans->paths + path_pos, &pos);
__btree_path_get(trans, trans->paths + path_pos, intent);
@@ -1769,9 +1800,6 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
if (!(flags & BTREE_ITER_nopreserve))
path->preserve = true;
- if (path->intent_ref)
- locks_want = max(locks_want, level + 1);
-
/*
* If the path has locks_want greater than requested, we don't downgrade
* it here - on transaction restart because btree node split needs to
@@ -1780,10 +1808,6 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
* a successful transaction commit.
*/
- locks_want = min(locks_want, BTREE_MAX_DEPTH);
- if (locks_want > path->locks_want)
- bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
-
return path_idx;
}
@@ -1833,7 +1857,7 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *
!bkey_eq(path->pos, ck->key.pos));
*u = ck->k->k;
- k = bkey_i_to_s_c(ck->k);
+ k = (struct bkey_s_c) { u, &ck->k->v };
}
return k;
@@ -1843,11 +1867,8 @@ hole:
return (struct bkey_s_c) { u, NULL };
}
-
-void bch2_set_btree_iter_dontneed(struct btree_iter *iter)
+void bch2_set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
-
if (!iter->path || trans->restarted)
return;
@@ -1859,25 +1880,22 @@ void bch2_set_btree_iter_dontneed(struct btree_iter *iter)
/* Btree iterators: */
int __must_check
-__bch2_btree_iter_traverse(struct btree_iter *iter)
+__bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
{
- return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
+ return bch2_btree_path_traverse(trans, iter->path, iter->flags);
}
int __must_check
-bch2_btree_iter_traverse(struct btree_iter *iter)
+bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
- int ret;
-
- bch2_trans_verify_not_unlocked(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
iter->path = bch2_btree_path_set_pos(trans, iter->path,
btree_iter_search_key(iter),
iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
- ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
+ int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
return ret;
@@ -1889,14 +1907,14 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
/* Iterate across nodes (leaf and interior nodes) */
-struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
+struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans,
+ struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
struct btree *b = NULL;
int ret;
EBUG_ON(trans->paths[iter->path].cached);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
@@ -1918,7 +1936,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
out:
bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
return b;
err:
@@ -1927,26 +1945,26 @@ err:
}
/* Only kept for -tools */
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *trans,
+ struct btree_iter *iter)
{
struct btree *b;
- while (b = bch2_btree_iter_peek_node(iter),
+ while (b = bch2_btree_iter_peek_node(trans, iter),
bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
- bch2_trans_begin(iter->trans);
+ bch2_trans_begin(trans);
return b;
}
-struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
+struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
struct btree *b = NULL;
int ret;
EBUG_ON(trans->paths[iter->path].cached);
- bch2_trans_verify_not_in_restart(trans);
- bch2_btree_iter_verify(iter);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+ bch2_btree_iter_verify(trans, iter);
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
@@ -1961,17 +1979,24 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
/* got to end? */
if (!btree_path_node(path, path->level + 1)) {
+ path->should_be_locked = false;
btree_path_set_level_up(trans, path);
return NULL;
}
+ /*
+ * We don't correctly handle nodes with extra intent locks here:
+ * downgrade so we don't violate locking invariants
+ */
+ bch2_btree_path_downgrade(trans, path);
+
if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
+ trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
__bch2_btree_path_unlock(trans, path);
path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
- trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
goto err;
}
@@ -2013,7 +2038,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
EBUG_ON(btree_iter_path(trans, iter)->uptodate);
out:
bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
return b;
err:
@@ -2023,7 +2048,7 @@ err:
/* Iterate across keys (in leaf nodes only) */
-inline bool bch2_btree_iter_advance(struct btree_iter *iter)
+inline bool bch2_btree_iter_advance(struct btree_trans *trans, struct btree_iter *iter)
{
struct bpos pos = iter->k.p;
bool ret = !(iter->flags & BTREE_ITER_all_snapshots
@@ -2032,11 +2057,11 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter)
if (ret && !(iter->flags & BTREE_ITER_is_extents))
pos = bkey_successor(iter, pos);
- bch2_btree_iter_set_pos(iter, pos);
+ bch2_btree_iter_set_pos(trans, iter, pos);
return ret;
}
-inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
+inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter *iter)
{
struct bpos pos = bkey_start_pos(&iter->k);
bool ret = !(iter->flags & BTREE_ITER_all_snapshots
@@ -2045,20 +2070,20 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
if (ret && !(iter->flags & BTREE_ITER_is_extents))
pos = bkey_predecessor(iter, pos);
- bch2_btree_iter_set_pos(iter, pos);
+ bch2_btree_iter_set_pos(trans, iter, pos);
return ret;
}
static noinline
void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c *k)
+ struct bpos search_key, struct bkey_s_c *k)
{
struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key;
trans_for_each_update(trans, i)
if (!i->key_cache_already_flushed &&
i->btree_id == iter->btree_id &&
- bpos_le(i->k->k.p, iter->pos) &&
+ bpos_le(i->k->k.p, search_key) &&
bpos_ge(i->k->k.p, k->k ? k->k->p : end)) {
iter->k = i->k->k;
*k = bkey_i_to_s_c(i->k);
@@ -2067,6 +2092,7 @@ void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_
static noinline
void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos search_key,
struct bkey_s_c *k)
{
struct btree_path *path = btree_iter_path(trans, iter);
@@ -2075,7 +2101,7 @@ void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter
trans_for_each_update(trans, i)
if (!i->key_cache_already_flushed &&
i->btree_id == iter->btree_id &&
- bpos_ge(i->k->k.p, path->pos) &&
+ bpos_ge(i->k->k.p, search_key) &&
bpos_le(i->k->k.p, k->k ? k->k->p : end)) {
iter->k = i->k->k;
*k = bkey_i_to_s_c(i->k);
@@ -2097,13 +2123,14 @@ void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_
static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
struct btree_iter *iter,
+ struct bpos search_pos,
struct bpos end_pos)
{
struct btree_path *path = btree_iter_path(trans, iter);
- return bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
+ return bch2_journal_keys_peek_max(trans->c, iter->btree_id,
path->level,
- path->pos,
+ search_pos,
end_pos,
&iter->journal_idx);
}
@@ -2113,7 +2140,7 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
struct btree_iter *iter)
{
struct btree_path *path = btree_iter_path(trans, iter);
- struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos);
+ struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos);
if (k) {
iter->k = k->k;
@@ -2124,21 +2151,50 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
}
static noinline
-struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
+void btree_trans_peek_journal(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos search_key,
+ struct bkey_s_c *k)
{
struct btree_path *path = btree_iter_path(trans, iter);
struct bkey_i *next_journal =
- bch2_btree_journal_peek(trans, iter,
- k.k ? k.k->p : path_l(path)->b->key.k.p);
-
+ bch2_btree_journal_peek(trans, iter, search_key,
+ k->k ? k->k->p : path_l(path)->b->key.k.p);
if (next_journal) {
iter->k = next_journal->k;
- k = bkey_i_to_s_c(next_journal);
+ *k = bkey_i_to_s_c(next_journal);
}
+}
- return k;
+static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos search_key,
+ struct bpos end_pos)
+{
+ struct btree_path *path = btree_iter_path(trans, iter);
+
+ return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id,
+ path->level,
+ search_key,
+ end_pos,
+ &iter->journal_idx);
+}
+
+static noinline
+void btree_trans_peek_prev_journal(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos search_key,
+ struct bkey_s_c *k)
+{
+ struct btree_path *path = btree_iter_path(trans, iter);
+ struct bkey_i *next_journal =
+ bch2_btree_journal_peek_prev(trans, iter, search_key,
+ k->k ? k->k->p : path_l(path)->b->data->min_key);
+
+ if (next_journal) {
+ iter->k = next_journal->k;
+ *k = bkey_i_to_s_c(next_journal);
+ }
}
/*
@@ -2146,16 +2202,15 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
* bkey_s_c_null:
*/
static noinline
-struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
+struct bkey_s_c btree_trans_peek_key_cache(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos pos)
{
- struct btree_trans *trans = iter->trans;
struct bch_fs *c = trans->c;
struct bkey u;
struct bkey_s_c k;
int ret;
- bch2_trans_verify_not_in_restart(trans);
- bch2_trans_verify_not_unlocked(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
if ((iter->flags & BTREE_ITER_key_cache_fill) &&
bpos_eq(iter->pos, pos))
@@ -2181,28 +2236,30 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
if (unlikely(ret))
return bkey_s_c_err(ret);
- btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
-
k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
- if (k.k && !bkey_err(k)) {
- iter->k = u;
- k.k = &iter->k;
- }
+ if (!k.k)
+ return k;
+
+ if ((iter->flags & BTREE_ITER_all_snapshots) &&
+ !bpos_eq(pos, k.k->p))
+ return bkey_s_c_null;
+
+ iter->k = u;
+ k.k = &iter->k;
+ btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
return k;
}
-static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
+static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos search_key)
{
- struct btree_trans *trans = iter->trans;
struct bkey_s_c k, k2;
int ret;
EBUG_ON(btree_iter_path(trans, iter)->cached);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
while (1) {
- struct btree_path_level *l;
-
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
@@ -2210,19 +2267,19 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (unlikely(ret)) {
/* ensure that iter->k is consistent with iter->pos: */
- bch2_btree_iter_set_pos(iter, iter->pos);
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
k = bkey_s_c_err(ret);
- goto out;
+ break;
}
struct btree_path *path = btree_iter_path(trans, iter);
- l = path_l(path);
+ struct btree_path_level *l = path_l(path);
if (unlikely(!l->b)) {
/* No btree nodes at requested level: */
- bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
k = bkey_s_c_null;
- goto out;
+ break;
}
btree_path_set_should_be_locked(trans, path);
@@ -2231,21 +2288,20 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
k.k &&
- (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
+ (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
k = k2;
- ret = bkey_err(k);
- if (ret) {
- bch2_btree_iter_set_pos(iter, iter->pos);
- goto out;
+ if (bkey_err(k)) {
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
+ break;
}
}
if (unlikely(iter->flags & BTREE_ITER_with_journal))
- k = btree_trans_peek_journal(trans, iter, k);
+ btree_trans_peek_journal(trans, iter, search_key, &k);
if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
trans->nr_updates))
- bch2_btree_trans_peek_updates(trans, iter, &k);
+ bch2_btree_trans_peek_updates(trans, iter, search_key, &k);
if (k.k && bkey_deleted(k.k)) {
/*
@@ -2268,120 +2324,138 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
search_key = bpos_successor(l->b->key.k.p);
} else {
/* End of btree: */
- bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
k = bkey_s_c_null;
- goto out;
+ break;
}
}
-out:
- bch2_btree_iter_verify(iter);
+
+ bch2_btree_iter_verify(trans, iter);
+
+ if (trace___btree_iter_peek_enabled()) {
+ CLASS(printbuf, buf)();
+
+ int ret = bkey_err(k);
+ if (ret)
+ prt_str(&buf, bch2_err_str(ret));
+ else if (k.k)
+ bch2_bkey_val_to_text(&buf, trans->c, k);
+ else
+ prt_str(&buf, "(null)");
+ trace___btree_iter_peek(trans->c, buf.buf);
+ }
return k;
}
/**
- * bch2_btree_iter_peek_upto() - returns first key greater than or equal to
+ * bch2_btree_iter_peek_max() - returns first key greater than or equal to
* iterator's current position
+ * @trans: btree transaction object
* @iter: iterator to peek from
* @end: search limit: returns keys less than or equal to @end
*
* Returns: key if found, or an error extractable with bkey_err().
*/
-struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
+struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos end)
{
- struct btree_trans *trans = iter->trans;
struct bpos search_key = btree_iter_search_key(iter);
struct bkey_s_c k;
- struct bpos iter_pos;
+ struct bpos iter_pos = iter->pos;
int ret;
- bch2_trans_verify_not_unlocked(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+ bch2_btree_iter_verify_entry_exit(iter);
EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
+ ret = trans_maybe_inject_restart(trans, _RET_IP_);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
+
if (iter->update_path) {
- bch2_path_put_nokeep(trans, iter->update_path,
- iter->flags & BTREE_ITER_intent);
+ bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent);
iter->update_path = 0;
}
- bch2_btree_iter_verify_entry_exit(iter);
-
while (1) {
- k = __bch2_btree_iter_peek(iter, search_key);
+ k = __bch2_btree_iter_peek(trans, iter, search_key);
if (unlikely(!k.k))
goto end;
if (unlikely(bkey_err(k)))
goto out_no_locked;
- /*
- * We need to check against @end before FILTER_SNAPSHOTS because
- * if we get to a different inode that requested we might be
- * seeing keys for a different snapshot tree that will all be
- * filtered out.
- *
- * But we can't do the full check here, because bkey_start_pos()
- * isn't monotonically increasing before FILTER_SNAPSHOTS, and
- * that's what we check against in extents mode:
- */
- if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
- ? bkey_gt(k.k->p, end)
- : k.k->p.inode > end.inode))
- goto end;
+ if (iter->flags & BTREE_ITER_filter_snapshots) {
+ /*
+ * We need to check against @end before FILTER_SNAPSHOTS because
+ * if we get to a different inode that requested we might be
+ * seeing keys for a different snapshot tree that will all be
+ * filtered out.
+ *
+ * But we can't do the full check here, because bkey_start_pos()
+ * isn't monotonically increasing before FILTER_SNAPSHOTS, and
+ * that's what we check against in extents mode:
+ */
+ if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
+ ? bkey_gt(k.k->p, end)
+ : k.k->p.inode > end.inode))
+ goto end;
+
+ if (iter->update_path &&
+ !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
+ bch2_path_put(trans, iter->update_path,
+ iter->flags & BTREE_ITER_intent);
+ iter->update_path = 0;
+ }
- if (iter->update_path &&
- !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
- bch2_path_put_nokeep(trans, iter->update_path,
- iter->flags & BTREE_ITER_intent);
- iter->update_path = 0;
- }
+ if ((iter->flags & BTREE_ITER_intent) &&
+ !(iter->flags & BTREE_ITER_is_extents) &&
+ !iter->update_path) {
+ struct bpos pos = k.k->p;
- if ((iter->flags & BTREE_ITER_filter_snapshots) &&
- (iter->flags & BTREE_ITER_intent) &&
- !(iter->flags & BTREE_ITER_is_extents) &&
- !iter->update_path) {
- struct bpos pos = k.k->p;
+ if (pos.snapshot < iter->snapshot) {
+ search_key = bpos_successor(k.k->p);
+ continue;
+ }
- if (pos.snapshot < iter->snapshot) {
- search_key = bpos_successor(k.k->p);
- continue;
- }
+ pos.snapshot = iter->snapshot;
- pos.snapshot = iter->snapshot;
+ /*
+ * advance, same as on exit for iter->path, but only up
+ * to snapshot
+ */
+ __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
+ iter->update_path = iter->path;
+
+ iter->update_path = bch2_btree_path_set_pos(trans,
+ iter->update_path, pos,
+ iter->flags & BTREE_ITER_intent,
+ _THIS_IP_);
+ ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
+ }
/*
- * advance, same as on exit for iter->path, but only up
- * to snapshot
+ * We can never have a key in a leaf node at POS_MAX, so
+ * we don't have to check these successor() calls:
*/
- __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
- iter->update_path = iter->path;
-
- iter->update_path = bch2_btree_path_set_pos(trans,
- iter->update_path, pos,
- iter->flags & BTREE_ITER_intent,
- _THIS_IP_);
- ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
- if (unlikely(ret)) {
- k = bkey_s_c_err(ret);
- goto out_no_locked;
+ if (!bch2_snapshot_is_ancestor(trans->c,
+ iter->snapshot,
+ k.k->p.snapshot)) {
+ search_key = bpos_successor(k.k->p);
+ continue;
}
- }
-
- /*
- * We can never have a key in a leaf node at POS_MAX, so
- * we don't have to check these successor() calls:
- */
- if ((iter->flags & BTREE_ITER_filter_snapshots) &&
- !bch2_snapshot_is_ancestor(trans->c,
- iter->snapshot,
- k.k->p.snapshot)) {
- search_key = bpos_successor(k.k->p);
- continue;
- }
- if (bkey_whiteout(k.k) &&
- !(iter->flags & BTREE_ITER_all_snapshots)) {
- search_key = bkey_successor(iter, k.k->p);
- continue;
+ if (bkey_whiteout(k.k) &&
+ !(iter->flags & BTREE_ITER_key_cache_fill)) {
+ search_key = bkey_successor(iter, k.k->p);
+ continue;
+ }
}
/*
@@ -2421,17 +2495,30 @@ out_no_locked:
if (!(iter->flags & BTREE_ITER_all_snapshots))
iter->pos.snapshot = iter->snapshot;
- ret = bch2_btree_iter_verify_ret(iter, k);
+ ret = bch2_btree_iter_verify_ret(trans, iter, k);
if (unlikely(ret)) {
- bch2_btree_iter_set_pos(iter, iter->pos);
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
k = bkey_s_c_err(ret);
}
bch2_btree_iter_verify_entry_exit(iter);
+ if (trace_btree_iter_peek_max_enabled()) {
+ CLASS(printbuf, buf)();
+
+ int ret = bkey_err(k);
+ if (ret)
+ prt_str(&buf, bch2_err_str(ret));
+ else if (k.k)
+ bch2_bkey_val_to_text(&buf, trans->c, k);
+ else
+ prt_str(&buf, "(null)");
+ trace_btree_iter_peek_max(trans->c, buf.buf);
+ }
+
return k;
end:
- bch2_btree_iter_set_pos(iter, end);
+ bch2_btree_iter_set_pos(trans, iter, end);
k = bkey_s_c_null;
goto out_no_locked;
}
@@ -2439,186 +2526,298 @@ end:
/**
* bch2_btree_iter_next() - returns first key greater than iterator's current
* position
+ * @trans: btree transaction object
* @iter: iterator to peek from
*
* Returns: key if found, or an error extractable with bkey_err().
*/
-struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_next(struct btree_trans *trans, struct btree_iter *iter)
{
- if (!bch2_btree_iter_advance(iter))
+ if (!bch2_btree_iter_advance(trans, iter))
return bkey_s_c_null;
- return bch2_btree_iter_peek(iter);
+ return bch2_btree_iter_peek(trans, iter);
+}
+
+static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos search_key)
+{
+ struct bkey_s_c k, k2;
+
+ bch2_btree_iter_verify(trans, iter);
+
+ while (1) {
+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
+ iter->flags & BTREE_ITER_intent,
+ btree_iter_ip_allocated(iter));
+
+ int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+ if (unlikely(ret)) {
+ /* ensure that iter->k is consistent with iter->pos: */
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
+ k = bkey_s_c_err(ret);
+ break;
+ }
+
+ struct btree_path *path = btree_iter_path(trans, iter);
+ struct btree_path_level *l = path_l(path);
+
+ if (unlikely(!l->b)) {
+ /* No btree nodes at requested level: */
+ bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
+ k = bkey_s_c_null;
+ break;
+ }
+
+ btree_path_set_should_be_locked(trans, path);
+
+ k = btree_path_level_peek_all(trans->c, l, &iter->k);
+ if (!k.k || bpos_gt(k.k->p, search_key)) {
+ k = btree_path_level_prev(trans, path, l, &iter->k);
+
+ BUG_ON(k.k && bpos_gt(k.k->p, search_key));
+ }
+
+ if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
+ k.k &&
+ (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
+ k = k2;
+ if (bkey_err(k2)) {
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
+ break;
+ }
+ }
+
+ if (unlikely(iter->flags & BTREE_ITER_with_journal))
+ btree_trans_peek_prev_journal(trans, iter, search_key, &k);
+
+ if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
+ trans->nr_updates))
+ bch2_btree_trans_peek_prev_updates(trans, iter, search_key, &k);
+
+ if (likely(k.k && !bkey_deleted(k.k))) {
+ break;
+ } else if (k.k) {
+ search_key = bpos_predecessor(k.k->p);
+ } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
+ /* Advance to previous leaf node: */
+ search_key = bpos_predecessor(path->l[0].b->data->min_key);
+ } else {
+ /* Start of btree: */
+ bch2_btree_iter_set_pos(trans, iter, POS_MIN);
+ k = bkey_s_c_null;
+ break;
+ }
+ }
+
+ bch2_btree_iter_verify(trans, iter);
+ return k;
}
/**
- * bch2_btree_iter_peek_prev() - returns first key less than or equal to
+ * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to
* iterator's current position
+ * @trans: btree transaction object
* @iter: iterator to peek from
+ * @end: search limit: returns keys greater than or equal to @end
*
* Returns: key if found, or an error extractable with bkey_err().
*/
-struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos end)
{
- struct btree_trans *trans = iter->trans;
+ if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) &&
+ !bkey_eq(iter->pos, POS_MAX) &&
+ !((iter->flags & BTREE_ITER_is_extents) &&
+ iter->pos.offset == U64_MAX)) {
+
+ /*
+ * bkey_start_pos(), for extents, is not monotonically
+ * increasing until after filtering for snapshots:
+ *
+ * Thus, for extents we need to search forward until we find a
+ * real visible extents - easiest to just use peek_slot() (which
+ * internally uses peek() for extents)
+ */
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
+ if (bkey_err(k))
+ return k;
+
+ if (!bkey_deleted(k.k) &&
+ (!(iter->flags & BTREE_ITER_is_extents) ||
+ bkey_lt(bkey_start_pos(k.k), iter->pos)))
+ return k;
+ }
+
struct bpos search_key = iter->pos;
struct bkey_s_c k;
- struct bkey saved_k;
- const struct bch_val *saved_v;
btree_path_idx_t saved_path = 0;
- int ret;
-
- bch2_trans_verify_not_unlocked(trans);
- EBUG_ON(btree_iter_path(trans, iter)->cached ||
- btree_iter_path(trans, iter)->level);
- if (iter->flags & BTREE_ITER_with_journal)
- return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported);
-
- bch2_btree_iter_verify(iter);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
bch2_btree_iter_verify_entry_exit(iter);
+ EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && iter->pos.inode != end.inode);
- if (iter->flags & BTREE_ITER_filter_snapshots)
- search_key.snapshot = U32_MAX;
+ int ret = trans_maybe_inject_restart(trans, _RET_IP_);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
while (1) {
- iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
- iter->flags & BTREE_ITER_intent,
- btree_iter_ip_allocated(iter));
-
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (unlikely(ret)) {
- /* ensure that iter->k is consistent with iter->pos: */
- bch2_btree_iter_set_pos(iter, iter->pos);
- k = bkey_s_c_err(ret);
+ k = __bch2_btree_iter_peek_prev(trans, iter, search_key);
+ if (unlikely(!k.k))
+ goto end;
+ if (unlikely(bkey_err(k)))
goto out_no_locked;
- }
-
- struct btree_path *path = btree_iter_path(trans, iter);
- k = btree_path_level_peek(trans, path, &path->l[0], &iter->k);
- if (!k.k ||
- ((iter->flags & BTREE_ITER_is_extents)
- ? bpos_ge(bkey_start_pos(k.k), search_key)
- : bpos_gt(k.k->p, search_key)))
- k = btree_path_level_prev(trans, path, &path->l[0], &iter->k);
+ if (iter->flags & BTREE_ITER_filter_snapshots) {
+ struct btree_path *s = saved_path ? trans->paths + saved_path : NULL;
+ if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) {
+ /*
+ * If we have a saved candidate, and we're past
+ * the last possible snapshot overwrite, return
+ * it:
+ */
+ bch2_path_put(trans, iter->path,
+ iter->flags & BTREE_ITER_intent);
+ iter->path = saved_path;
+ saved_path = 0;
+ k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
+ break;
+ }
- if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
- trans->nr_updates))
- bch2_btree_trans_peek_prev_updates(trans, iter, &k);
+ /*
+ * We need to check against @end before FILTER_SNAPSHOTS because
+ * if we get to a different inode that requested we might be
+ * seeing keys for a different snapshot tree that will all be
+ * filtered out.
+ */
+ if (unlikely(bkey_lt(k.k->p, end)))
+ goto end;
- if (likely(k.k)) {
- if (iter->flags & BTREE_ITER_filter_snapshots) {
- if (k.k->p.snapshot == iter->snapshot)
- goto got_key;
+ if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) {
+ search_key = bpos_predecessor(k.k->p);
+ continue;
+ }
+ if (k.k->p.snapshot != iter->snapshot) {
/*
- * If we have a saved candidate, and we're no
- * longer at the same _key_ (not pos), return
- * that candidate
+ * Have a key visible in iter->snapshot, but
+ * might have overwrites: - save it and keep
+ * searching. Unless it's a whiteout - then drop
+ * our previous saved candidate:
*/
- if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
- bch2_path_put_nokeep(trans, iter->path,
+ if (saved_path) {
+ bch2_path_put(trans, saved_path,
iter->flags & BTREE_ITER_intent);
- iter->path = saved_path;
saved_path = 0;
- iter->k = saved_k;
- k.v = saved_v;
- goto got_key;
}
- if (bch2_snapshot_is_ancestor(trans->c,
- iter->snapshot,
- k.k->p.snapshot)) {
- if (saved_path)
- bch2_path_put_nokeep(trans, saved_path,
- iter->flags & BTREE_ITER_intent);
+ if (!bkey_whiteout(k.k)) {
saved_path = btree_path_clone(trans, iter->path,
iter->flags & BTREE_ITER_intent,
_THIS_IP_);
- path = btree_iter_path(trans, iter);
- trace_btree_path_save_pos(trans, path, trans->paths + saved_path);
- saved_k = *k.k;
- saved_v = k.v;
+ trace_btree_path_save_pos(trans,
+ trans->paths + iter->path,
+ trans->paths + saved_path);
}
search_key = bpos_predecessor(k.k->p);
continue;
}
-got_key:
- if (bkey_whiteout(k.k) &&
- !(iter->flags & BTREE_ITER_all_snapshots)) {
+
+ if (bkey_whiteout(k.k)) {
search_key = bkey_predecessor(iter, k.k->p);
- if (iter->flags & BTREE_ITER_filter_snapshots)
- search_key.snapshot = U32_MAX;
+ search_key.snapshot = U32_MAX;
continue;
}
-
- btree_path_set_should_be_locked(trans, path);
- break;
- } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
- /* Advance to previous leaf node: */
- search_key = bpos_predecessor(path->l[0].b->data->min_key);
- } else {
- /* Start of btree: */
- bch2_btree_iter_set_pos(iter, POS_MIN);
- k = bkey_s_c_null;
- goto out_no_locked;
}
- }
- EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos));
+ EBUG_ON(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(k.k->p, iter->pos) :
+ iter->flags & BTREE_ITER_is_extents ? bkey_ge(bkey_start_pos(k.k), iter->pos) :
+ bkey_gt(k.k->p, iter->pos));
+
+ if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_lt(k.k->p, end) :
+ iter->flags & BTREE_ITER_is_extents ? bkey_le(k.k->p, end) :
+ bkey_lt(k.k->p, end)))
+ goto end;
+
+ break;
+ }
/* Extents can straddle iter->pos: */
- if (bkey_lt(k.k->p, iter->pos))
- iter->pos = k.k->p;
+ iter->pos = bpos_min(iter->pos, k.k->p);;
if (iter->flags & BTREE_ITER_filter_snapshots)
iter->pos.snapshot = iter->snapshot;
out_no_locked:
if (saved_path)
- bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent);
+ bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_intent);
bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
+
+ if (trace_btree_iter_peek_prev_min_enabled()) {
+ CLASS(printbuf, buf)();
+ int ret = bkey_err(k);
+ if (ret)
+ prt_str(&buf, bch2_err_str(ret));
+ else if (k.k)
+ bch2_bkey_val_to_text(&buf, trans->c, k);
+ else
+ prt_str(&buf, "(null)");
+ trace_btree_iter_peek_prev_min(trans->c, buf.buf);
+ }
return k;
+end:
+ bch2_btree_iter_set_pos(trans, iter, end);
+ k = bkey_s_c_null;
+ goto out_no_locked;
}
/**
* bch2_btree_iter_prev() - returns first key less than iterator's current
* position
+ * @trans: btree transaction object
* @iter: iterator to peek from
*
* Returns: key if found, or an error extractable with bkey_err().
*/
-struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *trans, struct btree_iter *iter)
{
- if (!bch2_btree_iter_rewind(iter))
+ if (!bch2_btree_iter_rewind(trans, iter))
return bkey_s_c_null;
- return bch2_btree_iter_peek_prev(iter);
+ return bch2_btree_iter_peek_prev(trans, iter);
}
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
struct bpos search_key;
struct bkey_s_c k;
int ret;
- bch2_trans_verify_not_unlocked(trans);
- bch2_btree_iter_verify(iter);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+ bch2_btree_iter_verify(trans, iter);
bch2_btree_iter_verify_entry_exit(iter);
EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
+ ret = trans_maybe_inject_restart(trans, _RET_IP_);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out;
+ }
+
/* extents can't span inode numbers: */
if ((iter->flags & BTREE_ITER_is_extents) &&
unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
- if (iter->pos.inode == KEY_INODE_MAX)
- return bkey_s_c_null;
+ if (iter->pos.inode == KEY_INODE_MAX) {
+ k = bkey_s_c_null;
+ goto out2;
+ }
- bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
+ bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos));
}
search_key = btree_iter_search_key(iter);
@@ -2629,9 +2828,17 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (unlikely(ret)) {
k = bkey_s_c_err(ret);
- goto out_no_locked;
+ goto out;
+ }
+
+ struct btree_path *path = btree_iter_path(trans, iter);
+ if (unlikely(!btree_path_node(path, path->level))) {
+ k = bkey_s_c_null;
+ goto out2;
}
+ btree_path_set_should_be_locked(trans, path);
+
if ((iter->flags & BTREE_ITER_cached) ||
!(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
k = bkey_s_c_null;
@@ -2648,16 +2855,21 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
goto out;
if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
- (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
+ (k = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) {
if (!bkey_err(k))
iter->k = *k.k;
/* We're not returning a key from iter->path: */
- goto out_no_locked;
+ goto out;
}
- k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k);
+ k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
if (unlikely(!k.k))
- goto out_no_locked;
+ goto out;
+
+ if (unlikely(k.k->type == KEY_TYPE_whiteout &&
+ (iter->flags & BTREE_ITER_filter_snapshots) &&
+ !(iter->flags & BTREE_ITER_key_cache_fill)))
+ iter->k.type = KEY_TYPE_deleted;
} else {
struct bpos next;
struct bpos end = iter->pos;
@@ -2670,8 +2882,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
if (iter->flags & BTREE_ITER_intent) {
struct btree_iter iter2;
- bch2_trans_copy_iter(&iter2, iter);
- k = bch2_btree_iter_peek_upto(&iter2, end);
+ bch2_trans_copy_iter(trans, &iter2, iter);
+ k = bch2_btree_iter_peek_max(trans, &iter2, end);
if (k.k && !bkey_err(k)) {
swap(iter->key_cache_path, iter2.key_cache_path);
@@ -2682,15 +2894,15 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
} else {
struct bpos pos = iter->pos;
- k = bch2_btree_iter_peek_upto(iter, end);
+ k = bch2_btree_iter_peek_max(trans, iter, end);
if (unlikely(bkey_err(k)))
- bch2_btree_iter_set_pos(iter, pos);
+ bch2_btree_iter_set_pos(trans, iter, pos);
else
iter->pos = pos;
}
if (unlikely(bkey_err(k)))
- goto out_no_locked;
+ goto out;
next = k.k ? bkey_start_pos(k.k) : POS_MAX;
@@ -2712,42 +2924,53 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
}
}
out:
- btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
-out_no_locked:
bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
- ret = bch2_btree_iter_verify_ret(iter, k);
+ bch2_btree_iter_verify(trans, iter);
+ ret = bch2_btree_iter_verify_ret(trans, iter, k);
if (unlikely(ret))
- return bkey_s_c_err(ret);
+ k = bkey_s_c_err(ret);
+out2:
+ if (trace_btree_iter_peek_slot_enabled()) {
+ CLASS(printbuf, buf)();
+
+ int ret = bkey_err(k);
+ if (ret)
+ prt_str(&buf, bch2_err_str(ret));
+ else if (k.k)
+ bch2_bkey_val_to_text(&buf, trans->c, k);
+ else
+ prt_str(&buf, "(null)");
+ trace_btree_iter_peek_slot(trans->c, buf.buf);
+ }
return k;
}
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *trans, struct btree_iter *iter)
{
- if (!bch2_btree_iter_advance(iter))
+ if (!bch2_btree_iter_advance(trans, iter))
return bkey_s_c_null;
- return bch2_btree_iter_peek_slot(iter);
+ return bch2_btree_iter_peek_slot(trans, iter);
}
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *trans, struct btree_iter *iter)
{
- if (!bch2_btree_iter_rewind(iter))
+ if (!bch2_btree_iter_rewind(trans, iter))
return bkey_s_c_null;
- return bch2_btree_iter_peek_slot(iter);
+ return bch2_btree_iter_peek_slot(trans, iter);
}
/* Obsolete, but still used by rust wrapper in -tools */
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *trans, struct btree_iter *iter)
{
struct bkey_s_c k;
- while (btree_trans_too_many_iters(iter->trans) ||
- (k = bch2_btree_iter_peek_type(iter, iter->flags),
+ while (btree_trans_too_many_iters(trans) ||
+ (k = bch2_btree_iter_peek_type(trans, iter, iter->flags),
bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
- bch2_trans_begin(iter->trans);
+ bch2_trans_begin(trans);
return k;
}
@@ -2780,7 +3003,7 @@ static void btree_trans_verify_sorted(struct btree_trans *trans)
struct btree_path *path, *prev = NULL;
struct trans_for_each_path_inorder_iter iter;
- if (!bch2_debug_check_iterators)
+ if (!static_branch_unlikely(&bch2_debug_check_iterators))
return;
trans_for_each_path_inorder(trans, path, iter) {
@@ -2882,7 +3105,7 @@ static inline void btree_path_list_add(struct btree_trans *trans,
void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
{
if (iter->update_path)
- bch2_path_put_nokeep(trans, iter->update_path,
+ bch2_path_put(trans, iter->update_path,
iter->flags & BTREE_ITER_intent);
if (iter->path)
bch2_path_put(trans, iter->path,
@@ -2893,7 +3116,6 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
iter->path = 0;
iter->update_path = 0;
iter->key_cache_path = 0;
- iter->trans = NULL;
}
void bch2_trans_iter_init_outlined(struct btree_trans *trans,
@@ -2902,7 +3124,7 @@ void bch2_trans_iter_init_outlined(struct btree_trans *trans,
unsigned flags)
{
bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
- bch2_btree_iter_flags(trans, btree_id, flags),
+ bch2_btree_iter_flags(trans, btree_id, 0, flags),
_RET_IP_);
}
@@ -2918,8 +3140,11 @@ void bch2_trans_node_iter_init(struct btree_trans *trans,
flags |= BTREE_ITER_snapshot_field;
flags |= BTREE_ITER_all_snapshots;
+ if (!depth && btree_id_cached(trans->c, btree_id))
+ flags |= BTREE_ITER_with_key_cache;
+
bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
- __bch2_btree_iter_flags(trans, btree_id, flags),
+ bch2_btree_iter_flags(trans, btree_id, depth, flags),
_RET_IP_);
iter->min_depth = depth;
@@ -2930,10 +3155,9 @@ void bch2_trans_node_iter_init(struct btree_trans *trans,
BUG_ON(iter->min_depth != depth);
}
-void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
+void bch2_trans_copy_iter(struct btree_trans *trans,
+ struct btree_iter *dst, struct btree_iter *src)
{
- struct btree_trans *trans = src->trans;
-
*dst = *src;
#ifdef TRACK_PATH_ALLOCATED
dst->ip_allocated = _RET_IP_;
@@ -2945,7 +3169,19 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
dst->key_cache_path = 0;
}
-void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+void bch2_trans_kmalloc_trace_to_text(struct printbuf *out,
+ darray_trans_kmalloc_trace *trace)
+{
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 60);
+
+ darray_for_each(*trace, i)
+ prt_printf(out, "%pS\t%zu\n", (void *) i->ip, i->bytes);
+}
+#endif
+
+void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long ip)
{
struct bch_fs *c = trans->c;
unsigned new_top = trans->mem_top + size;
@@ -2955,51 +3191,66 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
void *new_mem;
void *p;
- WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
+ if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) {
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf, "bump allocator exceeded BTREE_TRANS_MEM_MAX (%u)\n",
+ BTREE_TRANS_MEM_MAX);
- struct btree_transaction_stats *s = btree_trans_stats(trans);
- s->max_mem = max(s->max_mem, new_bytes);
+ bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace);
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+#endif
+ }
- if (trans->used_mempool) {
- if (trans->mem_bytes >= new_bytes)
- goto out_change_top;
+ ret = trans_maybe_inject_restart(trans, _RET_IP_);
+ if (ret)
+ return ERR_PTR(ret);
- /* No more space from mempool item, need malloc new one */
- new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
- if (unlikely(!new_mem)) {
- bch2_trans_unlock(trans);
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
+ if (new_bytes > s->max_mem) {
+ mutex_lock(&s->lock);
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr);
+ s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size,
+ trans->trans_kmalloc_trace.nr);
+
+ memcpy(s->trans_kmalloc_trace.data,
+ trans->trans_kmalloc_trace.data,
+ sizeof(s->trans_kmalloc_trace.data[0]) *
+ s->trans_kmalloc_trace.nr);
+#endif
+ s->max_mem = new_bytes;
+ mutex_unlock(&s->lock);
+ }
- new_mem = kmalloc(new_bytes, GFP_KERNEL);
- if (!new_mem)
- return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+ if (trans->used_mempool || new_bytes > BTREE_TRANS_MEM_MAX) {
+ EBUG_ON(trans->mem_bytes >= new_bytes);
+ return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+ }
- ret = bch2_trans_relock(trans);
- if (ret) {
- kfree(new_mem);
- return ERR_PTR(ret);
- }
- }
- memcpy(new_mem, trans->mem, trans->mem_top);
- trans->used_mempool = false;
- mempool_free(trans->mem, &c->btree_trans_mem_pool);
- goto out_new_mem;
+ if (old_bytes) {
+ trans->realloc_bytes_required = new_bytes;
+ trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
+ return ERR_PTR(btree_trans_restart_ip(trans,
+ BCH_ERR_transaction_restart_mem_realloced, _RET_IP_));
}
- new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
+ EBUG_ON(trans->mem);
+
+ new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
if (unlikely(!new_mem)) {
bch2_trans_unlock(trans);
- new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
+ new_mem = kmalloc(new_bytes, GFP_KERNEL);
if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
new_bytes = BTREE_TRANS_MEM_MAX;
- memcpy(new_mem, trans->mem, trans->mem_top);
trans->used_mempool = true;
- kfree(trans->mem);
}
- if (!new_mem)
- return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+ EBUG_ON(!new_mem);
trans->mem = new_mem;
trans->mem_bytes = new_bytes;
@@ -3008,15 +3259,10 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
if (ret)
return ERR_PTR(ret);
}
-out_new_mem:
+
trans->mem = new_mem;
trans->mem_bytes = new_bytes;
- if (old_bytes) {
- trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
- }
-out_change_top:
p = trans->mem + trans->mem_top;
trans->mem_top += size;
memset(p, 0, size);
@@ -3076,7 +3322,27 @@ u32 bch2_trans_begin(struct btree_trans *trans)
trans->restart_count++;
trans->mem_top = 0;
- trans->journal_entries = NULL;
+
+ if (trans->restarted == BCH_ERR_transaction_restart_mem_realloced) {
+ EBUG_ON(!trans->mem || !trans->mem_bytes);
+ unsigned new_bytes = trans->realloc_bytes_required;
+ void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
+ if (unlikely(!new_mem)) {
+ bch2_trans_unlock(trans);
+ new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
+
+ EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX);
+
+ if (!new_mem) {
+ new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
+ new_bytes = BTREE_TRANS_MEM_MAX;
+ trans->used_mempool = true;
+ kfree(trans->mem);
+ }
+ }
+ trans->mem = new_mem;
+ trans->mem_bytes = new_bytes;
+ }
trans_for_each_path(trans, path, i) {
path->should_be_locked = false;
@@ -3122,14 +3388,26 @@ u32 bch2_trans_begin(struct btree_trans *trans)
trans->last_begin_ip = _RET_IP_;
- trans_set_locked(trans);
+#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
+ if (trans->restarted) {
+ trans->restart_count_this_trans++;
+ } else {
+ trans->restart_count_this_trans = 0;
+ }
+#endif
+
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ trans->trans_kmalloc_trace.nr = 0;
+#endif
+
+ trans_set_locked(trans, false);
if (trans->restarted) {
bch2_btree_path_traverse_all(trans);
trans->notrace_relock_fail = false;
}
- bch2_trans_verify_not_unlocked(trans);
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
return trans->restart_count;
}
@@ -3222,46 +3500,64 @@ got_trans:
}
trans->nr_paths_max = s->nr_max_paths;
- trans->journal_entries_size = s->journal_entries_size;
}
trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
trans->srcu_lock_time = jiffies;
trans->srcu_held = true;
- trans_set_locked(trans);
+ trans_set_locked(trans, false);
closure_init_stack_release(&trans->ref);
return trans;
}
-static void check_btree_paths_leaked(struct btree_trans *trans)
-{
#ifdef CONFIG_BCACHEFS_DEBUG
- struct bch_fs *c = trans->c;
+
+static bool btree_paths_leaked(struct btree_trans *trans)
+{
struct btree_path *path;
unsigned i;
trans_for_each_path(trans, path, i)
if (path->ref)
- goto leaked;
- return;
-leaked:
- bch_err(c, "btree paths leaked from %s!", trans->fn);
- trans_for_each_path(trans, path, i)
- if (path->ref)
- printk(KERN_ERR " btree %s %pS\n",
- bch2_btree_id_str(path->btree_id),
- (void *) path->ip_allocated);
- /* Be noisy about this: */
- bch2_fatal_error(c);
-#endif
+ return true;
+ return false;
}
+static void check_btree_paths_leaked(struct btree_trans *trans)
+{
+ if (btree_paths_leaked(trans)) {
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
+ unsigned i;
+
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "btree paths leaked from %s!\n", trans->fn);
+ trans_for_each_path(trans, path, i)
+ if (path->ref)
+ prt_printf(&buf, "btree %s %pS\n",
+ bch2_btree_id_str(path->btree_id),
+ (void *) path->ip_allocated);
+
+ bch2_fs_emergency_read_only2(c, &buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ }
+}
+#else
+static inline void check_btree_paths_leaked(struct btree_trans *trans) {}
+#endif
+
void bch2_trans_put(struct btree_trans *trans)
__releases(&c->btree_trans_barrier)
{
struct bch_fs *c = trans->c;
+ if (trans->restarted)
+ bch2_trans_in_restart_error(trans);
+
bch2_trans_unlock(trans);
trans_for_each_update(trans, i)
@@ -3285,6 +3581,13 @@ void bch2_trans_put(struct btree_trans *trans)
closure_return_sync(&trans->ref);
trans->locking_wait.task = NULL;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ darray_exit(&trans->last_restarted_trace);
+#endif
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ darray_exit(&trans->trans_kmalloc_trace);
+#endif
+
unsigned long *paths_allocated = trans->paths_allocated;
trans->paths_allocated = NULL;
trans->paths = NULL;
@@ -3330,16 +3633,16 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
struct btree_bkey_cached_common *b)
{
struct six_lock_count c = six_lock_counts(&b->lock);
- struct task_struct *owner;
pid_t pid;
- rcu_read_lock();
- owner = READ_ONCE(b->lock.owner);
- pid = owner ? owner->pid : 0;
- rcu_read_unlock();
+ scoped_guard(rcu) {
+ struct task_struct *owner = READ_ONCE(b->lock.owner);
+ pid = owner ? owner->pid : 0;
+ }
- prt_printf(out, "\t%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
- b->level, bch2_btree_id_str(b->btree_id));
+ prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b');
+ bch2_btree_id_to_text(out, b->btree_id);
+ prt_printf(out, " l=%u:", b->level);
bch2_bpos_to_text(out, btree_node_pos(b));
prt_printf(out, "\t locks %u:%u:%u held by pid %u",
@@ -3364,7 +3667,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn);
/* trans->paths is rcu protected vs. freeing */
- rcu_read_lock();
+ guard(rcu)();
out->atomic++;
struct btree_path *paths = rcu_dereference(trans->paths);
@@ -3378,11 +3681,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
if (!path->nodes_locked)
continue;
- prt_printf(out, " path %u %c l=%u %s:",
- idx,
- path->cached ? 'c' : 'b',
- path->level,
- bch2_btree_id_str(path->btree_id));
+ prt_printf(out, " path %u %c ",
+ idx,
+ path->cached ? 'c' : 'b');
+ bch2_btree_id_to_text(out, path->btree_id);
+ prt_printf(out, " l=%u:", path->level);
bch2_bpos_to_text(out, path->pos);
prt_newline(out);
@@ -3407,7 +3710,6 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
}
out:
--out->atomic;
- rcu_read_unlock();
}
void bch2_fs_btree_iter_exit(struct bch_fs *c)
@@ -3437,6 +3739,9 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
for (s = c->btree_transaction_stats;
s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
s++) {
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ darray_exit(&s->trans_kmalloc_trace);
+#endif
kfree(s->max_paths_text);
bch2_time_stats_exit(&s->lock_hold_times);
}
@@ -3488,7 +3793,7 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
#ifdef CONFIG_LOCKDEP
fs_reclaim_acquire(GFP_KERNEL);
struct btree_trans *trans = bch2_trans_get(c);
- trans_set_locked(trans);
+ trans_set_locked(trans, false);
bch2_trans_put(trans);
fs_reclaim_release(GFP_KERNEL);
#endif