Diffstat (limited to 'fs/bcachefs/btree_iter.c')
-rw-r--r--	fs/bcachefs/btree_iter.c	1313
1 file changed, 809 insertions, 504 deletions
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index eef9b89c561d..f8829b667ad3 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -16,6 +16,7 @@
 #include "journal_io.h"
 #include "replicas.h"
 #include "snapshot.h"
+#include "super.h"
 #include "trace.h"
 
 #include <linux/random.h>
@@ -114,11 +115,9 @@ static inline bool btree_path_pos_in_node(struct btree_path *path,
 		!btree_path_pos_after_node(path, b);
 }
 
-/* Btree iterator: */
+/* Debug: */
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-static void bch2_btree_path_verify_cached(struct btree_trans *trans,
+static void __bch2_btree_path_verify_cached(struct btree_trans *trans,
 					    struct btree_path *path)
 {
 	struct bkey_cached *ck;
@@ -135,7 +134,7 @@ static void bch2_btree_path_verify_cached(struct btree_trans *trans,
 		btree_node_unlock(trans, path, 0);
 }
 
-static void bch2_btree_path_verify_level(struct btree_trans *trans,
+static void __bch2_btree_path_verify_level(struct btree_trans *trans,
 				struct btree_path *path, unsigned level)
 {
 	struct btree_path_level *l;
@@ -147,16 +146,13 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans,
 	struct printbuf buf3 = PRINTBUF;
 	const char *msg;
 
-	if (!bch2_debug_check_iterators)
-		return;
-
 	l	= &path->l[level];
 	tmp	= l->iter;
 	locked	= btree_node_locked(path, level);
 
 	if (path->cached) {
 		if (!level)
-			bch2_btree_path_verify_cached(trans, path);
+			__bch2_btree_path_verify_cached(trans, path);
 		return;
 	}
@@ -217,7 +213,7 @@ err:
 	      msg, level, buf1.buf, buf2.buf, buf3.buf);
 }
 
-static void bch2_btree_path_verify(struct btree_trans *trans,
+static void __bch2_btree_path_verify(struct btree_trans *trans,
 				   struct btree_path *path)
 {
 	struct bch_fs *c = trans->c;
@@ -229,25 +225,23 @@ static void bch2_btree_path_verify(struct btree_trans *trans,
 			break;
 		}
 
-		bch2_btree_path_verify_level(trans, path, i);
+		__bch2_btree_path_verify_level(trans, path, i);
 	}
 
-	bch2_btree_path_verify_locks(path);
+	bch2_btree_path_verify_locks(trans, path);
 }
 
-void bch2_trans_verify_paths(struct btree_trans *trans)
+void __bch2_trans_verify_paths(struct btree_trans *trans)
 {
 	struct btree_path *path;
 	unsigned iter;
 
 	trans_for_each_path(trans, path, iter)
-		bch2_btree_path_verify(trans, path);
+		__bch2_btree_path_verify(trans, path);
 }
 
-static void bch2_btree_iter_verify(struct btree_iter *iter)
+static void __bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter)
 {
-	struct btree_trans *trans = iter->trans;
-
 	BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);
 
 	BUG_ON((iter->flags & BTREE_ITER_is_extents) &&
@@ -258,11 +252,11 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
 	       !btree_type_has_snapshot_field(iter->btree_id));
 
 	if (iter->update_path)
-		bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
-	bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
+		__bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
+	__bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
 }
 
-static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
+static void __bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
 {
 	BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) &&
 	       !iter->pos.snapshot);
@@ -270,20 +264,19 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
 	BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) &&
 	       iter->pos.snapshot != iter->snapshot);
 
-	BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
-	       bkey_gt(iter->pos, iter->k.p));
+	BUG_ON(iter->flags & BTREE_ITER_all_snapshots	? !bpos_eq(iter->pos, iter->k.p) :
+	       !(iter->flags & BTREE_ITER_is_extents)	? !bkey_eq(iter->pos, iter->k.p) :
+	       (bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
+		bkey_gt(iter->pos, iter->k.p)));
 }
 
-static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
+static int __bch2_btree_iter_verify_ret(struct btree_trans *trans,
+					struct btree_iter *iter, struct bkey_s_c k)
 {
-	struct btree_trans *trans = iter->trans;
 	struct btree_iter copy;
 	struct bkey_s_c prev;
 	int ret = 0;
 
-	if (!bch2_debug_check_iterators)
-		return 0;
-
 	if (!(iter->flags & BTREE_ITER_filter_snapshots))
 		return 0;
@@ -297,7 +290,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k
 	bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
 			     BTREE_ITER_nopreserve|
 			     BTREE_ITER_all_snapshots);
-	prev = bch2_btree_iter_prev(&copy);
+	prev = bch2_btree_iter_prev(trans, &copy);
 	if (!prev.k)
 		goto out;
@@ -324,10 +317,10 @@ out:
 	return ret;
 }
 
-void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
+void __bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
 			    struct bpos pos)
 {
-	bch2_trans_verify_not_unlocked(trans);
+	bch2_trans_verify_not_unlocked_or_in_restart(trans);
 
 	struct btree_path *path;
 	struct trans_for_each_path_inorder_iter iter;
@@ -357,17 +350,40 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
 		panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf);
 }
 
-#else
-
 static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
-						struct btree_path *path, unsigned l) {}
+						struct btree_path *path, unsigned l)
+{
+	if (static_branch_unlikely(&bch2_debug_check_iterators))
+		__bch2_btree_path_verify_level(trans, path, l);
+}
+
 static inline void bch2_btree_path_verify(struct btree_trans *trans,
-					  struct btree_path *path) {}
-static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
-static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
-static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
+					  struct btree_path *path)
+{
+	if (static_branch_unlikely(&bch2_debug_check_iterators))
+		__bch2_btree_path_verify(trans, path);
+}
 
-#endif
+static inline void bch2_btree_iter_verify(struct btree_trans *trans,
+					  struct btree_iter *iter)
+{
+	if (static_branch_unlikely(&bch2_debug_check_iterators))
+		__bch2_btree_iter_verify(trans, iter);
+}
+
+static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
+{
+	if (static_branch_unlikely(&bch2_debug_check_iterators))
+		__bch2_btree_iter_verify_entry_exit(iter);
+}
+
+static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter,
+					     struct bkey_s_c k)
+{
+	return static_branch_unlikely(&bch2_debug_check_iterators)
+		? __bch2_btree_iter_verify_ret(trans, iter, k)
+		: 0;
+}
 
 /* Btree path: fixups after btree updates */
 
@@ -521,7 +537,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans,
 		__bch2_btree_node_iter_fix(path, b, node_iter, t,
 					   where, clobber_u64s, new_u64s);
 
-		if (bch2_debug_check_iterators)
+		if (static_branch_unlikely(&bch2_debug_check_iterators))
 			bch2_btree_node_iter_verify(node_iter, b);
 	}
@@ -560,20 +576,6 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
 			bch2_btree_node_iter_peek_all(&l->iter, l->b));
 }
 
-static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
-						    struct btree_path *path,
-						    struct btree_path_level *l,
-						    struct bkey *u)
-{
-	struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
-			bch2_btree_node_iter_peek(&l->iter, l->b));
-
-	path->pos = k.k ? k.k->p : l->b->key.k.p;
-	trans->paths_sorted = false;
-	bch2_btree_path_verify_level(trans, path, l - path->l);
-	return k;
-}
-
 static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
 						    struct btree_path *path,
 						    struct btree_path_level *l,
@@ -697,6 +699,19 @@ void bch2_trans_node_add(struct btree_trans *trans,
 	bch2_trans_revalidate_updates_in_node(trans, b);
 }
 
+void bch2_trans_node_drop(struct btree_trans *trans,
+			  struct btree *b)
+{
+	struct btree_path *path;
+	unsigned i, level = b->c.level;
+
+	trans_for_each_path(trans, path, i)
+		if (path->l[level].b == b) {
+			btree_node_unlock(trans, path, level);
+			path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
+		}
+}
+
 /*
  * A btree node has been modified in such a way as to invalidate iterators - fix
  * them:
@@ -720,7 +735,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
 				       unsigned long trace_ip)
 {
 	struct bch_fs *c = trans->c;
-	struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b;
+	struct btree_root *r = bch2_btree_id_root(c, path->btree_id);
 	enum six_lock_type lock_type;
 	unsigned i;
 	int ret;
@@ -728,7 +743,12 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
 	EBUG_ON(path->nodes_locked);
 
 	while (1) {
-		b = READ_ONCE(*rootp);
+		struct btree *b = READ_ONCE(r->b);
+		if (unlikely(!b)) {
+			BUG_ON(!r->error);
+			return r->error;
+		}
+
 		path->level = READ_ONCE(b->c.level);
 
 		if (unlikely(path->level < depth_want)) {
@@ -748,14 +768,12 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
 		ret = btree_node_lock(trans, path, &b->c,
 				      path->level, lock_type, trace_ip);
 		if (unlikely(ret)) {
-			if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
-				continue;
 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 				return ret;
 			BUG();
 		}
 
-		if (likely(b == READ_ONCE(*rootp) &&
+		if (likely(b == READ_ONCE(r->b) &&
 			   b->c.level == path->level &&
 			   !race_fault())) {
 			for (i = 0; i < path->level; i++)
@@ -825,6 +843,8 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p
 
 	bch2_bkey_buf_init(&tmp);
 
+	jiter->fail_if_too_many_whiteouts = true;
+
 	while (nr-- && !ret) {
 		if (!bch2_btree_node_relock(trans, path, path->level))
 			break;
@@ -870,8 +890,7 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
 
 static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
 						     struct btree_path *path,
-						     unsigned flags,
-						     struct bkey_buf *out)
+						     unsigned flags)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_path_level *l = path_l(path);
@@ -895,7 +914,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
 		goto err;
 	}
 
-	bch2_bkey_buf_reassemble(out, c, k);
+	bkey_reassemble(&trans->btree_path_down, k);
 
 	if ((flags & BTREE_ITER_prefetch) &&
 	    c->opts.btree_node_prefetch)
@@ -906,6 +925,22 @@ err:
 	return ret;
 }
 
+static noinline_for_stack int btree_node_missing_err(struct btree_trans *trans,
+						     struct btree_path *path)
+{
+	struct bch_fs *c = trans->c;
+	struct printbuf buf = PRINTBUF;
+
+	prt_str(&buf, "node not found at pos ");
+	bch2_bpos_to_text(&buf, path->pos);
+	prt_str(&buf, " within parent node ");
+	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key));
+
+	bch2_fs_fatal_error(c, "%s", buf.buf);
+	printbuf_exit(&buf);
+	return bch_err_throw(c, btree_need_topology_repair);
+}
+
 static __always_inline int btree_path_down(struct btree_trans *trans,
 					   struct btree_path *path,
 					   unsigned flags,
@@ -916,51 +951,38 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
 	struct btree *b;
 	unsigned level = path->level - 1;
 	enum six_lock_type lock_type = __btree_lock_want(path, level);
-	struct bkey_buf tmp;
 	int ret;
 
 	EBUG_ON(!btree_node_locked(path, path->level));
 
-	bch2_bkey_buf_init(&tmp);
-
 	if (unlikely(trans->journal_replay_not_finished)) {
-		ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
+		ret = btree_node_iter_and_journal_peek(trans, path, flags);
 		if (ret)
-			goto err;
+			return ret;
 	} else {
 		struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
-		if (!k) {
-			struct printbuf buf = PRINTBUF;
-
-			prt_str(&buf, "node not found at pos ");
-			bch2_bpos_to_text(&buf, path->pos);
-			prt_str(&buf, " within parent node ");
-			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key));
-
-			bch2_fs_fatal_error(c, "%s", buf.buf);
-			printbuf_exit(&buf);
-			ret = -BCH_ERR_btree_need_topology_repair;
-			goto err;
-		}
+		if (unlikely(!k))
+			return btree_node_missing_err(trans, path);
 
-		bch2_bkey_buf_unpack(&tmp, c, l->b, k);
+		bch2_bkey_unpack(l->b, &trans->btree_path_down, k);
 
-		if ((flags & BTREE_ITER_prefetch) &&
+		if (unlikely((flags & BTREE_ITER_prefetch)) &&
 		    c->opts.btree_node_prefetch) {
 			ret = btree_path_prefetch(trans, path);
 			if (ret)
-				goto err;
+				return ret;
 		}
 	}
 
-	b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
+	b = bch2_btree_node_get(trans, path, &trans->btree_path_down,
+				level, lock_type, trace_ip);
 	ret = PTR_ERR_OR_ZERO(b);
 	if (unlikely(ret))
-		goto err;
+		return ret;
 
-	if (likely(!trans->journal_replay_not_finished &&
-		   tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
-	    unlikely(b != btree_node_mem_ptr(tmp.k)))
+	if (unlikely(b != btree_node_mem_ptr(&trans->btree_path_down)) &&
+	    likely(!trans->journal_replay_not_finished &&
+		   trans->btree_path_down.k.type == KEY_TYPE_btree_ptr_v2))
 		btree_node_mem_ptr_set(trans, path, level + 1, b);
 
 	if (btree_node_read_locked(path, level + 1))
@@ -971,10 +993,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
 	path->level = level;
 	bch2_btree_path_level_init(trans, path, b);
 
-	bch2_btree_path_verify_locks(path);
-err:
-	bch2_bkey_buf_exit(&tmp, c);
-	return ret;
+	bch2_btree_path_verify_locks(trans, path);
+	return 0;
 }
 
 static int bch2_btree_path_traverse_all(struct btree_trans *trans)
@@ -986,7 +1006,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans)
 	int ret = 0;
 
 	if (trans->in_traverse_all)
-		return -BCH_ERR_transaction_restart_in_traverse_all;
+		return bch_err_throw(c, transaction_restart_in_traverse_all);
 
 	trans->in_traverse_all = true;
retry_all:
@@ -1000,7 +1020,7 @@ retry_all:
 	bch2_trans_unlock(trans);
 	cond_resched();
-	trans_set_locked(trans);
+	trans_set_locked(trans, false);
 
 	if (unlikely(trans->memory_allocation_failure)) {
 		struct closure cl;
@@ -1083,7 +1103,7 @@ static void btree_path_set_level_down(struct btree_trans *trans,
 		if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
 			btree_node_unlock(trans, path, l);
 
-	btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+	btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
 	bch2_btree_path_verify(trans, path);
 }
@@ -1156,7 +1176,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
 	}
 
 	if (path->cached) {
-		ret = bch2_btree_path_traverse_cached(trans, path, flags);
+		ret = bch2_btree_path_traverse_cached(trans, path_idx, flags);
 		goto out;
 	}
@@ -1267,7 +1287,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
 {
 	int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos);
 
-	bch2_trans_verify_not_in_restart(trans);
+	bch2_trans_verify_not_unlocked_or_in_restart(trans);
 	EBUG_ON(!trans->paths[path_idx].ref);
 
 	trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos);
@@ -1281,7 +1301,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
 	if (unlikely(path->cached)) {
 		btree_node_unlock(trans, path, 0);
 		path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
-		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+		btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
 		goto out;
 	}
@@ -1310,7 +1330,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
 	}
 
 	if (unlikely(level != path->level)) {
-		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+		btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
 		__bch2_btree_path_unlock(trans, path);
 	}
out:
@@ -1379,45 +1399,45 @@ static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_p
 
 void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent)
 {
-	struct btree_path *path = trans->paths + path_idx, *dup;
+	struct btree_path *path = trans->paths + path_idx, *dup = NULL;
 
 	if (!__btree_path_put(trans, path, intent))
 		return;
 
+	if (!path->preserve &&
+	    !path->should_be_locked)
+		goto free;
+
 	dup = path->preserve
 		? have_path_at_pos(trans, path)
 		: have_node_at_pos(trans, path);
-
-	trace_btree_path_free(trans, path_idx, dup);
-
-	if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
+	if (!dup)
 		return;
 
-	if (path->should_be_locked && !trans->restarted) {
-		if (!dup)
-			return;
-
+	/*
+	 * If we need this path locked, the duplicate also has to be locked
+	 * before we free this one:
+	 */
+	if (path->should_be_locked &&
+	    !dup->should_be_locked &&
+	    !trans->restarted) {
 		if (!(trans->locked
 		      ? bch2_btree_path_relock_norestart(trans, dup)
 		      : bch2_btree_path_can_relock(trans, dup)))
 			return;
-	}
 
-	if (dup) {
-		dup->preserve		|= path->preserve;
-		dup->should_be_locked	|= path->should_be_locked;
+		dup->should_be_locked = true;
 	}
 
-	__bch2_path_free(trans, path_idx);
-}
-
-static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path,
-				 bool intent)
-{
-	if (!__btree_path_put(trans, trans->paths + path, intent))
-		return;
+	BUG_ON(path->should_be_locked &&
+	       !trans->restarted &&
+	       trans->locked &&
+	       !btree_node_locked(dup, dup->level));
 
-	__bch2_path_free(trans, path);
+	path->should_be_locked = false;
+	dup->preserve |= path->preserve;
free:
+	trace_btree_path_free(trans, path_idx, dup);
+	__bch2_path_free(trans, path_idx);
 }
 
 void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
@@ -1427,17 +1447,31 @@ void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_
 	      (void *) trans->last_begin_ip);
 }
 
-void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
+static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
 {
+#ifdef CONFIG_BCACHEFS_DEBUG
+	struct printbuf buf = PRINTBUF;
+	bch2_prt_backtrace(&buf, &trans->last_restarted_trace);
+	panic("in transaction restart: %s, last restarted by\n%s",
+	      bch2_err_str(trans->restarted),
+	      buf.buf);
+#else
 	panic("in transaction restart: %s, last restarted by %pS\n",
 	      bch2_err_str(trans->restarted),
 	      (void *) trans->last_restarted_ip);
+#endif
 }
 
-void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans)
+void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *trans)
 {
-	panic("trans should be locked, unlocked by %pS\n",
-	      (void *) trans->last_unlock_ip);
+	if (trans->restarted)
+		bch2_trans_in_restart_error(trans);
+
+	if (!trans->locked)
+		panic("trans should be locked, unlocked by %pS\n",
+		      (void *) trans->last_unlock_ip);
+
+	BUG();
 }
 
 noinline __cold
@@ -1450,10 +1484,11 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
 	trans_for_each_update(trans, i) {
 		struct bkey_s_c old = { &i->old_k, i->old_v };
 
-		prt_printf(buf, "update: btree=%s cached=%u %pS\n",
-			   bch2_btree_id_str(i->btree_id),
-			   i->cached,
-			   (void *) i->ip_allocated);
+		prt_str(buf, "update: btree=");
+		bch2_btree_id_to_text(buf, i->btree_id);
+		prt_printf(buf, " cached=%u %pS\n",
+			   i->cached,
+			   (void *) i->ip_allocated);
 
 		prt_printf(buf, "  old ");
 		bch2_bkey_val_to_text(buf, trans->c, old);
@@ -1464,35 +1499,27 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
 		prt_newline(buf);
 	}
 
-	for (struct jset_entry *e = trans->journal_entries;
+	for (struct jset_entry *e = btree_trans_journal_entries_start(trans);
 	     e != btree_trans_journal_entries_top(trans);
-	     e = vstruct_next(e))
+	     e = vstruct_next(e)) {
 		bch2_journal_entry_to_text(buf, trans->c, e);
+		prt_newline(buf);
+	}
 
 	printbuf_indent_sub(buf, 2);
 }
 
-noinline __cold
-void bch2_dump_trans_updates(struct btree_trans *trans)
-{
-	struct printbuf buf = PRINTBUF;
-
-	bch2_trans_updates_to_text(&buf, trans);
-	bch2_print_str(trans->c, buf.buf);
-	printbuf_exit(&buf);
-}
-
 static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
 {
 	struct btree_path *path = trans->paths + path_idx;
 
-	prt_printf(out, "path: idx %3u ref %u:%u %c %c %c btree=%s l=%u pos ",
+	prt_printf(out, "path: idx %3u ref %u:%u %c %c %c ",
		   path_idx, path->ref, path->intent_ref,
		   path->preserve ? 'P' : ' ',
		   path->should_be_locked ? 'S' : ' ',
-		   path->cached ? 'C' : 'B',
-		   bch2_btree_id_str(path->btree_id),
-		   path->level);
+		   path->cached ? 'C' : 'B');
+	bch2_btree_id_level_to_text(out, path->btree_id, path->level);
+	prt_str(out, " pos ");
 	bch2_bpos_to_text(out, path->pos);
 
 	if (!path->cached && btree_node_locked(path, path->level)) {
@@ -1578,7 +1605,7 @@ void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
 	__bch2_trans_paths_to_text(&buf, trans, nosort);
 	bch2_trans_updates_to_text(&buf, trans);
 
-	bch2_print_str(trans->c, buf.buf);
+	bch2_print_str(trans->c, KERN_ERR, buf.buf);
 	printbuf_exit(&buf);
 }
@@ -1717,12 +1744,15 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
 	struct trans_for_each_path_inorder_iter iter;
 	btree_path_idx_t path_pos = 0, path_idx;
 
-	bch2_trans_verify_not_unlocked(trans);
-	bch2_trans_verify_not_in_restart(trans);
+	bch2_trans_verify_not_unlocked_or_in_restart(trans);
 	bch2_trans_verify_locks(trans);
 
 	btree_trans_sort_paths(trans);
 
+	if (intent)
+		locks_want = max(locks_want, level + 1);
+	locks_want = min(locks_want, BTREE_MAX_DEPTH);
+
 	trans_for_each_path_inorder(trans, path, iter) {
 		if (__btree_path_cmp(path,
 				     btree_id,
@@ -1737,7 +1767,8 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
 	if (path_pos &&
 	    trans->paths[path_pos].cached	== cached &&
 	    trans->paths[path_pos].btree_id	== btree_id &&
-	    trans->paths[path_pos].level	== level) {
+	    trans->paths[path_pos].level	== level &&
+	    bch2_btree_path_upgrade_norestart(trans, trans->paths + path_pos, locks_want)) {
 		trace_btree_path_get(trans, trans->paths + path_pos, &pos);
 
 		__btree_path_get(trans, trans->paths + path_pos, intent);
@@ -1769,9 +1800,6 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
 	if (!(flags & BTREE_ITER_nopreserve))
 		path->preserve = true;
 
-	if (path->intent_ref)
-		locks_want = max(locks_want, level + 1);
-
 	/*
 	 * If the path has locks_want greater than requested, we don't downgrade
 	 * it here - on transaction restart because btree node split needs to
@@ -1780,10 +1808,6 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
 	 * a successful transaction commit.
 	 */
 
-	locks_want = min(locks_want, BTREE_MAX_DEPTH);
-	if (locks_want > path->locks_want)
-		bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
-
 	return path_idx;
 }
@@ -1833,7 +1857,7 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *
 			 !bkey_eq(path->pos, ck->key.pos));
 
 		*u = ck->k->k;
-		k = bkey_i_to_s_c(ck->k);
+		k = (struct bkey_s_c) { u, &ck->k->v };
 	}
 
 	return k;
@@ -1843,11 +1867,8 @@ hole:
 	return (struct bkey_s_c) { u, NULL };
 }
 
-
-void bch2_set_btree_iter_dontneed(struct btree_iter *iter)
+void bch2_set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter)
 {
-	struct btree_trans *trans = iter->trans;
-
 	if (!iter->path || trans->restarted)
 		return;
@@ -1859,25 +1880,22 @@ void bch2_set_btree_iter_dontneed(struct btree_iter *iter)
 /* Btree iterators: */
 
 int __must_check
-__bch2_btree_iter_traverse(struct btree_iter *iter)
+__bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
 {
-	return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
+	return bch2_btree_path_traverse(trans, iter->path, iter->flags);
 }
 
 int __must_check
-bch2_btree_iter_traverse(struct btree_iter *iter)
+bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
 {
-	struct btree_trans *trans = iter->trans;
-	int ret;
-
-	bch2_trans_verify_not_unlocked(trans);
+	bch2_trans_verify_not_unlocked_or_in_restart(trans);
 
 	iter->path = bch2_btree_path_set_pos(trans, iter->path,
					btree_iter_search_key(iter),
					iter->flags & BTREE_ITER_intent,
					btree_iter_ip_allocated(iter));
 
-	ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
+	int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
 	if (ret)
 		return ret;
@@ -1889,14 +1907,14 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
 
 /* Iterate across nodes (leaf and interior nodes) */
 
-struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
+struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans,
+					struct btree_iter *iter)
 {
-	struct btree_trans *trans = iter->trans;
 	struct btree *b = NULL;
 	int ret;
 
 	EBUG_ON(trans->paths[iter->path].cached);
-	bch2_btree_iter_verify(iter);
+	bch2_btree_iter_verify(trans, iter);
 
 	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
 	if (ret)
@@ -1918,7 +1936,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
 	btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
out:
 	bch2_btree_iter_verify_entry_exit(iter);
-	bch2_btree_iter_verify(iter);
+	bch2_btree_iter_verify(trans, iter);
 
 	return b;
err:
@@ -1927,26 +1945,26 @@ err:
 }
 
 /* Only kept for -tools */
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *trans,
+						    struct btree_iter *iter)
 {
 	struct btree *b;
 
-	while (b = bch2_btree_iter_peek_node(iter),
+	while (b = bch2_btree_iter_peek_node(trans, iter),
	       bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
-		bch2_trans_begin(iter->trans);
+		bch2_trans_begin(trans);
 
 	return b;
 }
 
-struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
+struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_iter *iter)
 {
-	struct btree_trans *trans = iter->trans;
 	struct btree *b = NULL;
 	int ret;
 
 	EBUG_ON(trans->paths[iter->path].cached);
-	bch2_trans_verify_not_in_restart(trans);
-	bch2_btree_iter_verify(iter);
+	bch2_trans_verify_not_unlocked_or_in_restart(trans);
+	bch2_btree_iter_verify(trans, iter);
 
 	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
 	if (ret)
@@ -1961,17 +1979,24 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
 
 	/* got to end? */
 	if (!btree_path_node(path, path->level + 1)) {
+		path->should_be_locked = false;
 		btree_path_set_level_up(trans, path);
 		return NULL;
 	}
 
+	/*
+	 * We don't correctly handle nodes with extra intent locks here:
+	 * downgrade so we don't violate locking invariants
+	 */
+	bch2_btree_path_downgrade(trans, path);
+
 	if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
+		trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
+		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
 		__bch2_btree_path_unlock(trans, path);
 		path->l[path->level].b		= ERR_PTR(-BCH_ERR_no_btree_node_relock);
 		path->l[path->level + 1].b	= ERR_PTR(-BCH_ERR_no_btree_node_relock);
-		btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
-		trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
-		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+		btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
 		goto err;
 	}
@@ -2013,7 +2038,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
 	EBUG_ON(btree_iter_path(trans, iter)->uptodate);
out:
 	bch2_btree_iter_verify_entry_exit(iter);
-	bch2_btree_iter_verify(iter);
+	bch2_btree_iter_verify(trans, iter);
 
 	return b;
err:
@@ -2023,7 +2048,7 @@ err:
 
 /* Iterate across keys (in leaf nodes only) */
 
-inline bool bch2_btree_iter_advance(struct btree_iter *iter)
+inline bool bch2_btree_iter_advance(struct btree_trans *trans, struct btree_iter *iter)
 {
 	struct bpos pos = iter->k.p;
 	bool ret = !(iter->flags & BTREE_ITER_all_snapshots
@@ -2032,11 +2057,11 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter)
 	if (ret && !(iter->flags & BTREE_ITER_is_extents))
 		pos = bkey_successor(iter, pos);
-	bch2_btree_iter_set_pos(iter, pos);
+	bch2_btree_iter_set_pos(trans, iter, pos);
 	return ret;
 }
 
-inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
+inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter *iter)
 {
 	struct bpos pos = bkey_start_pos(&iter->k);
 	bool ret = !(iter->flags & BTREE_ITER_all_snapshots
@@ -2045,20 +2070,20 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
 	if (ret && !(iter->flags & BTREE_ITER_is_extents))
 		pos = bkey_predecessor(iter, pos);
-	bch2_btree_iter_set_pos(iter, pos);
+	bch2_btree_iter_set_pos(trans, iter, pos);
 	return ret;
 }
 
 static noinline
 void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter,
-					struct bkey_s_c *k)
+					struct bpos search_key, struct bkey_s_c *k)
 {
 	struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key;
 
 	trans_for_each_update(trans, i)
 		if (!i->key_cache_already_flushed &&
 		    i->btree_id == iter->btree_id &&
-		    bpos_le(i->k->k.p, iter->pos) &&
+		    bpos_le(i->k->k.p, search_key) &&
 		    bpos_ge(i->k->k.p, k->k ? k->k->p : end)) {
 			iter->k = i->k->k;
 			*k = bkey_i_to_s_c(i->k);
 		}
@@ -2067,6 +2092,7 @@ void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_
 
 static noinline
 void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter,
+				   struct bpos search_key,
 				   struct bkey_s_c *k)
 {
 	struct btree_path *path = btree_iter_path(trans, iter);
@@ -2075,7 +2101,7 @@ void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter
 	trans_for_each_update(trans, i)
 		if (!i->key_cache_already_flushed &&
 		    i->btree_id == iter->btree_id &&
-		    bpos_ge(i->k->k.p, path->pos) &&
+		    bpos_ge(i->k->k.p, search_key) &&
 		    bpos_le(i->k->k.p, k->k ? k->k->p : end)) {
 			iter->k = i->k->k;
 			*k = bkey_i_to_s_c(i->k);
 		}
@@ -2097,13 +2123,14 @@ void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_
 
 static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
					      struct btree_iter *iter,
+					      struct bpos search_pos,
					      struct bpos end_pos)
 {
 	struct btree_path *path = btree_iter_path(trans, iter);
 
-	return bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
+	return bch2_journal_keys_peek_max(trans->c, iter->btree_id,
					   path->level,
-					   path->pos,
+					   search_pos,
					   end_pos,
					   &iter->journal_idx);
 }
@@ -2113,7 +2140,7 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
					      struct btree_iter *iter)
 {
 	struct btree_path *path = btree_iter_path(trans, iter);
-	struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos);
+	struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos);
 
 	if (k) {
 		iter->k = k->k;
@@ -2124,21 +2151,50 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
 }
 
 static noinline
-struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
-					 struct btree_iter *iter,
-					 struct bkey_s_c k)
+void btree_trans_peek_journal(struct btree_trans *trans,
+			      struct btree_iter *iter,
+			      struct bpos search_key,
+			      struct bkey_s_c *k)
 {
 	struct btree_path *path = btree_iter_path(trans, iter);
 	struct bkey_i *next_journal =
-		bch2_btree_journal_peek(trans, iter,
-				k.k ? k.k->p : path_l(path)->b->key.k.p);
-
+		bch2_btree_journal_peek(trans, iter, search_key,
				k->k ? k->k->p : path_l(path)->b->key.k.p);
 	if (next_journal) {
 		iter->k = next_journal->k;
-		k = bkey_i_to_s_c(next_journal);
+		*k = bkey_i_to_s_c(next_journal);
 	}
+}
 
-	return k;
+static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
+						   struct btree_iter *iter,
+						   struct bpos search_key,
+						   struct bpos end_pos)
+{
+	struct btree_path *path = btree_iter_path(trans, iter);
+
+	return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id,
+					       path->level,
+					       search_key,
+					       end_pos,
+					       &iter->journal_idx);
+}
+
+static noinline
+void btree_trans_peek_prev_journal(struct btree_trans *trans,
+				   struct btree_iter *iter,
+				   struct bpos search_key,
+				   struct bkey_s_c *k)
+{
+	struct btree_path *path = btree_iter_path(trans, iter);
+	struct bkey_i *next_journal =
+		bch2_btree_journal_peek_prev(trans, iter, search_key,
				k->k ? k->k->p : path_l(path)->b->data->min_key);
+
+	if (next_journal) {
+		iter->k = next_journal->k;
+		*k = bkey_i_to_s_c(next_journal);
+	}
 }
 
 /*
@@ -2146,16 +2202,15 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
  * bkey_s_c_null:
  */
 static noinline
-struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
+struct bkey_s_c btree_trans_peek_key_cache(struct btree_trans *trans, struct btree_iter *iter,
+					   struct bpos pos)
 {
-	struct btree_trans *trans = iter->trans;
 	struct bch_fs *c = trans->c;
 	struct bkey u;
 	struct bkey_s_c k;
 	int ret;
 
-	bch2_trans_verify_not_in_restart(trans);
-	bch2_trans_verify_not_unlocked(trans);
+	bch2_trans_verify_not_unlocked_or_in_restart(trans);
 
 	if ((iter->flags & BTREE_ITER_key_cache_fill) &&
	    bpos_eq(iter->pos, pos))
@@ -2181,28 +2236,30 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
 	if (unlikely(ret))
 		return bkey_s_c_err(ret);
 
-	btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
-
 	k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
-	if (k.k && !bkey_err(k)) {
-		iter->k = u;
-		k.k = &iter->k;
-	}
+	if (!k.k)
+		return k;
+
+	if ((iter->flags & BTREE_ITER_all_snapshots) &&
+	    !bpos_eq(pos, k.k->p))
+		return bkey_s_c_null;
+
+	iter->k = u;
+	k.k = &iter->k;
+	btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);
 	return k;
 }
 
-static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
+static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct btree_iter *iter,
+					      struct bpos search_key)
 {
-	struct btree_trans *trans = iter->trans;
 	struct bkey_s_c k, k2;
 	int ret;
 
 	EBUG_ON(btree_iter_path(trans, iter)->cached);
-	bch2_btree_iter_verify(iter);
+	bch2_btree_iter_verify(trans, iter);
 
 	while (1) {
-		struct btree_path_level *l;
-
 		iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
					iter->flags & BTREE_ITER_intent,
					btree_iter_ip_allocated(iter));
@@ -2210,19 +2267,19 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
 		ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
 		if (unlikely(ret)) {
 			/* ensure that iter->k is consistent with iter->pos: */
-			bch2_btree_iter_set_pos(iter, iter->pos);
+			bch2_btree_iter_set_pos(trans, iter, iter->pos);
 			k = bkey_s_c_err(ret);
-			goto out;
+			break;
 		}
 
 		struct btree_path *path = btree_iter_path(trans, iter);
-		l = path_l(path);
+		struct btree_path_level *l = path_l(path);
 
 		if (unlikely(!l->b)) {
 			/* No btree nodes at requested level: */
-			bch2_btree_iter_set_pos(iter, SPOS_MAX);
+			bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
 			k = bkey_s_c_null;
-			goto out;
+			break;
 		}
 
 		btree_path_set_should_be_locked(trans, path);
@@ -2231,21 +2288,20 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
 
 		if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
		    k.k &&
-		    (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
+		    (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
 			k = k2;
-			ret = bkey_err(k);
-			if (ret) {
-				bch2_btree_iter_set_pos(iter, iter->pos);
-				goto out;
+			if (bkey_err(k)) {
+				bch2_btree_iter_set_pos(trans, iter, iter->pos);
+				break;
 			}
 		}
 
 		if (unlikely(iter->flags & BTREE_ITER_with_journal))
-			k = btree_trans_peek_journal(trans, iter, k);
+			btree_trans_peek_journal(trans, iter, search_key, &k);
 
 		if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
			     trans->nr_updates))
-			bch2_btree_trans_peek_updates(trans, iter, &k);
+			bch2_btree_trans_peek_updates(trans, iter, search_key, &k);
 
 		if (k.k && bkey_deleted(k.k)) {
 			/*
@@ -2268,120 +2324,138 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
 			search_key = bpos_successor(l->b->key.k.p);
 		} else {
 			/* End of btree: */
-			bch2_btree_iter_set_pos(iter, SPOS_MAX);
+			bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
 			k = bkey_s_c_null;
-			goto out;
+			break;
 		}
 	}
-out:
-	bch2_btree_iter_verify(iter);
+
+	bch2_btree_iter_verify(trans, iter);
+
+	if (trace___btree_iter_peek_enabled()) {
+		CLASS(printbuf, buf)();
+
+		int ret = bkey_err(k);
+		if (ret)
+			prt_str(&buf, bch2_err_str(ret));
+		else if (k.k)
+			bch2_bkey_val_to_text(&buf, trans->c, k);
+		else
+			prt_str(&buf, "(null)");
+		trace___btree_iter_peek(trans->c, buf.buf);
+	}
 
 	return k;
 }
 
 /**
- * bch2_btree_iter_peek_upto() - returns first key greater than or equal to
+ * bch2_btree_iter_peek_max() - returns first key greater than or equal to
  * iterator's current position
+ * @trans:	btree transaction object
  * @iter:	iterator to peek from
 * @end:	search limit: returns keys less than or equal to @end
 *
 * Returns:	key if found, or an error extractable with bkey_err().
 */
-struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
+struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree_iter *iter,
+					 struct bpos end)
 {
-	struct btree_trans *trans = iter->trans;
 	struct bpos search_key = btree_iter_search_key(iter);
 	struct bkey_s_c k;
-	struct bpos iter_pos;
+	struct bpos iter_pos = iter->pos;
 	int ret;
 
-	bch2_trans_verify_not_unlocked(trans);
+	bch2_trans_verify_not_unlocked_or_in_restart(trans);
+	bch2_btree_iter_verify_entry_exit(iter);
 	EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
 
+	ret = trans_maybe_inject_restart(trans, _RET_IP_);
+	if (unlikely(ret)) {
+		k = bkey_s_c_err(ret);
+		goto out_no_locked;
+	}
+
 	if (iter->update_path) {
-		bch2_path_put_nokeep(trans, iter->update_path,
-				     iter->flags & BTREE_ITER_intent);
+		bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent);
 		iter->update_path = 0;
 	}
 
-	bch2_btree_iter_verify_entry_exit(iter);
-
 	while (1) {
-		k = __bch2_btree_iter_peek(iter, search_key);
+		k = __bch2_btree_iter_peek(trans, iter, search_key);
 		if (unlikely(!k.k))
 			goto end;
 		if (unlikely(bkey_err(k)))
 			goto out_no_locked;
 
-		/*
-		 * We need to check against @end before FILTER_SNAPSHOTS because
-		 * if we get to a different inode that requested we might be
-		 * seeing keys for a different snapshot tree that will all be
-		 * filtered out.
-		 *
-		 * But we can't do the full check here, because bkey_start_pos()
-		 * isn't monotonically increasing before FILTER_SNAPSHOTS, and
-		 * that's what we check against in extents mode:
-		 */
-		if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
-			     ? bkey_gt(k.k->p, end)
-			     : k.k->p.inode > end.inode))
-			goto end;
+		if (iter->flags & BTREE_ITER_filter_snapshots) {
+			/*
+			 * We need to check against @end before FILTER_SNAPSHOTS because
+			 * if we get to a different inode that requested we might be
+			 * seeing keys for a different snapshot tree that will all be
+			 * filtered out.
+			 *
+			 * But we can't do the full check here, because bkey_start_pos()
+			 * isn't monotonically increasing before FILTER_SNAPSHOTS, and
+			 * that's what we check against in extents mode:
+			 */
+			if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
				     ? bkey_gt(k.k->p, end)
				     : k.k->p.inode > end.inode))
+				goto end;
+
+			if (iter->update_path &&
			    !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
+				bch2_path_put(trans, iter->update_path,
					      iter->flags & BTREE_ITER_intent);
+				iter->update_path = 0;
+			}
 
-		if (iter->update_path &&
-		    !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
-			bch2_path_put_nokeep(trans, iter->update_path,
-					     iter->flags & BTREE_ITER_intent);
-			iter->update_path = 0;
-		}
+			if ((iter->flags & BTREE_ITER_intent) &&
			    !(iter->flags & BTREE_ITER_is_extents) &&
			    !iter->update_path) {
+				struct bpos pos = k.k->p;
 
-		if ((iter->flags & BTREE_ITER_filter_snapshots) &&
-		    (iter->flags & BTREE_ITER_intent) &&
-		    !(iter->flags & BTREE_ITER_is_extents) &&
-		    !iter->update_path) {
-			struct bpos pos = k.k->p;
+				if (pos.snapshot < iter->snapshot) {
+					search_key = bpos_successor(k.k->p);
+					continue;
+				}
 
-			if (pos.snapshot < iter->snapshot) {
-				search_key = bpos_successor(k.k->p);
-				continue;
-			}
+				pos.snapshot = iter->snapshot;
 
-			pos.snapshot = iter->snapshot;
+				/*
				 * advance, same as on exit for iter->path, but only up
				 * to snapshot
				 */
+				__btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
+				iter->update_path = iter->path;
+
+				iter->update_path = bch2_btree_path_set_pos(trans,
							iter->update_path, pos,
							iter->flags & BTREE_ITER_intent,
							_THIS_IP_);
+				ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
+				if (unlikely(ret)) {
+					k = bkey_s_c_err(ret);
+					goto out_no_locked;
+				}
+			}
 
 			/*
-			 * advance, same as on exit for iter->path, but only up
-			 * to snapshot
+			 * We can never have a key in a leaf node at POS_MAX, so
+			 * we don't have to check these successor() calls:
 			 */
-			__btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
-			iter->update_path = iter->path;
-
-			iter->update_path = bch2_btree_path_set_pos(trans,
						iter->update_path, pos,
						iter->flags & BTREE_ITER_intent,
						_THIS_IP_);
-			ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
-			if (unlikely(ret)) {
-				k = bkey_s_c_err(ret);
-				goto out_no_locked;
+			if (!bch2_snapshot_is_ancestor(trans->c,
						       iter->snapshot,
						       k.k->p.snapshot)) {
+				search_key = bpos_successor(k.k->p);
+				continue;
 			}
-		}
-
-		/*
-		 * We can never have a key in a leaf node at POS_MAX, so
-		 * we don't have to check these successor() calls:
-		 */
-		if ((iter->flags & BTREE_ITER_filter_snapshots) &&
-		    !bch2_snapshot_is_ancestor(trans->c,
-					       iter->snapshot,
-					       k.k->p.snapshot)) {
-			search_key = bpos_successor(k.k->p);
-			continue;
-		}
 
-		if (bkey_whiteout(k.k) &&
-		    !(iter->flags & BTREE_ITER_all_snapshots)) {
-			search_key = bkey_successor(iter, k.k->p);
-			continue;
+			if (bkey_whiteout(k.k) &&
			    !(iter->flags & BTREE_ITER_key_cache_fill)) {
+				search_key = bkey_successor(iter, k.k->p);
+				continue;
+			}
 		}
 
 		/*
@@ -2421,17 +2495,30 @@ out_no_locked:
 	if (!(iter->flags & BTREE_ITER_all_snapshots))
 		iter->pos.snapshot = iter->snapshot;
 
-	ret = bch2_btree_iter_verify_ret(iter, k);
+	ret = bch2_btree_iter_verify_ret(trans, iter, k);
 	if (unlikely(ret)) {
-		bch2_btree_iter_set_pos(iter, iter->pos);
+		bch2_btree_iter_set_pos(trans, iter, iter->pos);
 		k = bkey_s_c_err(ret);
 	}
 
 	bch2_btree_iter_verify_entry_exit(iter);
 
+	if (trace_btree_iter_peek_max_enabled()) {
+		CLASS(printbuf, buf)();
+
+		int ret = bkey_err(k);
+		if (ret)
+			prt_str(&buf, bch2_err_str(ret));
+		else if (k.k)
+			bch2_bkey_val_to_text(&buf, trans->c, k);
+		else
+			prt_str(&buf, "(null)");
+		trace_btree_iter_peek_max(trans->c, buf.buf);
+	}
+
 	return k;
end:
-	bch2_btree_iter_set_pos(iter, end);
+	bch2_btree_iter_set_pos(trans, iter, end);
 	k = bkey_s_c_null;
 	goto out_no_locked;
 }
@@ -2439,186 +2526,298 @@ end:
 /**
 * bch2_btree_iter_next() - returns first key greater than iterator's current
 * position
+ * @trans:	btree transaction object
 * @iter:	iterator to peek from
 *
 * Returns:	key if found, or an error extractable with bkey_err().
 */
-struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_next(struct btree_trans *trans, struct btree_iter *iter)
 {
-	if (!bch2_btree_iter_advance(iter))
+	if (!bch2_btree_iter_advance(trans, iter))
 		return bkey_s_c_null;
 
-	return bch2_btree_iter_peek(iter);
+	return bch2_btree_iter_peek(trans, iter);
+}
+
+static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter,
+						   struct bpos search_key)
+{
+	struct bkey_s_c k, k2;
+
+	bch2_btree_iter_verify(trans, iter);
+
+	while (1) {
+		iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
						iter->flags & BTREE_ITER_intent,
						btree_iter_ip_allocated(iter));
+
+		int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+		if (unlikely(ret)) {
+			/* ensure that iter->k is consistent with iter->pos: */
+			bch2_btree_iter_set_pos(trans, iter, iter->pos);
+			k = bkey_s_c_err(ret);
+			break;
+		}
+
+		struct btree_path *path = btree_iter_path(trans, iter);
+		struct btree_path_level *l = path_l(path);
+
+		if (unlikely(!l->b)) {
+			/* No btree nodes at requested level: */
+			bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
+			k = bkey_s_c_null;
+			break;
+		}
+
+		btree_path_set_should_be_locked(trans, path);
+
+		k = btree_path_level_peek_all(trans->c, l, &iter->k);
+		if (!k.k || bpos_gt(k.k->p, search_key)) {
+			k = btree_path_level_prev(trans, path, l, &iter->k);
+
+			BUG_ON(k.k && bpos_gt(k.k->p, search_key));
+		}
+
+		if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
		    k.k &&
		    (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
+			k = k2;
+			if (bkey_err(k2)) {
+				bch2_btree_iter_set_pos(trans, iter, iter->pos);
+				break;
+			}
+		}
+
+		if (unlikely(iter->flags & BTREE_ITER_with_journal))
+			btree_trans_peek_prev_journal(trans, iter, search_key, &k);
+
+		if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
			     trans->nr_updates))
+			bch2_btree_trans_peek_prev_updates(trans, iter, search_key, &k);
+
+		if (likely(k.k && !bkey_deleted(k.k))) {
+			break;
+		} else if (k.k) {
+			search_key = bpos_predecessor(k.k->p);
+		} else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
+			/* Advance to previous leaf node: */
+			search_key = bpos_predecessor(path->l[0].b->data->min_key);
+		} else {
+			/* Start of btree: */
+			bch2_btree_iter_set_pos(trans, iter, POS_MIN);
+			k = bkey_s_c_null;
+			break;
+		}
+	}
+
+	bch2_btree_iter_verify(trans, iter);
+	return k;
 }
 
 /**
- * bch2_btree_iter_peek_prev() - returns first key less than or equal to
+ * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to
 * iterator's current position
+ * @trans:	btree transaction object
 * @iter:	iterator to peek from
+ * @end:	search limit: returns keys greater than or equal to @end
 *
 * Returns:	key if found, or an error extractable with bkey_err().
 */
-struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct btree_iter *iter,
+					      struct bpos end)
 {
-	struct btree_trans *trans = iter->trans;
+	if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) &&
	   !bkey_eq(iter->pos, POS_MAX) &&
	   !((iter->flags & BTREE_ITER_is_extents) &&
	     iter->pos.offset == U64_MAX)) {
+
+		/*
		 * bkey_start_pos(), for extents, is not monotonically
		 * increasing until after filtering for snapshots:
		 *
		 * Thus, for extents we need to search forward until we find a
		 * real visible extents - easiest to just use peek_slot() (which
		 * internally uses peek() for extents)
		 */
+		struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
+		if (bkey_err(k))
+			return k;
+
+		if (!bkey_deleted(k.k) &&
		    (!(iter->flags & BTREE_ITER_is_extents) ||
		     bkey_lt(bkey_start_pos(k.k), iter->pos)))
+			return k;
+	}
+
 	struct bpos search_key = iter->pos;
 	struct bkey_s_c k;
-	struct bkey saved_k;
-	const struct bch_val *saved_v;
 	btree_path_idx_t saved_path = 0;
-	int ret;
-
-	bch2_trans_verify_not_unlocked(trans);
-	EBUG_ON(btree_iter_path(trans, iter)->cached ||
-		btree_iter_path(trans, iter)->level);
-
-	if (iter->flags & BTREE_ITER_with_journal)
-		return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported);
-
-	bch2_btree_iter_verify(iter);
+	bch2_trans_verify_not_unlocked_or_in_restart(trans);
 	bch2_btree_iter_verify_entry_exit(iter);
+	EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && iter->pos.inode != end.inode);
 
-	if (iter->flags & BTREE_ITER_filter_snapshots)
-		search_key.snapshot = U32_MAX;
+	int ret = trans_maybe_inject_restart(trans, _RET_IP_);
+	if (unlikely(ret)) {
+		k = bkey_s_c_err(ret);
+		goto out_no_locked;
+	}
 
 	while (1) {
-		iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
-						iter->flags & BTREE_ITER_intent,
-						btree_iter_ip_allocated(iter));
-
-		ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-		if (unlikely(ret)) {
-			/* ensure that iter->k is consistent with iter->pos: */
-			bch2_btree_iter_set_pos(iter, iter->pos);
-			k = bkey_s_c_err(ret);
+		k = __bch2_btree_iter_peek_prev(trans, iter, search_key);
+		if (unlikely(!k.k))
+			goto end;
+		if (unlikely(bkey_err(k)))
 			goto out_no_locked;
-		}
-
-		struct btree_path *path = btree_iter_path(trans, iter);
 
-		k = btree_path_level_peek(trans, path, &path->l[0], &iter->k);
-		if (!k.k ||
-		    ((iter->flags & BTREE_ITER_is_extents)
-		     ? bpos_ge(bkey_start_pos(k.k), search_key)
-		     : bpos_gt(k.k->p, search_key)))
-			k = btree_path_level_prev(trans, path, &path->l[0], &iter->k);
+		if (iter->flags & BTREE_ITER_filter_snapshots) {
+			struct btree_path *s = saved_path ? trans->paths + saved_path : NULL;
+			if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) {
+				/*
				 * If we have a saved candidate, and we're past
				 * the last possible snapshot overwrite, return
				 * it:
				 */
+				bch2_path_put(trans, iter->path,
					      iter->flags & BTREE_ITER_intent);
+				iter->path = saved_path;
+				saved_path = 0;
+				k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
+				break;
+			}
 
-		if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
			     trans->nr_updates))
-			bch2_btree_trans_peek_prev_updates(trans, iter, &k);
+			/*
			 * We need to check against @end before FILTER_SNAPSHOTS because
			 * if we get to a different inode that requested we might be
			 * seeing keys for a different snapshot tree that will all be
			 * filtered out.
			 */
+			if (unlikely(bkey_lt(k.k->p, end)))
+				goto end;
 
-		if (likely(k.k)) {
-			if (iter->flags & BTREE_ITER_filter_snapshots) {
-				if (k.k->p.snapshot == iter->snapshot)
-					goto got_key;
+			if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) {
+				search_key = bpos_predecessor(k.k->p);
+				continue;
+			}
 
+			if (k.k->p.snapshot != iter->snapshot) {
 				/*
-				 * If we have a saved candidate, and we're no
-				 * longer at the same _key_ (not pos), return
-				 * that candidate
+				 * Have a key visible in iter->snapshot, but
+				 * might have overwrites: - save it and keep
+				 * searching. Unless it's a whiteout - then drop
+				 * our previous saved candidate:
 				 */
-				if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
-					bch2_path_put_nokeep(trans, iter->path,
+				if (saved_path) {
+					bch2_path_put(trans, saved_path,
						      iter->flags & BTREE_ITER_intent);
-					iter->path = saved_path;
 					saved_path = 0;
-					iter->k	= saved_k;
-					k.v	= saved_v;
-					goto got_key;
 				}
 
-				if (bch2_snapshot_is_ancestor(trans->c,
							      iter->snapshot,
							      k.k->p.snapshot)) {
-					if (saved_path)
-						bch2_path_put_nokeep(trans, saved_path,
							      iter->flags & BTREE_ITER_intent);
+				if (!bkey_whiteout(k.k)) {
 					saved_path = btree_path_clone(trans, iter->path,
								iter->flags & BTREE_ITER_intent,
								_THIS_IP_);
-					path = btree_iter_path(trans, iter);
-					trace_btree_path_save_pos(trans, path, trans->paths + saved_path);
-					saved_k = *k.k;
-					saved_v = k.v;
+					trace_btree_path_save_pos(trans,
								  trans->paths + iter->path,
								  trans->paths + saved_path);
 				}
 
 				search_key = bpos_predecessor(k.k->p);
 				continue;
 			}
-got_key:
-			if (bkey_whiteout(k.k) &&
			    !(iter->flags & BTREE_ITER_all_snapshots)) {
+
+			if (bkey_whiteout(k.k)) {
 				search_key = bkey_predecessor(iter, k.k->p);
-				if (iter->flags & BTREE_ITER_filter_snapshots)
-					search_key.snapshot = U32_MAX;
+				search_key.snapshot = U32_MAX;
 				continue;
 			}
-
-			btree_path_set_should_be_locked(trans, path);
-			break;
-		} else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
-			/* Advance to previous leaf node: */
-			search_key = bpos_predecessor(path->l[0].b->data->min_key);
-		} else {
-			/* Start of btree: */
-			bch2_btree_iter_set_pos(iter, POS_MIN);
-			k = bkey_s_c_null;
-			goto out_no_locked;
 		}
-	}
 
-	EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos));
+		EBUG_ON(iter->flags & BTREE_ITER_all_snapshots	? bpos_gt(k.k->p, iter->pos) :
			iter->flags & BTREE_ITER_is_extents	? bkey_ge(bkey_start_pos(k.k), iter->pos) :
								  bkey_gt(k.k->p, iter->pos));
+
+		if (unlikely(iter->flags & BTREE_ITER_all_snapshots	? bpos_lt(k.k->p, end) :
			     iter->flags & BTREE_ITER_is_extents	? bkey_le(k.k->p, end) :
								  bkey_lt(k.k->p, end)))
+			goto end;
+
+		break;
+	}
 
 	/* Extents can straddle iter->pos: */
-	if (bkey_lt(k.k->p, iter->pos))
-		iter->pos = k.k->p;
+	iter->pos = bpos_min(iter->pos, k.k->p);
 
 	if (iter->flags & BTREE_ITER_filter_snapshots)
 		iter->pos.snapshot = iter->snapshot;
out_no_locked:
 	if (saved_path)
-		bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent);
+		bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_intent);
 
 	bch2_btree_iter_verify_entry_exit(iter);
-	bch2_btree_iter_verify(iter);
+	bch2_btree_iter_verify(trans, iter);
+
+	if (trace_btree_iter_peek_prev_min_enabled()) {
+		CLASS(printbuf, buf)();
+
+		int ret = bkey_err(k);
+		if (ret)
+			prt_str(&buf, bch2_err_str(ret));
+		else if (k.k)
+			bch2_bkey_val_to_text(&buf, trans->c, k);
+		else
+			prt_str(&buf, "(null)");
+		trace_btree_iter_peek_prev_min(trans->c, buf.buf);
+	}
 
 	return k;
+end:
+	bch2_btree_iter_set_pos(trans, iter, end);
+	k = bkey_s_c_null;
+	goto out_no_locked;
 }
 
 /**
 * bch2_btree_iter_prev() - returns first key less than iterator's current
 * position
+ * @trans:	btree transaction object
 * @iter:	iterator to peek from
 *
 * Returns:	key if found, or an error extractable with bkey_err().
 */
-struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *trans, struct btree_iter *iter)
 {
-	if (!bch2_btree_iter_rewind(iter))
+	if (!bch2_btree_iter_rewind(trans, iter))
 		return bkey_s_c_null;
 
-	return bch2_btree_iter_peek_prev(iter);
+	return bch2_btree_iter_peek_prev(trans, iter);
 }
 
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter)
 {
-	struct btree_trans *trans = iter->trans;
 	struct bpos search_key;
 	struct bkey_s_c k;
 	int ret;
 
-	bch2_trans_verify_not_unlocked(trans);
-	bch2_btree_iter_verify(iter);
+	bch2_trans_verify_not_unlocked_or_in_restart(trans);
+	bch2_btree_iter_verify(trans, iter);
 	bch2_btree_iter_verify_entry_exit(iter);
 	EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
 
+	ret = trans_maybe_inject_restart(trans, _RET_IP_);
+	if (unlikely(ret)) {
+		k = bkey_s_c_err(ret);
+		goto out;
+	}
+
 	/* extents can't span inode numbers: */
 	if ((iter->flags & BTREE_ITER_is_extents) &&
	    unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
-		if (iter->pos.inode == KEY_INODE_MAX)
-			return bkey_s_c_null;
+		if (iter->pos.inode == KEY_INODE_MAX) {
+			k = bkey_s_c_null;
+			goto out2;
+		}
 
-		bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
+		bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos));
 	}
 
 	search_key = btree_iter_search_key(iter);
@@ -2629,9 +2828,17 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
 	if (unlikely(ret)) {
 		k = bkey_s_c_err(ret);
-		goto out_no_locked;
+		goto out;
+	}
+
+	struct btree_path *path = btree_iter_path(trans, iter);
+	if (unlikely(!btree_path_node(path, path->level))) {
+		k = bkey_s_c_null;
+		goto out2;
 	}
 
+	btree_path_set_should_be_locked(trans, path);
+
 	if ((iter->flags & BTREE_ITER_cached) ||
	    !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
 		k = bkey_s_c_null;
@@ -2648,16 +2855,21 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 			goto out;
 
 		if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
-		    (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
+		    (k = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) {
 			if (!bkey_err(k))
 				iter->k = *k.k;
 			/* We're not returning a key from iter->path: */
-			goto out_no_locked;
+			goto out;
 		}
 
-		k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k);
+		k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
 		if (unlikely(!k.k))
-			goto out_no_locked;
+			goto out;
+
+		if (unlikely(k.k->type == KEY_TYPE_whiteout &&
			     (iter->flags & BTREE_ITER_filter_snapshots) &&
			     !(iter->flags & BTREE_ITER_key_cache_fill)))
+			iter->k.type = KEY_TYPE_deleted;
 	} else {
 		struct bpos next;
 		struct bpos end = iter->pos;
@@ -2670,8 +2882,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 		if (iter->flags & BTREE_ITER_intent) {
 			struct btree_iter iter2;
 
-			bch2_trans_copy_iter(&iter2, iter);
-			k = bch2_btree_iter_peek_upto(&iter2, end);
+			bch2_trans_copy_iter(trans, &iter2, iter);
+			k = bch2_btree_iter_peek_max(trans, &iter2, end);
 
 			if (k.k && !bkey_err(k)) {
 				swap(iter->key_cache_path, iter2.key_cache_path);
@@ -2682,15 +2894,15 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 		} else {
 			struct bpos pos = iter->pos;
 
-			k = bch2_btree_iter_peek_upto(iter, end);
+			k = bch2_btree_iter_peek_max(trans, iter, end);
 			if (unlikely(bkey_err(k)))
-				bch2_btree_iter_set_pos(iter, pos);
+				bch2_btree_iter_set_pos(trans, iter, pos);
 			else
 				iter->pos = pos;
 		}
 
 		if (unlikely(bkey_err(k)))
-			goto out_no_locked;
+			goto out;
 
 		next = k.k ? bkey_start_pos(k.k) : POS_MAX;
@@ -2712,42 +2924,53 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 		}
 	}
out:
-	btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
-out_no_locked:
 	bch2_btree_iter_verify_entry_exit(iter);
-	bch2_btree_iter_verify(iter);
-	ret = bch2_btree_iter_verify_ret(iter, k);
+	bch2_btree_iter_verify(trans, iter);
+	ret = bch2_btree_iter_verify_ret(trans, iter, k);
 	if (unlikely(ret))
-		return bkey_s_c_err(ret);
+		k = bkey_s_c_err(ret);
out2:
+	if (trace_btree_iter_peek_slot_enabled()) {
+		CLASS(printbuf, buf)();
+
+		int ret = bkey_err(k);
+		if (ret)
+			prt_str(&buf, bch2_err_str(ret));
+		else if (k.k)
+			bch2_bkey_val_to_text(&buf, trans->c, k);
+		else
+			prt_str(&buf, "(null)");
+		trace_btree_iter_peek_slot(trans->c, buf.buf);
+	}
 
 	return k;
 }
 
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *trans, struct btree_iter *iter)
 {
-	if (!bch2_btree_iter_advance(iter))
+	if (!bch2_btree_iter_advance(trans, iter))
 		return bkey_s_c_null;
 
-	return bch2_btree_iter_peek_slot(iter);
+	return bch2_btree_iter_peek_slot(trans, iter);
 }
 
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *trans, struct btree_iter *iter)
 {
-	if (!bch2_btree_iter_rewind(iter))
+	if (!bch2_btree_iter_rewind(trans, iter))
 		return bkey_s_c_null;
 
-	return bch2_btree_iter_peek_slot(iter);
+	return bch2_btree_iter_peek_slot(trans, iter);
 }
 
 /* Obsolete, but still used by rust wrapper in -tools */
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *trans, struct btree_iter *iter)
 {
 	struct bkey_s_c k;
 
-	while (btree_trans_too_many_iters(iter->trans) ||
-	       (k = bch2_btree_iter_peek_type(iter, iter->flags),
+	while (btree_trans_too_many_iters(trans) ||
+	       (k = bch2_btree_iter_peek_type(trans, iter, iter->flags),
		bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
-		bch2_trans_begin(iter->trans);
+		bch2_trans_begin(trans);
return k; } @@ -2780,7 +3003,7 @@ static void btree_trans_verify_sorted(struct btree_trans *trans) struct btree_path *path, *prev = NULL; struct trans_for_each_path_inorder_iter iter; - if (!bch2_debug_check_iterators) + if (!static_branch_unlikely(&bch2_debug_check_iterators)) return; trans_for_each_path_inorder(trans, path, iter) { @@ -2882,7 +3105,7 @@ static inline void btree_path_list_add(struct btree_trans *trans, void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) { if (iter->update_path) - bch2_path_put_nokeep(trans, iter->update_path, + bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent); if (iter->path) bch2_path_put(trans, iter->path, @@ -2893,7 +3116,6 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) iter->path = 0; iter->update_path = 0; iter->key_cache_path = 0; - iter->trans = NULL; } void bch2_trans_iter_init_outlined(struct btree_trans *trans, @@ -2902,7 +3124,7 @@ void bch2_trans_iter_init_outlined(struct btree_trans *trans, unsigned flags) { bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, - bch2_btree_iter_flags(trans, btree_id, flags), + bch2_btree_iter_flags(trans, btree_id, 0, flags), _RET_IP_); } @@ -2918,8 +3140,11 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, flags |= BTREE_ITER_snapshot_field; flags |= BTREE_ITER_all_snapshots; + if (!depth && btree_id_cached(trans->c, btree_id)) + flags |= BTREE_ITER_with_key_cache; + bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, - __bch2_btree_iter_flags(trans, btree_id, flags), + bch2_btree_iter_flags(trans, btree_id, depth, flags), _RET_IP_); iter->min_depth = depth; @@ -2930,10 +3155,9 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, BUG_ON(iter->min_depth != depth); } -void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) +void bch2_trans_copy_iter(struct btree_trans *trans, + struct btree_iter *dst, struct btree_iter *src) { - struct btree_trans *trans = src->trans; - *dst = *src; #ifdef TRACK_PATH_ALLOCATED dst->ip_allocated = _RET_IP_; @@ -2945,7 +3169,19 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) dst->key_cache_path = 0; } -void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE +void bch2_trans_kmalloc_trace_to_text(struct printbuf *out, + darray_trans_kmalloc_trace *trace) +{ + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 60); + + darray_for_each(*trace, i) + prt_printf(out, "%pS\t%zu\n", (void *) i->ip, i->bytes); +} +#endif + +void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long ip) { struct bch_fs *c = trans->c; unsigned new_top = trans->mem_top + size; @@ -2955,51 +3191,66 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) void *new_mem; void *p; - WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) { +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "bump allocator exceeded BTREE_TRANS_MEM_MAX (%u)\n", + BTREE_TRANS_MEM_MAX); - struct btree_transaction_stats *s = btree_trans_stats(trans); - s->max_mem = max(s->max_mem, new_bytes); + bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace); + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); +#endif + } - if (trans->used_mempool) { - if (trans->mem_bytes >= new_bytes) - goto out_change_top; + ret = 
trans_maybe_inject_restart(trans, _RET_IP_); + if (ret) + return ERR_PTR(ret); - /* No more space from mempool item, need malloc new one */ - new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN); - if (unlikely(!new_mem)) { - bch2_trans_unlock(trans); + struct btree_transaction_stats *s = btree_trans_stats(trans); + if (new_bytes > s->max_mem) { + mutex_lock(&s->lock); +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr); + s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size, + trans->trans_kmalloc_trace.nr); + + memcpy(s->trans_kmalloc_trace.data, + trans->trans_kmalloc_trace.data, + sizeof(s->trans_kmalloc_trace.data[0]) * + s->trans_kmalloc_trace.nr); +#endif + s->max_mem = new_bytes; + mutex_unlock(&s->lock); + } - new_mem = kmalloc(new_bytes, GFP_KERNEL); - if (!new_mem) - return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); + if (trans->used_mempool || new_bytes > BTREE_TRANS_MEM_MAX) { + EBUG_ON(trans->mem_bytes >= new_bytes); + return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); + } - ret = bch2_trans_relock(trans); - if (ret) { - kfree(new_mem); - return ERR_PTR(ret); - } - } - memcpy(new_mem, trans->mem, trans->mem_top); - trans->used_mempool = false; - mempool_free(trans->mem, &c->btree_trans_mem_pool); - goto out_new_mem; + if (old_bytes) { + trans->realloc_bytes_required = new_bytes; + trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); + return ERR_PTR(btree_trans_restart_ip(trans, + BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); } - new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); + EBUG_ON(trans->mem); + + new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN); if (unlikely(!new_mem)) { bch2_trans_unlock(trans); - new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); + new_mem = kmalloc(new_bytes, GFP_KERNEL); if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); new_bytes = BTREE_TRANS_MEM_MAX; - memcpy(new_mem, trans->mem, trans->mem_top); trans->used_mempool = true; - kfree(trans->mem); } - if (!new_mem) - return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); + EBUG_ON(!new_mem); trans->mem = new_mem; trans->mem_bytes = new_bytes; @@ -3008,15 +3259,10 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) if (ret) return ERR_PTR(ret); } -out_new_mem: + trans->mem = new_mem; trans->mem_bytes = new_bytes; - if (old_bytes) { - trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); - return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); - } -out_change_top: p = trans->mem + trans->mem_top; trans->mem_top += size; memset(p, 0, size); @@ -3076,7 +3322,27 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->restart_count++; trans->mem_top = 0; - trans->journal_entries = NULL; + + if (trans->restarted == BCH_ERR_transaction_restart_mem_realloced) { + EBUG_ON(!trans->mem || !trans->mem_bytes); + unsigned new_bytes = trans->realloc_bytes_required; + void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); + if (unlikely(!new_mem)) { + bch2_trans_unlock(trans); + new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); + + EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX); + + if (!new_mem) { + new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); + new_bytes = BTREE_TRANS_MEM_MAX; + trans->used_mempool = true; + kfree(trans->mem); + } + } + trans->mem = new_mem; + trans->mem_bytes = new_bytes; + } 
trans_for_each_path(trans, path, i) { path->should_be_locked = false; @@ -3122,14 +3388,26 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->last_begin_ip = _RET_IP_; - trans_set_locked(trans); +#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS + if (trans->restarted) { + trans->restart_count_this_trans++; + } else { + trans->restart_count_this_trans = 0; + } +#endif + +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + trans->trans_kmalloc_trace.nr = 0; +#endif + + trans_set_locked(trans, false); if (trans->restarted) { bch2_btree_path_traverse_all(trans); trans->notrace_relock_fail = false; } - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); return trans->restart_count; } @@ -3222,46 +3500,64 @@ got_trans: } trans->nr_paths_max = s->nr_max_paths; - trans->journal_entries_size = s->journal_entries_size; } trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); trans->srcu_lock_time = jiffies; trans->srcu_held = true; - trans_set_locked(trans); + trans_set_locked(trans, false); closure_init_stack_release(&trans->ref); return trans; } -static void check_btree_paths_leaked(struct btree_trans *trans) -{ #ifdef CONFIG_BCACHEFS_DEBUG - struct bch_fs *c = trans->c; + +static bool btree_paths_leaked(struct btree_trans *trans) +{ struct btree_path *path; unsigned i; trans_for_each_path(trans, path, i) if (path->ref) - goto leaked; - return; -leaked: - bch_err(c, "btree paths leaked from %s!", trans->fn); - trans_for_each_path(trans, path, i) - if (path->ref) - printk(KERN_ERR " btree %s %pS\n", - bch2_btree_id_str(path->btree_id), - (void *) path->ip_allocated); - /* Be noisy about this: */ - bch2_fatal_error(c); -#endif + return true; + return false; } +static void check_btree_paths_leaked(struct btree_trans *trans) +{ + if (btree_paths_leaked(trans)) { + struct bch_fs *c = trans->c; + struct btree_path *path; + unsigned i; + + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "btree paths leaked from %s!\n", trans->fn); + trans_for_each_path(trans, path, i) + if (path->ref) + prt_printf(&buf, "btree %s %pS\n", + bch2_btree_id_str(path->btree_id), + (void *) path->ip_allocated); + + bch2_fs_emergency_read_only2(c, &buf); + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + } +} +#else +static inline void check_btree_paths_leaked(struct btree_trans *trans) {} +#endif + void bch2_trans_put(struct btree_trans *trans) __releases(&c->btree_trans_barrier) { struct bch_fs *c = trans->c; + if (trans->restarted) + bch2_trans_in_restart_error(trans); + bch2_trans_unlock(trans); trans_for_each_update(trans, i) @@ -3285,6 +3581,13 @@ void bch2_trans_put(struct btree_trans *trans) closure_return_sync(&trans->ref); trans->locking_wait.task = NULL; +#ifdef CONFIG_BCACHEFS_DEBUG + darray_exit(&trans->last_restarted_trace); +#endif +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_exit(&trans->trans_kmalloc_trace); +#endif + unsigned long *paths_allocated = trans->paths_allocated; trans->paths_allocated = NULL; trans->paths = NULL; @@ -3330,16 +3633,16 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, struct btree_bkey_cached_common *b) { struct six_lock_count c = six_lock_counts(&b->lock); - struct task_struct *owner; pid_t pid; - rcu_read_lock(); - owner = READ_ONCE(b->lock.owner); - pid = owner ? owner->pid : 0; - rcu_read_unlock(); + scoped_guard(rcu) { + struct task_struct *owner = READ_ONCE(b->lock.owner); + pid = owner ? owner->pid : 0; + } - prt_printf(out, "\t%px %c l=%u %s:", b, b->cached ? 
'c' : 'b', - b->level, bch2_btree_id_str(b->btree_id)); + prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b'); + bch2_btree_id_to_text(out, b->btree_id); + prt_printf(out, " l=%u:", b->level); bch2_bpos_to_text(out, btree_node_pos(b)); prt_printf(out, "\t locks %u:%u:%u held by pid %u", @@ -3364,7 +3667,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn); /* trans->paths is rcu protected vs. freeing */ - rcu_read_lock(); + guard(rcu)(); out->atomic++; struct btree_path *paths = rcu_dereference(trans->paths); @@ -3378,11 +3681,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) if (!path->nodes_locked) continue; - prt_printf(out, " path %u %c l=%u %s:", - idx, - path->cached ? 'c' : 'b', - path->level, - bch2_btree_id_str(path->btree_id)); + prt_printf(out, " path %u %c ", + idx, + path->cached ? 'c' : 'b'); + bch2_btree_id_to_text(out, path->btree_id); + prt_printf(out, " l=%u:", path->level); bch2_bpos_to_text(out, path->pos); prt_newline(out); @@ -3407,7 +3710,6 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) } out: --out->atomic; - rcu_read_unlock(); } void bch2_fs_btree_iter_exit(struct bch_fs *c) @@ -3437,6 +3739,9 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_exit(&s->trans_kmalloc_trace); +#endif kfree(s->max_paths_text); bch2_time_stats_exit(&s->lock_hold_times); } @@ -3488,7 +3793,7 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) #ifdef CONFIG_LOCKDEP fs_reclaim_acquire(GFP_KERNEL); struct btree_trans *trans = bch2_trans_get(c); - trans_set_locked(trans); + trans_set_locked(trans, false); bch2_trans_put(trans); fs_reclaim_release(GFP_KERNEL); #endif |
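
The hunks above complete an API change made throughout this patch: a btree_iter no longer carries a trans pointer (note the dropped `iter->trans = NULL` in bch2_trans_iter_exit()), so every iterator call now takes the btree_trans explicitly. A minimal sketch of a caller under the new signatures — hypothetical function, with placeholder btree ID and position, not part of this patch:

	/* Illustrative only: thread the trans through each iterator call. */
	int example_lookup(struct bch_fs *c)
	{
		struct btree_trans *trans = bch2_trans_get(c);
		struct btree_iter iter;
		struct bkey_s_c k;
		int ret;

		bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS(0, 4096), 0);
		do {
			bch2_trans_begin(trans);
			k = bch2_btree_iter_peek_slot(trans, &iter);
			ret = bkey_err(k);
		} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));

		bch2_trans_iter_exit(trans, &iter);
		bch2_trans_put(trans);
		return ret;
	}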
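
__bch2_trans_kmalloc() also changes behavior: growing the bump allocator while it holds live allocations no longer krealloc()s in place. Instead the required size is saved in realloc_bytes_required and the transaction restarts with BCH_ERR_transaction_restart_mem_realloced; bch2_trans_begin() performs the deferred krealloc (the new block at its top). Allocations therefore must sit inside the usual restart loop and be redone after a restart. A sketch under those assumptions — the update-building body is a placeholder:

	int ret = 0;

	do {
		bch2_trans_begin(trans);

		/* may now return -BCH_ERR_transaction_restart_mem_realloced */
		struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update));
		ret = PTR_ERR_OR_ZERO(update);
		if (ret)
			continue;	/* loop condition separates restarts from real errors */

		/* ... initialize *update and queue it with bch2_trans_update() ... */
	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));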
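
Finally, the open-coded rcu_read_lock()/rcu_read_unlock() pairs become the kernel's scope-based guards from <linux/cleanup.h>: scoped_guard(rcu) bounds the read-side critical section to an explicit block, while guard(rcu)() holds it until the enclosing scope ends — which is why the rcu_read_unlock() on bch2_btree_trans_to_text()'s out: path can simply be deleted. A sketch of the idiom, as a hypothetical helper mirroring the hunk above:

	pid_t example_lock_owner_pid(struct btree_bkey_cached_common *b)
	{
		pid_t pid;

		/* read lock is dropped automatically at the closing brace */
		scoped_guard(rcu) {
			struct task_struct *owner = READ_ONCE(b->lock.owner);
			pid = owner ? owner->pid : 0;
		}
		return pid;
	}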