Diffstat (limited to 'fs/bcachefs/recovery.c')
-rw-r--r--	fs/bcachefs/recovery.c	380
1 file changed, 271 insertions(+), 109 deletions(-)
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 3c7f941dde39..c94debb12d2f 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -13,12 +13,13 @@
#include "disk_accounting.h"
#include "errcode.h"
#include "error.h"
-#include "fs-common.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "logged_ops.h"
#include "move.h"
+#include "movinggc.h"
+#include "namei.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
@@ -32,31 +33,96 @@
#include <linux/sort.h>
#include <linux/stat.h>
-#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-
-void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
+int bch2_btree_lost_data(struct bch_fs *c,
+ struct printbuf *msg,
+ enum btree_id btree)
{
- if (btree >= BTREE_ID_NR_MAX)
- return;
-
u64 b = BIT_ULL(btree);
+ int ret = 0;
+
+ mutex_lock(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
if (!(c->sb.btrees_lost_data & b)) {
- bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree));
+ prt_printf(msg, "flagging btree ");
+ bch2_btree_id_to_text(msg, btree);
+ prt_printf(msg, " lost data\n");
- mutex_lock(&c->sb_lock);
- bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
+ ext->btrees_lost_data |= cpu_to_le64(b);
+ }
+
+ /* Once we have runtime self healing for topology errors we won't need this: */
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
+
+ /* Btree node accounting will be off: */
+ __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret;
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+ /*
+ * These are much more minor, and don't need to be corrected right away,
+ * but in debug mode we want the next fsck run to be clean:
+ */
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0) ?: ret;
+#endif
+
+ switch (btree) {
+ case BTREE_ID_alloc:
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
+
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
+ goto out;
+ case BTREE_ID_backpointers:
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0) ?: ret;
+ goto out;
+ case BTREE_ID_need_discard:
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
+ goto out;
+ case BTREE_ID_freespace:
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
+ goto out;
+ case BTREE_ID_bucket_gens:
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
+ goto out;
+ case BTREE_ID_lru:
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
+ goto out;
+ case BTREE_ID_accounting:
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret;
+ goto out;
+ case BTREE_ID_snapshots:
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
+ goto out;
+ default:
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
+ goto out;
}
+out:
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ return ret;
}
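
The rewritten bch2_btree_lost_data() schedules every relevant recovery pass and still returns an error if any of the requests failed, accumulating the result with the GNU "a ?: b" extension. A minimal userspace sketch of that idiom, with hypothetical pass functions standing in for the bcachefs helpers:

/* Sketch of the "ret = f() ?: ret" accumulation idiom used above.
 * Requires the GNU "a ?: b" extension (GCC/Clang); the pass functions
 * are made-up stand-ins, not the bcachefs recovery-pass helpers.
 */
#include <stdio.h>

static int pass_a(void) { return 0; }	/* succeeds */
static int pass_b(void) { return -22; }	/* fails, e.g. -EINVAL */
static int pass_c(void) { return 0; }	/* succeeds */

int main(void)
{
	int ret = 0;

	/* every pass is requested; ret ends up nonzero if any call failed */
	ret = pass_a() ?: ret;
	ret = pass_b() ?: ret;
	ret = pass_c() ?: ret;	/* later success does not clobber the error */

	printf("ret = %d\n", ret);	/* prints -22 */
	return 0;
}
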
-/* for -o reconstruct_alloc: */
-static void bch2_reconstruct_alloc(struct bch_fs *c)
+static void kill_btree(struct bch_fs *c, enum btree_id btree)
{
- bch2_journal_log_msg(c, "dropping alloc info");
- bch_info(c, "dropping and reconstructing all alloc info");
+ bch2_btree_id_root(c, btree)->alive = false;
+ bch2_shoot_down_journal_keys(c, btree, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+}
+/* for -o reconstruct_alloc: */
+void bch2_reconstruct_alloc(struct bch_fs *c)
+{
mutex_lock(&c->sb_lock);
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
@@ -79,6 +145,8 @@ static void bch2_reconstruct_alloc(struct bch_fs *c)
__set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent);
+ __set_bit_le64(BCH_FSCK_ERR_alloc_key_to_missing_lru_entry, ext->errors_silent);
+
__set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
@@ -96,19 +164,14 @@ static void bch2_reconstruct_alloc(struct bch_fs *c)
c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info));
+
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
- bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers,
- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
- bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard,
- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
- bch2_shoot_down_journal_keys(c, BTREE_ID_freespace,
- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
- bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens,
- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ for (unsigned i = 0; i < btree_id_nr_alive(c); i++)
+ if (btree_id_is_alloc(i))
+ kill_btree(c, i);
}
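
ext->errors_silent, set via __set_bit_le64() above, is an on-disk bitmap of fsck error codes that the next fsck should fix without reporting, which avoids a flood of expected complaints once the alloc btrees are dropped and rebuilt. A rough userspace sketch of setting a bit in an array of little-endian 64-bit words; this only illustrates the word/bit layout, it is not the kernel's __set_bit_le64():

/* Illustrative only: set bit 'nr' in an array of little-endian 64-bit
 * words, the general shape of an on-disk bitmap such as errors_silent.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t cpu_to_le64_sketch(uint64_t v)
{
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	return __builtin_bswap64(v);
#else
	return v;
#endif
}

static void set_bit_le64_sketch(unsigned nr, uint64_t *map)
{
	map[nr / 64] |= cpu_to_le64_sketch(1ULL << (nr % 64));
}

int main(void)
{
	uint64_t errors_silent[4] = { 0 };

	set_bit_le64_sketch(70, errors_silent);	/* word 1, bit 6 */
	printf("word 1 = 0x%016llx\n", (unsigned long long)errors_silent[1]);
	return 0;
}
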
/*
@@ -142,7 +205,7 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, k->level,
BTREE_ITER_intent);
- int ret = bch2_btree_iter_traverse(&iter);
+ int ret = bch2_btree_iter_traverse(trans, &iter);
if (ret)
goto out;
@@ -205,16 +268,38 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, k->level,
iter_flags);
- ret = bch2_btree_iter_traverse(&iter);
+ ret = bch2_btree_iter_traverse(trans, &iter);
if (ret)
goto out;
struct btree_path *path = btree_iter_path(trans, &iter);
if (unlikely(!btree_path_node(path, k->level))) {
+ struct bch_fs *c = trans->c;
+
+ CLASS(printbuf, buf)();
+ prt_str(&buf, "btree=");
+ bch2_btree_id_to_text(&buf, k->btree_id);
+ prt_printf(&buf, " level=%u ", k->level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k));
+
+ if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)|
+ BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) {
+ bch_err(c, "have key in journal replay for btree depth that does not exist, confused\n%s",
+ buf.buf);
+ ret = -EINVAL;
+ }
+
+ if (!k->allocated) {
+ bch_notice(c, "dropping key in journal replay for depth that does not exist because we're recovering from scan\n%s",
+ buf.buf);
+ k->overwritten = true;
+ goto out;
+ }
+
bch2_trans_iter_exit(trans, &iter);
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, 0, iter_flags);
- ret = bch2_btree_iter_traverse(&iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_btree_increase_depth(trans, iter.path, 0) ?:
-BCH_ERR_transaction_restart_nested;
goto out;
@@ -225,7 +310,12 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
goto out;
if (k->k->k.type == KEY_TYPE_accounting) {
- ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k);
+ struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, k->k->k.u64s);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto out;
+
+ bkey_copy(n, k->k);
goto out;
}
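
CLASS(printbuf, buf)() above relies on the kernel's scope-based cleanup support, so the buffer is released automatically on every exit from the block rather than needing an explicit printbuf_exit(). A small standalone sketch of the underlying compiler mechanism, using a hypothetical strbuf type instead of the real printbuf:

/* Sketch of the mechanism behind CLASS()/scope-based cleanup: a variable
 * declared with __attribute__((cleanup)) has its release function run
 * automatically when the enclosing scope exits (GCC/Clang extension).
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct strbuf { char *buf; };

static void strbuf_release(struct strbuf *s)
{
	free(s->buf);
	printf("released\n");
}

int main(void)
{
	{
		__attribute__((cleanup(strbuf_release))) struct strbuf s = {
			.buf = strdup("btree=extents level=1"),
		};
		printf("%s\n", s.buf);
		/* no explicit free: strbuf_release() runs when the scope ends */
	}
	return 0;
}
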
@@ -333,9 +423,9 @@ int bch2_journal_replay(struct bch_fs *c)
* Now, replay any remaining keys in the order in which they appear in
* the journal, unpinning those journal entries as we go:
*/
- sort(keys_sorted.data, keys_sorted.nr,
- sizeof(keys_sorted.data[0]),
- journal_sort_seq_cmp, NULL);
+ sort_nonatomic(keys_sorted.data, keys_sorted.nr,
+ sizeof(keys_sorted.data[0]),
+ journal_sort_seq_cmp, NULL);
darray_for_each(keys_sorted, kp) {
cond_resched();
@@ -354,10 +444,13 @@ int bch2_journal_replay(struct bch_fs *c)
? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
: 0),
bch2_journal_replay_key(trans, k));
- bch_err_msg(c, ret, "while replaying key at btree %s level %u:",
- bch2_btree_id_str(k->btree_id), k->level);
- if (ret)
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+ bch2_btree_id_level_to_text(&buf, k->btree_id, k->level);
+ bch_err_msg(c, ret, "while replaying key at %s:", buf.buf);
+ printbuf_exit(&buf);
goto err;
+ }
BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten);
}
@@ -370,7 +463,7 @@ int bch2_journal_replay(struct bch_fs *c)
trans = NULL;
if (!c->opts.retain_recovery_info &&
- c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay)
+ c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay)
bch2_journal_keys_put_initial(c);
replay_now_at(j, j->replay_journal_seq_end);
@@ -403,7 +496,9 @@ static int journal_replay_entry_early(struct bch_fs *c,
switch (entry->type) {
case BCH_JSET_ENTRY_btree_root: {
- struct btree_root *r;
+
+ if (unlikely(!entry->u64s))
+ return 0;
if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX,
c, invalid_btree_id,
@@ -417,15 +512,11 @@ static int journal_replay_entry_early(struct bch_fs *c,
return ret;
}
- r = bch2_btree_id_root(c, entry->btree_id);
+ struct btree_root *r = bch2_btree_id_root(c, entry->btree_id);
- if (entry->u64s) {
- r->level = entry->level;
- bkey_copy(&r->key, (struct bkey_i *) entry->start);
- r->error = 0;
- } else {
- r->error = -BCH_ERR_btree_node_read_error;
- }
+ r->level = entry->level;
+ bkey_copy(&r->key, (struct bkey_i *) entry->start);
+ r->error = 0;
r->alive = true;
break;
}
@@ -505,6 +596,7 @@ static int journal_replay_early(struct bch_fs *c,
static int read_btree_roots(struct bch_fs *c)
{
+ struct printbuf buf = PRINTBUF;
int ret = 0;
for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
@@ -513,33 +605,20 @@ static int read_btree_roots(struct bch_fs *c)
if (!r->alive)
continue;
- if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc)
- continue;
+ printbuf_reset(&buf);
+ bch2_btree_id_level_to_text(&buf, i, r->level);
if (mustfix_fsck_err_on((ret = r->error),
c, btree_root_bkey_invalid,
"invalid btree root %s",
- bch2_btree_id_str(i)) ||
+ buf.buf) ||
mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)),
c, btree_root_read_error,
- "error reading btree root %s l=%u: %s",
- bch2_btree_id_str(i), r->level, bch2_err_str(ret))) {
- if (btree_id_is_alloc(i)) {
- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+ "error reading btree root %s: %s",
+ buf.buf, bch2_err_str(ret))) {
+ if (btree_id_is_alloc(i))
r->error = 0;
- } else if (!(c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
- bch_info(c, "will run btree node scan");
- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
- }
-
ret = 0;
- bch2_btree_lost_data(c, i);
}
}
@@ -553,6 +632,7 @@ static int read_btree_roots(struct bch_fs *c)
}
}
fsck_err:
+ printbuf_exit(&buf);
return ret;
}
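
read_btree_roots() now builds its messages with a single printbuf that is reset at the top of each iteration (printbuf_reset()) and freed exactly once at the shared fsck_err exit (printbuf_exit()). A trivial userspace analog of that reuse-then-single-cleanup shape, with snprintf standing in for the printbuf helpers:

/* Userspace analog of the printbuf reuse in read_btree_roots(): one buffer,
 * refilled per iteration, freed once on the shared exit path.
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	char *buf = malloc(128);
	int ret = 0;

	if (!buf)
		return 1;

	for (unsigned i = 0; i < 4; i++) {
		snprintf(buf, 128, "btree id %u level %u", i, 0u); /* "reset" + print */
		printf("checking root: %s\n", buf);
		if (i == 2) {		/* pretend a root failed to read */
			ret = -5;	/* e.g. -EIO */
			goto out;
		}
	}
out:
	free(buf);			/* single cleanup, like printbuf_exit() */
	return ret;
}
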
@@ -563,6 +643,7 @@ static bool check_version_upgrade(struct bch_fs *c)
bch2_latest_compatible_version(c->sb.version));
unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
unsigned new_version = 0;
+ bool ret = false;
if (old_version < bcachefs_metadata_required_upgrade_below) {
if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible ||
@@ -617,15 +698,33 @@ static bool check_version_upgrade(struct bch_fs *c)
bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
}
- bch_info(c, "%s", buf.buf);
+ bch_notice(c, "%s", buf.buf);
+ printbuf_exit(&buf);
- bch2_sb_upgrade(c, new_version);
+ ret = true;
+ }
+
+ if (new_version > c->sb.version_incompat_allowed &&
+ c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) {
+ struct printbuf buf = PRINTBUF;
+ prt_str(&buf, "Now allowing incompatible features up to ");
+ bch2_version_to_text(&buf, new_version);
+ prt_str(&buf, ", previously allowed up to ");
+ bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
+ prt_newline(&buf);
+
+ bch_notice(c, "%s", buf.buf);
printbuf_exit(&buf);
- return true;
+
+ ret = true;
}
- return false;
+ if (ret)
+ bch2_sb_upgrade(c, new_version,
+ c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible);
+
+ return ret;
}
int bch2_fs_recovery(struct bch_fs *c)
@@ -660,8 +759,30 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
- if (c->opts.norecovery)
- c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1;
+ if (c->opts.norecovery) {
+ c->opts.recovery_pass_last = c->opts.recovery_pass_last
+ ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read)
+ : BCH_RECOVERY_PASS_snapshots_read;
+ c->opts.nochanges = true;
+ }
+
+ if (c->opts.nochanges)
+ c->opts.read_only = true;
+
+ if (c->opts.journal_rewind) {
+ bch_info(c, "rewinding journal, fsck required");
+ c->opts.fsck = true;
+ }
+
+ if (go_rw_in_recovery(c)) {
+ /*
+ * start workqueues/kworkers early - kthread creation checks for
+ * pending signals, which is _very_ annoying
+ */
+ ret = bch2_fs_init_rw(c);
+ if (ret)
+ goto err;
+ }
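
The option handling added here forms a cascade: norecovery clamps recovery_pass_last to the snapshots_read pass and implies nochanges, nochanges implies read_only, and journal_rewind forces a full fsck. A compact sketch of that normalization as a pure function; the struct, enum values, and names are illustrative stand-ins, not the bcachefs options code:

/* Illustrative sketch of the mount-option cascade; values are made up. */
#include <stdbool.h>
#include <stdio.h>

enum pass { PASS_UNSET = 0, PASS_SNAPSHOTS_READ = 7 };

struct opts {
	bool norecovery, nochanges, read_only, journal_rewind, fsck;
	int recovery_pass_last;		/* 0 == run every pass */
};

static void normalize(struct opts *o)
{
	if (o->norecovery) {
		o->recovery_pass_last = o->recovery_pass_last
			? (o->recovery_pass_last < PASS_SNAPSHOTS_READ
			   ? o->recovery_pass_last : PASS_SNAPSHOTS_READ)
			: PASS_SNAPSHOTS_READ;
		o->nochanges = true;
	}
	if (o->nochanges)
		o->read_only = true;
	if (o->journal_rewind)
		o->fsck = true;
}

int main(void)
{
	struct opts o = { .norecovery = true };

	normalize(&o);
	printf("last pass %d nochanges %d read_only %d\n",
	       o.recovery_pass_last, o.nochanges, o.read_only);
	return 0;
}
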
mutex_lock(&c->sb_lock);
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
@@ -708,17 +829,20 @@ int bch2_fs_recovery(struct bch_fs *c)
c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) {
+ SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe);
+ write_sb = true;
+ }
+
if (write_sb)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
-
- if (c->opts.fsck)
- set_bit(BCH_FS_fsck_running, &c->flags);
if (c->sb.clean)
set_bit(BCH_FS_clean_recovery, &c->flags);
+ if (c->opts.fsck)
+ set_bit(BCH_FS_in_fsck, &c->flags);
+ set_bit(BCH_FS_in_recovery, &c->flags);
ret = bch2_blacklist_table_initialize(c);
if (ret) {
@@ -797,7 +921,7 @@ int bch2_fs_recovery(struct bch_fs *c)
use_clean:
if (!clean) {
bch_err(c, "no superblock clean section found");
- ret = -BCH_ERR_fsck_repair_impossible;
+ ret = bch_err_throw(c, fsck_repair_impossible);
goto err;
}
@@ -807,15 +931,44 @@ use_clean:
c->journal_replay_seq_start = last_seq;
c->journal_replay_seq_end = blacklist_seq - 1;
- if (c->opts.reconstruct_alloc)
- bch2_reconstruct_alloc(c);
-
zero_out_btree_mem_ptr(&c->journal_keys);
ret = journal_replay_early(c, clean);
if (ret)
goto err;
+ ret = bch2_fs_resize_on_mount(c);
+ if (ret) {
+ up_write(&c->state_lock);
+ goto err;
+ }
+
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
+ bch_info(c, "filesystem is an unresized image file, mounting ro");
+ c->opts.read_only = true;
+ }
+
+ if (!c->opts.read_only &&
+ (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) {
+ bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate");
+
+ bch2_reconstruct_alloc(c);
+ } else if (c->opts.reconstruct_alloc) {
+ bch2_journal_log_msg(c, "dropping alloc info");
+ bch_info(c, "dropping and reconstructing all alloc info");
+
+ bch2_reconstruct_alloc(c);
+ }
+
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) {
+ /* We can't go RW to fix errors without alloc info */
+ if (c->opts.fix_errors == FSCK_FIX_yes ||
+ c->opts.fix_errors == FSCK_FIX_ask)
+ c->opts.fix_errors = FSCK_FIX_no;
+ if (c->opts.errors == BCH_ON_ERROR_fix_safe)
+ c->opts.errors = BCH_ON_ERROR_continue;
+ }
+
/*
* After an unclean shutdown, skip the next few journal sequence
* numbers as they may have been referenced by btree writes that
@@ -824,7 +977,7 @@ use_clean:
* journal sequence numbers:
*/
if (!c->sb.clean)
- journal_seq += 8;
+ journal_seq += JOURNAL_BUF_NR * 4;
if (blacklist_seq != journal_seq) {
ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
@@ -839,7 +992,7 @@ use_clean:
ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu",
journal_seq, last_seq, blacklist_seq - 1) ?:
- bch2_fs_journal_start(&c->journal, journal_seq);
+ bch2_fs_journal_start(&c->journal, last_seq, journal_seq);
if (ret)
goto err;
@@ -857,8 +1010,10 @@ use_clean:
set_bit(BCH_FS_btree_running, &c->flags);
ret = bch2_sb_set_upgrade_extra(c);
+ if (ret)
+ goto err;
- ret = bch2_run_recovery_passes(c);
+ ret = bch2_run_recovery_passes(c, 0);
if (ret)
goto err;
@@ -869,17 +1024,17 @@ use_clean:
* multithreaded use:
*/
set_bit(BCH_FS_may_go_rw, &c->flags);
- clear_bit(BCH_FS_fsck_running, &c->flags);
+ clear_bit(BCH_FS_in_fsck, &c->flags);
/* in case we don't run journal replay, i.e. norecovery mode */
set_bit(BCH_FS_accounting_replay_done, &c->flags);
+ bch2_async_btree_node_rewrites_flush(c);
+
/* fsync if we fixed errors */
- if (test_bit(BCH_FS_errors_fixed, &c->flags) &&
- bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) {
+ if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
bch2_journal_flush_all_pins(&c->journal);
bch2_journal_meta(&c->journal);
- bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
}
/* If we fixed errors, verify that fs is actually clean now: */
@@ -892,9 +1047,8 @@ use_clean:
bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
clear_bit(BCH_FS_errors_fixed, &c->flags);
- c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
-
- ret = bch2_run_recovery_passes(c);
+ ret = bch2_run_recovery_passes(c,
+ BCH_RECOVERY_PASS_check_alloc_info);
if (ret)
goto err;
@@ -938,7 +1092,7 @@ use_clean:
if (c->opts.fsck &&
!test_bit(BCH_FS_error, &c->flags) &&
- c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 &&
+ c->recovery.pass_done == BCH_RECOVERY_PASS_NR - 1 &&
ext->btrees_lost_data) {
ext->btrees_lost_data = 0;
write_sb = true;
@@ -981,13 +1135,6 @@ use_clean:
out:
bch2_flush_fsck_errs(c);
- if (!c->opts.retain_recovery_info) {
- bch2_journal_keys_put_initial(c);
- bch2_find_btree_nodes_exit(&c->found_btree_nodes);
- }
- if (!IS_ERR(clean))
- kfree(clean);
-
if (!ret &&
test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) &&
!c->opts.nochanges) {
@@ -996,11 +1143,23 @@ out:
}
bch_err_fn(c, ret);
+final_out:
+ if (!IS_ERR(clean))
+ kfree(clean);
return ret;
err:
fsck_err:
- bch2_fs_emergency_read_only(c);
- goto out;
+ {
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "error in recovery: %s\n", bch2_err_str(ret));
+ bch2_fs_emergency_read_only2(c, &buf);
+
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ }
+ goto final_out;
}
int bch2_fs_initialize(struct bch_fs *c)
@@ -1021,7 +1180,7 @@ int bch2_fs_initialize(struct bch_fs *c)
bch2_check_version_downgrade(c);
if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) {
- bch2_sb_upgrade(c, bcachefs_metadata_version_current);
+ bch2_sb_upgrade(c, bcachefs_metadata_version_current, false);
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
bch2_write_super(c);
}
@@ -1035,7 +1194,6 @@ int bch2_fs_initialize(struct bch_fs *c)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;
set_bit(BCH_FS_btree_running, &c->flags);
set_bit(BCH_FS_may_go_rw, &c->flags);
@@ -1050,14 +1208,17 @@ int bch2_fs_initialize(struct bch_fs *c)
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
*/
- bch2_fs_journal_start(&c->journal, 1);
- set_bit(BCH_FS_accounting_replay_done, &c->flags);
- bch2_journal_set_replay_done(&c->journal);
+ ret = bch2_fs_journal_start(&c->journal, 1, 1);
+ if (ret)
+ goto err;
ret = bch2_fs_read_write_early(c);
if (ret)
goto err;
+ set_bit(BCH_FS_accounting_replay_done, &c->flags);
+ bch2_journal_set_replay_done(&c->journal);
+
for_each_member_device(c, ca) {
ret = bch2_dev_usage_init(ca, false);
if (ret) {
@@ -1076,9 +1237,6 @@ int bch2_fs_initialize(struct bch_fs *c)
if (ret)
goto err;
- for_each_online_member(c, ca)
- ca->new_fs_bucket_idx = 0;
-
ret = bch2_fs_freespace_init(c);
if (ret)
goto err;
@@ -1117,7 +1275,10 @@ int bch2_fs_initialize(struct bch_fs *c)
if (ret)
goto err;
- c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1;
+ c->recovery.pass_done = BCH_RECOVERY_PASS_NR - 1;
+
+ bch2_copygc_wakeup(c);
+ bch2_rebalance_wakeup(c);
if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
@@ -1137,6 +1298,7 @@ int bch2_fs_initialize(struct bch_fs *c)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
+ c->recovery.curr_pass = BCH_RECOVERY_PASS_NR;
return 0;
err:
bch_err_fn(c, ret);