diff options
author | Kent Overstreet <kent.overstreet@gmail.com> | 2019-04-05 04:53:12 +0300 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@linux.dev> | 2023-10-23 00:08:20 +0300 |
commit | 1dd7f9d98de0740b42f1ac3f0b1d8af9c76801de (patch) | |
tree | bcc22ad8766da57180ccc67812966aab79434512 /fs | |
parent | ece254b258980cfd5a0fa11adce8e178c8d34181 (diff) | |
download | linux-1dd7f9d98de0740b42f1ac3f0b1d8af9c76801de.tar.xz |
bcachefs: Rewrite journal_seq_blacklist machinery
We now store blacklisted journal sequence numbers in the superblock
rather than in the journal. This greatly simplifies the code and, more
importantly, is implemented in a way that doesn't require all btree
nodes to be visited before starting the journal: instead, we
unconditionally blacklist the next 4 journal sequence numbers after an
unclean shutdown.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/bcachefs/bcachefs.h | 16 | ||||
-rw-r--r-- | fs/bcachefs/bcachefs_format.h | 18 | ||||
-rw-r--r-- | fs/bcachefs/btree_io.c | 24 | ||||
-rw-r--r-- | fs/bcachefs/btree_iter.c | 2 | ||||
-rw-r--r-- | fs/bcachefs/inode.h | 2 | ||||
-rw-r--r-- | fs/bcachefs/journal.c | 65 | ||||
-rw-r--r-- | fs/bcachefs/journal.h | 4 | ||||
-rw-r--r-- | fs/bcachefs/journal_io.c | 108 | ||||
-rw-r--r-- | fs/bcachefs/journal_io.h | 1 | ||||
-rw-r--r-- | fs/bcachefs/journal_seq_blacklist.c | 491 | ||||
-rw-r--r-- | fs/bcachefs/journal_seq_blacklist.h | 15 | ||||
-rw-r--r-- | fs/bcachefs/journal_types.h | 22 | ||||
-rw-r--r-- | fs/bcachefs/recovery.c | 154 | ||||
-rw-r--r-- | fs/bcachefs/super-io.c | 1 | ||||
-rw-r--r-- | fs/bcachefs/super.c | 9 |
15 files changed, 460 insertions, 472 deletions
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d8c487e33592..8acdc7ffeca3 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -185,6 +185,7 @@ #include <linux/closure.h> #include <linux/kobject.h> #include <linux/list.h> +#include <linux/math64.h> #include <linux/mutex.h> #include <linux/percpu-refcount.h> #include <linux/percpu-rwsem.h> @@ -486,6 +487,7 @@ enum { BCH_FS_RW, /* shutdown: */ + BCH_FS_STOPPING, BCH_FS_EMERGENCY_RO, BCH_FS_WRITE_DISABLE_COMPLETE, @@ -511,6 +513,15 @@ struct bch_fs_pcpu { u64 sectors_available; }; +struct journal_seq_blacklist_table { + size_t nr; + struct journal_seq_blacklist_table_entry { + u64 start; + u64 end; + bool dirty; + } entries[0]; +}; + struct bch_fs { struct closure cl; @@ -646,6 +657,11 @@ struct bch_fs { struct io_clock io_clock[2]; + /* JOURNAL SEQ BLACKLIST */ + struct journal_seq_blacklist_table * + journal_seq_blacklist_table; + struct work_struct journal_seq_blacklist_gc_work; + /* ALLOCATOR */ spinlock_t freelist_lock; struct closure_waitlist freelist_wait; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 646910a6a4bb..7edc410c5391 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -909,7 +909,8 @@ struct bch_sb_field { x(quota, 4) \ x(disk_groups, 5) \ x(clean, 6) \ - x(replicas, 7) + x(replicas, 7) \ + x(journal_seq_blacklist, 8) enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -1124,6 +1125,20 @@ struct bch_sb_field_clean { }; }; +struct journal_seq_blacklist_entry { + __le64 start; + __le64 end; +}; + +struct bch_sb_field_journal_seq_blacklist { + struct bch_sb_field field; + + union { + struct journal_seq_blacklist_entry start[0]; + __u64 _data[0]; + }; +}; + /* Superblock: */ /* @@ -1279,6 +1294,7 @@ enum bch_sb_features { BCH_FEATURE_ZSTD = 2, BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */ BCH_FEATURE_EC = 4, + BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5, BCH_FEATURE_NR, }; 
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 10b3d53b6ebb..fa261a175f5e 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -770,7 +770,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry struct btree_node *sorted; struct bkey_packed *k; struct bset *i; - bool used_mempool; + bool used_mempool, blacklisted; unsigned u64s; int ret, retry_read = 0, write = READ; @@ -844,20 +844,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry b->written += sectors; - ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b); - if (ret < 0) { - btree_err(BTREE_ERR_FATAL, c, b, i, - "insufficient memory"); - goto err; - } + blacklisted = bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(i->journal_seq), + true); - if (ret) { - btree_err_on(first, - BTREE_ERR_FIXABLE, c, b, i, - "first btree node bset has blacklisted journal seq"); - if (!first) - continue; - } + btree_err_on(blacklisted && first, + BTREE_ERR_FIXABLE, c, b, i, + "first btree node bset has blacklisted journal seq"); + if (blacklisted && !first) + continue; bch2_btree_node_iter_large_push(iter, b, i->start, @@ -930,7 +925,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry out: mempool_free(iter, &c->fill_iter); return retry_read; -err: fsck_err: if (ret == BTREE_RETRY_READ) { retry_read = 1; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 02eb28bfe9b9..6b9af53a3e77 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1156,6 +1156,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth) if (!btree_iter_node(iter, iter->level)) return NULL; + bch2_trans_cond_resched(iter->trans); + btree_iter_up(iter); if (!bch2_btree_node_relock(iter, iter->level)) diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index ada639c06619..af0c355f2f04 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ 
-4,8 +4,6 @@ #include "opts.h" -#include <linux/math64.h> - extern const char * const bch2_inode_opts[]; const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index dbecb4072af0..2e84af8a044c 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -988,27 +988,57 @@ void bch2_fs_journal_stop(struct journal *j) cancel_delayed_work_sync(&j->reclaim_work); } -void bch2_fs_journal_start(struct journal *j) +int bch2_fs_journal_start(struct journal *j, u64 cur_seq, + struct list_head *journal_entries) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_seq_blacklist *bl; - u64 blacklist = 0; + struct journal_entry_pin_list *p; + struct journal_replay *i; + u64 last_seq = cur_seq, nr, seq; + + if (!list_empty(journal_entries)) + last_seq = le64_to_cpu(list_last_entry(journal_entries, + struct journal_replay, + list)->j.last_seq); + + nr = cur_seq - last_seq; + + if (nr + 1 > j->pin.size) { + free_fifo(&j->pin); + init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); + if (!j->pin.data) { + bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); + return -ENOMEM; + } + } + + j->last_seq_ondisk = last_seq; + j->pin.front = last_seq; + j->pin.back = cur_seq; + atomic64_set(&j->seq, cur_seq - 1); + + fifo_for_each_entry_ptr(p, &j->pin, seq) { + INIT_LIST_HEAD(&p->list); + INIT_LIST_HEAD(&p->flushed); + atomic_set(&p->count, 0); + p->devs.nr = 0; + } + + list_for_each_entry(i, journal_entries, list) { + seq = le64_to_cpu(i->j.seq); + + BUG_ON(seq < last_seq || seq >= cur_seq); - list_for_each_entry(bl, &j->seq_blacklist, list) - blacklist = max(blacklist, bl->end); + p = journal_seq_pin(j, seq); + + atomic_set(&p->count, 1); + p->devs = i->devs; + } spin_lock(&j->lock); set_bit(JOURNAL_STARTED, &j->flags); - while (journal_cur_seq(j) < blacklist) - journal_pin_new_entry(j, 0); - - /* - * __journal_entry_close() only inits the next journal entry 
when it - * closes an open journal entry - the very first journal entry gets - * initialized here: - */ journal_pin_new_entry(j, 1); bch2_journal_buf_init(j); @@ -1017,12 +1047,7 @@ void bch2_fs_journal_start(struct journal *j) bch2_journal_space_available(j); spin_unlock(&j->lock); - /* - * Adding entries to the next journal entry before allocating space on - * disk for the next journal entry - this is ok, because these entries - * only have to go down with the next journal entry we write: - */ - bch2_journal_seq_blacklist_write(j); + return 0; } /* init/exit: */ @@ -1090,8 +1115,6 @@ int bch2_fs_journal_init(struct journal *j) INIT_DELAYED_WORK(&j->write_work, journal_write_work); INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); init_waitqueue_head(&j->pin_flush_wait); - mutex_init(&j->blacklist_lock); - INIT_LIST_HEAD(&j->seq_blacklist); mutex_init(&j->reclaim_lock); mutex_init(&j->discard_lock); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 809cf25f5a03..3447b4ad462d 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -472,8 +472,10 @@ int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, int bch2_dev_journal_alloc(struct bch_dev *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); + void bch2_fs_journal_stop(struct journal *); -void bch2_fs_journal_start(struct journal *); +int bch2_fs_journal_start(struct journal *, u64, struct list_head *); + void bch2_dev_journal_exit(struct bch_dev *); int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); void bch2_fs_journal_exit(struct journal *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 1293bb66e62c..8010b38114ac 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -10,7 +10,6 @@ #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" -#include "journal_seq_blacklist.h" #include "replicas.h" #include "trace.h" @@ -655,45 +654,11 @@ void bch2_journal_entries_free(struct 
list_head *list) } } -int bch2_journal_set_seq(struct bch_fs *c, u64 last_seq, u64 end_seq) -{ - struct journal *j = &c->journal; - struct journal_entry_pin_list *p; - u64 seq, nr = end_seq - last_seq + 1; - - if (nr > j->pin.size) { - free_fifo(&j->pin); - init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); - if (!j->pin.data) { - bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); - return -ENOMEM; - } - } - - atomic64_set(&j->seq, end_seq); - j->last_seq_ondisk = last_seq; - - j->pin.front = last_seq; - j->pin.back = end_seq + 1; - - fifo_for_each_entry_ptr(p, &j->pin, seq) { - INIT_LIST_HEAD(&p->list); - INIT_LIST_HEAD(&p->flushed); - atomic_set(&p->count, 0); - p->devs.nr = 0; - } - - return 0; -} - int bch2_journal_read(struct bch_fs *c, struct list_head *list) { - struct journal *j = &c->journal; struct journal_list jlist; struct journal_replay *i; - struct journal_entry_pin_list *p; struct bch_dev *ca; - u64 cur_seq, end_seq; unsigned iter; size_t keys = 0, entries = 0; bool degraded = false; @@ -725,17 +690,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) if (jlist.ret) return jlist.ret; - if (list_empty(list)){ - bch_err(c, "no journal entries found"); - return BCH_FSCK_REPAIR_IMPOSSIBLE; - } - list_for_each_entry(i, list, list) { + struct jset_entry *entry; + struct bkey_i *k, *_n; struct bch_replicas_padded replicas; char buf[80]; - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs); - ret = jset_validate_entries(c, &i->j, READ); if (ret) goto fsck_err; @@ -745,6 +705,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) * the devices - this is wrong: */ + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs); + if (!degraded && (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c, @@ -755,68 +717,18 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) if (ret) return ret; } - } - - i = 
list_last_entry(list, struct journal_replay, list); - - ret = bch2_journal_set_seq(c, - le64_to_cpu(i->j.last_seq), - le64_to_cpu(i->j.seq)); - if (ret) - return ret; - - mutex_lock(&j->blacklist_lock); - - list_for_each_entry(i, list, list) { - p = journal_seq_pin(j, le64_to_cpu(i->j.seq)); - - atomic_set(&p->count, 1); - p->devs = i->devs; - - if (bch2_journal_seq_blacklist_read(j, i)) { - mutex_unlock(&j->blacklist_lock); - return -ENOMEM; - } - } - - mutex_unlock(&j->blacklist_lock); - - cur_seq = journal_last_seq(j); - end_seq = le64_to_cpu(list_last_entry(list, - struct journal_replay, list)->j.seq); - - list_for_each_entry(i, list, list) { - struct jset_entry *entry; - struct bkey_i *k, *_n; - bool blacklisted; - - mutex_lock(&j->blacklist_lock); - while (cur_seq < le64_to_cpu(i->j.seq) && - bch2_journal_seq_blacklist_find(j, cur_seq)) - cur_seq++; - - blacklisted = bch2_journal_seq_blacklist_find(j, - le64_to_cpu(i->j.seq)); - mutex_unlock(&j->blacklist_lock); - - fsck_err_on(blacklisted, c, - "found blacklisted journal entry %llu", - le64_to_cpu(i->j.seq)); - - fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c, - "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", - cur_seq, le64_to_cpu(i->j.seq) - 1, - journal_last_seq(j), end_seq); - - cur_seq = le64_to_cpu(i->j.seq) + 1; for_each_jset_key(k, _n, entry, &i->j) keys++; entries++; } - bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", - keys, entries, journal_cur_seq(j)); + if (!list_empty(list)) { + i = list_last_entry(list, struct journal_replay, list); + + bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", + keys, entries, le64_to_cpu(i->j.seq)); + } fsck_err: return ret; } diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index a79c396903f0..4bb174839956 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -35,7 +35,6 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ vstruct_for_each_safe(entry, k, _n) -int bch2_journal_set_seq(struct bch_fs *c, u64, u64); int bch2_journal_read(struct bch_fs *, struct list_head *); void bch2_journal_entries_free(struct list_head *); int bch2_journal_replay(struct bch_fs *, struct list_head *); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 45c8d38d12de..0df8dfccd5b5 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -1,13 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "error.h" -#include "journal.h" -#include "journal_io.h" -#include "journal_reclaim.h" +#include "btree_iter.h" +#include "eytzinger.h" #include "journal_seq_blacklist.h" +#include "super-io.h" /* * journal_seq_blacklist machinery: @@ -37,327 +34,285 @@ * record that it was blacklisted so that a) on recovery we don't think we have * missing journal entries and b) so that the btree code continues to ignore * that bset, until that btree node is rewritten. 
- * - * Blacklisted journal sequence numbers are themselves recorded as entries in - * the journal. */ -/* - * Called when journal needs to evict a blacklist entry to reclaim space: find - * any btree nodes that refer to the blacklist journal sequence numbers, and - * rewrite them: - */ -static void journal_seq_blacklist_flush(struct journal *j, - struct journal_entry_pin *pin, u64 seq) +static unsigned +blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) { - struct bch_fs *c = - container_of(j, struct bch_fs, journal); - struct journal_seq_blacklist *bl = - container_of(pin, struct journal_seq_blacklist, pin); - struct blacklisted_node n; - struct closure cl; - unsigned i; - int ret; + return bl + ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) / + sizeof(struct journal_seq_blacklist_entry)) + : 0; +} - closure_init_stack(&cl); +static unsigned sb_blacklist_u64s(unsigned nr) +{ + struct bch_sb_field_journal_seq_blacklist *bl; - for (i = 0;; i++) { - struct btree_trans trans; - struct btree_iter *iter; - struct btree *b; + return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); +} - bch2_trans_init(&trans, c); +static struct bch_sb_field_journal_seq_blacklist * +blacklist_entry_try_merge(struct bch_fs *c, + struct bch_sb_field_journal_seq_blacklist *bl, + unsigned i) +{ + unsigned nr = blacklist_nr_entries(bl); + + if (le64_to_cpu(bl->start[i].end) >= + le64_to_cpu(bl->start[i + 1].start)) { + bl->start[i].end = bl->start[i + 1].end; + --nr; + memmove(&bl->start[i], + &bl->start[i + 1], + sizeof(bl->start[0]) * (nr - i)); + + bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, + sb_blacklist_u64s(nr)); + BUG_ON(!bl); + } - mutex_lock(&j->blacklist_lock); - if (i >= bl->nr_entries) { - mutex_unlock(&j->blacklist_lock); - break; - } - n = bl->entries[i]; - mutex_unlock(&j->blacklist_lock); + return bl; +} - iter = bch2_trans_get_node_iter(&trans, n.btree_id, n.pos, - 0, 0, 0); +int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 
start, u64 end) +{ + struct bch_sb_field_journal_seq_blacklist *bl; + unsigned i, nr; + int ret = 0; - b = bch2_btree_iter_peek_node(iter); + mutex_lock(&c->sb_lock); + bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + nr = blacklist_nr_entries(bl); - /* The node might have already been rewritten: */ + if (bl) { + for (i = 0; i < nr; i++) { + struct journal_seq_blacklist_entry *e = + bl->start + i; - if (b->data->keys.seq == n.seq) { - ret = bch2_btree_node_rewrite(c, iter, n.seq, 0); - if (ret) { - bch2_trans_exit(&trans); - bch2_fs_fatal_error(c, - "error %i rewriting btree node with blacklisted journal seq", - ret); - bch2_journal_halt(j); - return; + if (start == le64_to_cpu(e->start) && + end == le64_to_cpu(e->end)) + goto out; + + if (start <= le64_to_cpu(e->start) && + end >= le64_to_cpu(e->end)) { + e->start = cpu_to_le64(start); + e->end = cpu_to_le64(end); + + if (i + 1 < nr) + bl = blacklist_entry_try_merge(c, + bl, i); + if (i) + bl = blacklist_entry_try_merge(c, + bl, i - 1); + goto out_write_sb; } } - - bch2_trans_exit(&trans); } - for (i = 0;; i++) { - struct btree_update *as; - struct pending_btree_node_free *d; - - mutex_lock(&j->blacklist_lock); - if (i >= bl->nr_entries) { - mutex_unlock(&j->blacklist_lock); - break; - } - n = bl->entries[i]; - mutex_unlock(&j->blacklist_lock); -redo_wait: - mutex_lock(&c->btree_interior_update_lock); - - /* - * Is the node on the list of pending interior node updates - - * being freed? 
If so, wait for that to finish: - */ - for_each_pending_btree_node_free(c, as, d) - if (n.seq == d->seq && - n.btree_id == d->btree_id && - !d->level && - !bkey_cmp(n.pos, d->key.k.p)) { - closure_wait(&as->wait, &cl); - mutex_unlock(&c->btree_interior_update_lock); - closure_sync(&cl); - goto redo_wait; - } - - mutex_unlock(&c->btree_interior_update_lock); + bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, + sb_blacklist_u64s(nr + 1)); + if (!bl) { + ret = -ENOMEM; + goto out; } - mutex_lock(&j->blacklist_lock); + bl->start[nr].start = cpu_to_le64(start); + bl->start[nr].end = cpu_to_le64(end); +out_write_sb: + c->disk_sb.sb->features[0] |= + 1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3; - bch2_journal_pin_drop(j, &bl->pin); - list_del(&bl->list); - kfree(bl->entries); - kfree(bl); + ret = bch2_write_super(c); +out: + mutex_unlock(&c->sb_lock); - mutex_unlock(&j->blacklist_lock); + return ret; } -/* - * Determine if a particular sequence number is blacklisted - if so, return - * blacklist entry: - */ -struct journal_seq_blacklist * -bch2_journal_seq_blacklist_find(struct journal *j, u64 seq) +static int journal_seq_blacklist_table_cmp(const void *_l, + const void *_r, size_t size) { - struct journal_seq_blacklist *bl; + const struct journal_seq_blacklist_table_entry *l = _l; + const struct journal_seq_blacklist_table_entry *r = _r; - lockdep_assert_held(&j->blacklist_lock); - - list_for_each_entry(bl, &j->seq_blacklist, list) - if (seq >= bl->start && seq <= bl->end) - return bl; - - return NULL; + return (l->start > r->start) - (l->start < r->start); } -/* - * Allocate a new, in memory blacklist entry: - */ -static struct journal_seq_blacklist * -bch2_journal_seq_blacklisted_new(struct journal *j, u64 start, u64 end) +bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, + bool dirty) { - struct journal_seq_blacklist *bl; + struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; + struct journal_seq_blacklist_table_entry search = { 
.start = seq }; + int idx; - lockdep_assert_held(&j->blacklist_lock); + if (!t) + return false; - /* - * When we start the journal, bch2_journal_start() will skip over @seq: - */ + idx = eytzinger0_find_le(t->entries, t->nr, + sizeof(t->entries[0]), + journal_seq_blacklist_table_cmp, + &search); + if (idx < 0) + return false; - bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) - return NULL; + BUG_ON(t->entries[idx].start > seq); - bl->start = start; - bl->end = end; + if (seq >= t->entries[idx].end) + return false; - list_add_tail(&bl->list, &j->seq_blacklist); - return bl; + if (dirty) + t->entries[idx].dirty = true; + return true; } -/* - * Returns true if @seq is newer than the most recent journal entry that got - * written, and data corresponding to @seq should be ignored - also marks @seq - * as blacklisted so that on future restarts the corresponding data will still - * be ignored: - */ -int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b) +int bch2_blacklist_table_initialize(struct bch_fs *c) { - struct journal *j = &c->journal; - struct journal_seq_blacklist *bl = NULL; - struct blacklisted_node *n; - u64 journal_seq; - int ret = 0; - - if (!seq) - return 0; + struct bch_sb_field_journal_seq_blacklist *bl = + bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + struct journal_seq_blacklist_table *t; + unsigned i, nr = blacklist_nr_entries(bl); - spin_lock(&j->lock); - journal_seq = journal_cur_seq(j); - spin_unlock(&j->lock); + BUG_ON(c->journal_seq_blacklist_table); - /* Interier updates aren't journalled: */ - BUG_ON(b->level); - BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)); + if (!bl) + return 0; - /* - * Decrease this back to j->seq + 2 when we next rev the on disk format: - * increasing it temporarily to work around bug in old kernels - */ - fsck_err_on(seq > journal_seq + 4, c, - "bset journal seq too far in the future: %llu > %llu", - seq, journal_seq); + t = kzalloc(sizeof(*t) + 
sizeof(t->entries[0]) * nr, + GFP_KERNEL); + if (!t) + return -ENOMEM; - if (seq <= journal_seq && - list_empty_careful(&j->seq_blacklist)) - return 0; + t->nr = nr; - mutex_lock(&j->blacklist_lock); - - if (seq <= journal_seq) { - bl = bch2_journal_seq_blacklist_find(j, seq); - if (!bl) - goto out; - } else { - bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting", - b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq); - - if (!j->new_blacklist) { - j->new_blacklist = bch2_journal_seq_blacklisted_new(j, - journal_seq + 1, - journal_seq + 1); - if (!j->new_blacklist) { - ret = -ENOMEM; - goto out; - } - } - bl = j->new_blacklist; - bl->end = max(bl->end, seq); + for (i = 0; i < nr; i++) { + t->entries[i].start = le64_to_cpu(bl->start[i].start); + t->entries[i].end = le64_to_cpu(bl->start[i].end); } - for (n = bl->entries; n < bl->entries + bl->nr_entries; n++) - if (b->data->keys.seq == n->seq && - b->btree_id == n->btree_id && - !bkey_cmp(b->key.k.p, n->pos)) - goto found_entry; - - if (!bl->nr_entries || - is_power_of_2(bl->nr_entries)) { - n = krealloc(bl->entries, - max_t(size_t, bl->nr_entries * 2, 8) * sizeof(*n), - GFP_KERNEL); - if (!n) { - ret = -ENOMEM; - goto out; - } - bl->entries = n; - } + eytzinger0_sort(t->entries, + t->nr, + sizeof(t->entries[0]), + journal_seq_blacklist_table_cmp, + NULL); - bl->entries[bl->nr_entries++] = (struct blacklisted_node) { - .seq = b->data->keys.seq, - .btree_id = b->btree_id, - .pos = b->key.k.p, - }; -found_entry: - ret = 1; -out: -fsck_err: - mutex_unlock(&j->blacklist_lock); - return ret; + c->journal_seq_blacklist_table = t; + return 0; } -static int __bch2_journal_seq_blacklist_read(struct journal *j, - struct journal_replay *i, - u64 start, u64 end) +static const char * +bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, + struct bch_sb_field *f) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_seq_blacklist *bl; - - bch_verbose(c, 
"blacklisting existing journal seq %llu-%llu", - start, end); + struct bch_sb_field_journal_seq_blacklist *bl = + field_to_type(f, journal_seq_blacklist); + struct journal_seq_blacklist_entry *i; + unsigned nr = blacklist_nr_entries(bl); + + for (i = bl->start; i < bl->start + nr; i++) { + if (le64_to_cpu(i->start) >= + le64_to_cpu(i->end)) + return "entry start >= end"; + + if (i + 1 < bl->start + nr && + le64_to_cpu(i[0].end) > + le64_to_cpu(i[1].start)) + return "entries out of order"; + } - bl = bch2_journal_seq_blacklisted_new(j, start, end); - if (!bl) - return -ENOMEM; + return NULL; +} - bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin, - journal_seq_blacklist_flush); - return 0; +static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_journal_seq_blacklist *bl = + field_to_type(f, journal_seq_blacklist); + struct journal_seq_blacklist_entry *i; + unsigned nr = blacklist_nr_entries(bl); + + for (i = bl->start; i < bl->start + nr; i++) { + if (i != bl->start) + pr_buf(out, " "); + + pr_buf(out, "%llu-%llu", + le64_to_cpu(i->start), + le64_to_cpu(i->end)); + } } -/* - * After reading the journal, find existing journal seq blacklist entries and - * read them into memory: - */ -int bch2_journal_seq_blacklist_read(struct journal *j, - struct journal_replay *i) +const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { + .validate = bch2_sb_journal_seq_blacklist_validate, + .to_text = bch2_sb_journal_seq_blacklist_to_text +}; + +void bch2_blacklist_entries_gc(struct work_struct *work) { - struct jset_entry *entry; - int ret = 0; + struct bch_fs *c = container_of(work, struct bch_fs, + journal_seq_blacklist_gc_work); + struct journal_seq_blacklist_table *t; + struct bch_sb_field_journal_seq_blacklist *bl; + struct journal_seq_blacklist_entry *src, *dst; + struct btree_trans trans; + unsigned i, nr, new_nr; + int ret; - vstruct_for_each(&i->j, entry) { - 
switch (entry->type) { - case BCH_JSET_ENTRY_blacklist: { - struct jset_entry_blacklist *bl_entry = - container_of(entry, struct jset_entry_blacklist, entry); + bch2_trans_init(&trans, c); - ret = __bch2_journal_seq_blacklist_read(j, i, - le64_to_cpu(bl_entry->seq), - le64_to_cpu(bl_entry->seq)); - break; - } - case BCH_JSET_ENTRY_blacklist_v2: { - struct jset_entry_blacklist_v2 *bl_entry = - container_of(entry, struct jset_entry_blacklist_v2, entry); - - ret = __bch2_journal_seq_blacklist_read(j, i, - le64_to_cpu(bl_entry->start), - le64_to_cpu(bl_entry->end)); - break; - } - } + for (i = 0; i < BTREE_ID_NR; i++) { + struct btree_iter *iter; + struct btree *b; - if (ret) - break; + for_each_btree_node(&trans, iter, i, POS_MIN, + BTREE_ITER_PREFETCH, b) + if (test_bit(BCH_FS_STOPPING, &c->flags)) { + bch2_trans_exit(&trans); + return; + } + bch2_trans_iter_free(&trans, iter); } - return ret; -} - -/* - * After reading the journal and walking the btree, we might have new journal - * sequence numbers to blacklist - add entries to the next journal entry to be - * written: - */ -void bch2_journal_seq_blacklist_write(struct journal *j) -{ - struct journal_seq_blacklist *bl = j->new_blacklist; - struct jset_entry_blacklist_v2 *bl_entry; - struct jset_entry *entry; + ret = bch2_trans_exit(&trans); + if (ret) + return; + mutex_lock(&c->sb_lock); + bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); if (!bl) - return; + goto out; - entry = bch2_journal_add_entry_noreservation(journal_cur_buf(j), - (sizeof(*bl_entry) - sizeof(*entry)) / sizeof(u64)); + nr = blacklist_nr_entries(bl); + dst = bl->start; - bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); - bl_entry->entry.type = BCH_JSET_ENTRY_blacklist_v2; - bl_entry->start = cpu_to_le64(bl->start); - bl_entry->end = cpu_to_le64(bl->end); + t = c->journal_seq_blacklist_table; + BUG_ON(nr != t->nr); + + for (src = bl->start, i = eytzinger0_first(t->nr); + src < bl->start + nr; + src++, i = 
eytzinger0_next(i, nr)) { + BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); + BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); + + if (t->entries[i].dirty) + *dst++ = *src; + } - bch2_journal_pin_add(j, - journal_cur_seq(j), - &bl->pin, - journal_seq_blacklist_flush); + new_nr = dst - bl->start; - j->new_blacklist = NULL; + bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); + + if (new_nr != nr) { + bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, + new_nr ? sb_blacklist_u64s(new_nr) : 0); + BUG_ON(new_nr && !bl); + + if (!new_nr) + c->disk_sb.sb->features[0] &= + ~(1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3); + + bch2_write_super(c); + } +out: + mutex_unlock(&c->sb_lock); } diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h index b4a3b270e9d2..03f4b97247fd 100644 --- a/fs/bcachefs/journal_seq_blacklist.h +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -2,13 +2,12 @@ #ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H #define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H -struct journal_replay; - -struct journal_seq_blacklist * -bch2_journal_seq_blacklist_find(struct journal *, u64); -int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *); -int bch2_journal_seq_blacklist_read(struct journal *, - struct journal_replay *); -void bch2_journal_seq_blacklist_write(struct journal *); +bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); +int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); +int bch2_blacklist_table_initialize(struct bch_fs *); + +extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; + +void bch2_blacklist_entries_gc(struct work_struct *); #endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 85bf5e2706f7..7349b50bc5e7 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -54,24 +54,6 @@ struct journal_entry_pin { u64 seq; }; -/* corresponds to a btree node 
with a blacklisted bset: */ -struct blacklisted_node { - __le64 seq; - enum btree_id btree_id; - struct bpos pos; -}; - -struct journal_seq_blacklist { - struct list_head list; - u64 start; - u64 end; - - struct journal_entry_pin pin; - - struct blacklisted_node *entries; - size_t nr_entries; -}; - struct journal_res { bool ref; u8 idx; @@ -222,10 +204,6 @@ struct journal { u64 replay_journal_seq; - struct mutex blacklist_lock; - struct list_head seq_blacklist; - struct journal_seq_blacklist *new_blacklist; - struct write_point wp; spinlock_t err_lock; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 67b4dda9cfeb..9411a1f550f3 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -12,6 +12,7 @@ #include "error.h" #include "fsck.h" #include "journal_io.h" +#include "journal_seq_blacklist.h" #include "quota.h" #include "recovery.h" #include "replicas.h" @@ -99,18 +100,49 @@ fsck_err: return ret; } +static int +verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, + struct list_head *journal) +{ + struct journal_replay *i = + list_last_entry(journal, struct journal_replay, list); + u64 start_seq = le64_to_cpu(i->j.last_seq); + u64 end_seq = le64_to_cpu(i->j.seq); + u64 seq = start_seq; + int ret = 0; + + list_for_each_entry(i, journal, list) { + fsck_err_on(seq != le64_to_cpu(i->j.seq), c, + "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", + seq, le64_to_cpu(i->j.seq) - 1, + start_seq, end_seq); + + seq = le64_to_cpu(i->j.seq); + + fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c, + "found blacklisted journal entry %llu", seq); + + do { + seq++; + } while (bch2_journal_seq_is_blacklisted(c, seq, false)); + } +fsck_err: + return ret; +} + static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) { struct bch_sb_field_clean *clean, *sb_clean; - - if (!c->sb.clean) - return NULL; + int ret; mutex_lock(&c->sb_lock); sb_clean = bch2_sb_get_clean(c->disk_sb.sb); - if (!sb_clean) { + + if (fsck_err_on(!sb_clean, c, + "superblock marked clean but clean section not present")) { + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; mutex_unlock(&c->sb_lock); - bch_err(c, "superblock marked clean but clean section not present"); return NULL; } @@ -128,6 +160,9 @@ static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) mutex_unlock(&c->sb_lock); return clean; +fsck_err: + mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); } static int journal_replay_entry_early(struct bch_fs *c, @@ -179,14 +214,32 @@ static int journal_replay_entry_early(struct bch_fs *c, le64_to_cpu(u->v)); break; } + case BCH_JSET_ENTRY_blacklist: { + struct jset_entry_blacklist *bl_entry = + container_of(entry, struct jset_entry_blacklist, entry); + + ret = bch2_journal_seq_blacklist_add(c, + le64_to_cpu(bl_entry->seq), + le64_to_cpu(bl_entry->seq) + 1); + break; + } + case BCH_JSET_ENTRY_blacklist_v2: { + struct jset_entry_blacklist_v2 *bl_entry = + container_of(entry, struct jset_entry_blacklist_v2, entry); + + ret = bch2_journal_seq_blacklist_add(c, + le64_to_cpu(bl_entry->start), + le64_to_cpu(bl_entry->end) + 1); + break; + } } return ret; } -static int load_journal_metadata(struct bch_fs *c, - struct bch_sb_field_clean *clean, - struct list_head *journal) +static int journal_replay_early(struct bch_fs *c, + struct bch_sb_field_clean *clean, + struct 
list_head *journal) { struct jset_entry *entry; int ret; @@ -300,37 +353,76 @@ static bool journal_empty(struct list_head *journal) int bch2_fs_recovery(struct bch_fs *c) { const char *err = "cannot allocate memory"; - struct bch_sb_field_clean *clean; + struct bch_sb_field_clean *clean = NULL; + u64 journal_seq; LIST_HEAD(journal); int ret; - clean = read_superblock_clean(c); - if (clean) + if (c->sb.clean) + clean = read_superblock_clean(c); + ret = PTR_ERR_OR_ZERO(clean); + if (ret) + goto err; + + if (c->sb.clean) bch_info(c, "recovering from clean shutdown, journal seq %llu", le64_to_cpu(clean->journal_seq)); - if (!clean || c->opts.fsck) { + if (!c->replicas.entries) { + bch_info(c, "building replicas info"); + set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); + } + + if (!c->sb.clean || c->opts.fsck) { + struct jset *j; + ret = bch2_journal_read(c, &journal); if (ret) goto err; - ret = verify_superblock_clean(c, &clean, - &list_last_entry(&journal, struct journal_replay, - list)->j); + fsck_err_on(c->sb.clean && !journal_empty(&journal), c, + "filesystem marked clean but journal not empty"); + + if (!c->sb.clean && list_empty(&journal)){ + bch_err(c, "no journal entries found"); + ret = BCH_FSCK_REPAIR_IMPOSSIBLE; + goto err; + } + + j = &list_last_entry(&journal, struct journal_replay, list)->j; + + ret = verify_superblock_clean(c, &clean, j); if (ret) goto err; + + journal_seq = le64_to_cpu(j->seq) + 1; } else { - ret = bch2_journal_set_seq(c, - le64_to_cpu(clean->journal_seq), - le64_to_cpu(clean->journal_seq)); - if (ret) + journal_seq = le64_to_cpu(clean->journal_seq) + 1; + } + + ret = journal_replay_early(c, clean, &journal); + if (ret) + goto err; + + if (!c->sb.clean) { + ret = bch2_journal_seq_blacklist_add(c, + journal_seq, + journal_seq + 4); + if (ret) { + bch_err(c, "error creating new journal seq blacklist entry"); goto err; + } + + journal_seq += 4; } - fsck_err_on(clean && !journal_empty(&journal), c, - "filesystem marked clean but journal not 
empty"); + ret = bch2_blacklist_table_initialize(c); + + ret = verify_journal_entries_not_blacklisted_or_missing(c, &journal); + if (ret) + goto err; - ret = load_journal_metadata(c, clean, &journal); + ret = bch2_fs_journal_start(&c->journal, journal_seq, &journal); if (ret) goto err; @@ -351,11 +443,6 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); - if (!c->replicas.entries) { - bch_info(c, "building replicas info"); - set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); - } - if (c->opts.fsck || !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { @@ -377,13 +464,6 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->sb.encryption_type && !c->sb.clean) atomic64_add(1 << 16, &c->key_version); - /* - * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish() - * will give spurious errors about oldest_gen > bucket_gen - - * this is a hack but oh well. - */ - bch2_fs_journal_start(&c->journal); - if (c->opts.noreplay) goto out; @@ -424,6 +504,10 @@ int bch2_fs_recovery(struct bch_fs *c) SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); } mutex_unlock(&c->sb_lock); + + if (c->journal_seq_blacklist_table && + c->journal_seq_blacklist_table->nr > 128) + queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); out: bch2_journal_entries_free(&journal); kfree(clean); @@ -472,7 +556,7 @@ int bch2_fs_initialize(struct bch_fs *c) * journal_res_get() will crash if called before this has * set up the journal.pin FIFO and journal.cur pointer: */ - bch2_fs_journal_start(&c->journal); + bch2_fs_journal_start(&c->journal, 1, &journal); bch2_journal_set_replay_done(&c->journal); err = "error going read write"; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 9fd77e57cafe..7aaa8b785d57 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -7,6 +7,7 @@ #include "error.h" #include "io.h" #include "journal.h" +#include "journal_seq_blacklist.h" #include "replicas.h" 
#include "quota.h" #include "super-io.h" diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 8c31a9a67eee..27eacb1cd144 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -30,6 +30,7 @@ #include "io.h" #include "journal.h" #include "journal_reclaim.h" +#include "journal_seq_blacklist.h" #include "move.h" #include "migrate.h" #include "movinggc.h" @@ -468,6 +469,7 @@ static void bch2_fs_free(struct bch_fs *c) kfree(c->replicas.entries); kfree(c->replicas_gc.entries); kfree(rcu_dereference_protected(c->disk_groups, 1)); + kfree(c->journal_seq_blacklist_table); if (c->journal_reclaim_wq) destroy_workqueue(c->journal_reclaim_wq); @@ -496,6 +498,10 @@ void bch2_fs_stop(struct bch_fs *c) bch_verbose(c, "shutting down"); + set_bit(BCH_FS_STOPPING, &c->flags); + + cancel_work_sync(&c->journal_seq_blacklist_gc_work); + for_each_member_device(ca, c, i) if (ca->kobj.state_in_sysfs && ca->disk_sb.bdev) @@ -631,6 +637,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) spin_lock_init(&c->btree_write_error_lock); INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work); + INIT_WORK(&c->journal_seq_blacklist_gc_work, + bch2_blacklist_entries_gc); + INIT_LIST_HEAD(&c->fsck_errors); mutex_init(&c->fsck_error_lock); |